mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge pull request #209 from ikreymer/warcio-split

Warcio split
Ilya Kreymer 2017-03-08 16:35:08 -08:00 committed by GitHub
commit 738fc0e427
60 changed files with 533 additions and 2706 deletions
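The change is largely mechanical: shared utilities that previously lived under pywb.utils and pywb.warc now come from the standalone warcio package. A representative before/after sketch of the import migration, using module paths taken from the hunks below:

# Before: utilities bundled inside pywb
#   from pywb.utils.timeutils import timestamp_to_sec
#   from pywb.utils.statusandheaders import StatusAndHeaders
#   from pywb.utils.bufferedreaders import DecompressingBufferedReader
#   from pywb.warc.recordloader import ArcWarcRecordLoader

# After: the same names are imported from the extracted warcio package
from warcio.timeutils import timestamp_to_sec
from warcio.statusandheaders import StatusAndHeaders
from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.recordloader import ArcWarcRecordLoader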

View File

@@ -3,8 +3,9 @@ from pywb.cdx.cdxobject import TIMESTAMP, STATUSCODE, MIMETYPE, DIGEST
from pywb.cdx.cdxobject import OFFSET, LENGTH, FILENAME
from pywb.cdx.query import CDXQuery
from pywb.utils.timeutils import timestamp_to_sec, pad_timestamp
from pywb.utils.timeutils import PAD_14_DOWN, PAD_14_UP
from warcio.timeutils import timestamp_to_sec, pad_timestamp
from warcio.timeutils import PAD_14_DOWN, PAD_14_UP
import bisect

View File

@@ -14,7 +14,7 @@ com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ
from fakeredis import FakeStrictRedis
from mock import patch
from pywb.utils.timeutils import timestamp_to_sec
from warcio.timeutils import timestamp_to_sec
from pywb.cdx.cdxsource import RedisCDXSource
from pywb.cdx.cdxserver import CDXServer

View File

@@ -13,7 +13,7 @@ from pywb.cdx.cdxsource import CDXSource
from pywb.cdx.cdxobject import IDXObject, CDXException
from pywb.utils.loaders import BlockLoader, read_last_line
from pywb.utils.bufferedreaders import gzip_decompressor
from warcio.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch, search

View File

@@ -1,6 +1,6 @@
from pywb.utils.wbexception import BadRequestException
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.timeutils import timestamp_to_http_date
from warcio.timeutils import http_date_to_timestamp
from warcio.timeutils import timestamp_to_http_date
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.wburl import WbUrl

View File

@@ -4,6 +4,7 @@ from pywb.framework.wbrequestresponse import WbResponse, WbRequest
from pywb.framework.archivalrouter import ArchivalRouter
from six.moves.urllib.parse import urlsplit
from six import iteritems
import base64
import socket
@@ -15,8 +16,8 @@ from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter, UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.utils.wbexception import BadRequestException
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.loaders import to_native_str
from warcio.bufferedreaders import BufferedReader
from warcio.utils import to_native_str
from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver
@@ -250,7 +251,8 @@ class ProxyRouter(object):
# add extra headers for replay responses
if wbrequest.wb_url and wbrequest.wb_url.is_replay():
response.status_headers.replace_headers(self.extra_headers)
for name, value in iteritems(self.extra_headers):
response.status_headers.replace_header(name, value)
# check for content-length
res = response.status_headers.get_header('content-length')
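The replay-header change above drops pywb's bulk replace_headers() helper in favor of per-header replace_header() calls, which warcio's StatusAndHeaders retains. A minimal sketch of the same pattern (the header values are illustrative):

from warcio.statusandheaders import StatusAndHeaders

headers = StatusAndHeaders('200 OK', [('Content-Type', 'text/html')],
                           protocol='HTTP/1.1')

# replace_header() updates an existing header in place or appends a new one,
# so looping over a dict reproduces the old replace_headers() behavior
extra_headers = {'Cache-Control': 'no-cache'}  # illustrative value
for name, value in extra_headers.items():
    headers.replace_header(name, value)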

View File

@@ -1,7 +1,6 @@
from pywb.framework.wbrequestresponse import WbResponse
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.wbexception import WbException
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.wburl import WbUrl
from pywb.framework.cache import create_cache
@@ -10,7 +9,8 @@ from pywb.framework.basehandlers import WbUrlHandler
from six.moves.urllib.parse import parse_qs, urlsplit
import six
from pywb.utils.loaders import to_native_str
from warcio.statusandheaders import StatusAndHeaders
from warcio.utils import to_native_str
import base64
import os

View File

@@ -61,7 +61,7 @@
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
from warcio.statusandheaders import StatusAndHeaders
from pywb.framework.wbrequestresponse import WbRequest, WbResponse

View File

@@ -1,4 +1,4 @@
from pywb.utils.statusandheaders import StatusAndHeaders
from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.loaders import extract_post_query, append_post_query
from io import BytesIO

View File

@@ -1,7 +1,10 @@
from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import load_yaml_config, to_native_str
from pywb.utils.loaders import load_yaml_config
from warcio.utils import to_native_str
from pywb.framework.wbrequestresponse import WbResponse, StatusAndHeaders
from pywb.framework.wbrequestresponse import WbResponse
from warcio.statusandheaders import StatusAndHeaders
import os

View File

@@ -13,7 +13,7 @@ from pkg_resources import resource_string
from argparse import ArgumentParser, RawTextHelpFormatter
from pywb.utils.loaders import load_yaml_config
from pywb.utils.timeutils import timestamp20_now
from warcio.timeutils import timestamp20_now
from pywb import DEFAULT_CONFIG

View File

@@ -1,4 +1,4 @@
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_iso_date
from warcio.timeutils import timestamp_to_datetime, datetime_to_iso_date
import re

View File

@@ -0,0 +1,262 @@
import base64
import datetime
import os
import shutil
import traceback
import portalocker
from warcio.timeutils import timestamp20_now
from warcio.warcwriter import BaseWARCWriter
from pywb.webagg.utils import res_template
# ============================================================================
class MultiFileWARCWriter(BaseWARCWriter):
FILE_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz'
def __init__(self, dir_template, filename_template=None, max_size=0,
max_idle_secs=1800, *args, **kwargs):
super(MultiFileWARCWriter, self).__init__(*args, **kwargs)
self.header_filter = kwargs.get('header_filter')
if not filename_template:
dir_template, filename_template = os.path.split(dir_template)
dir_template += os.path.sep
if not filename_template:
filename_template = self.FILE_TEMPLATE
self.dir_template = dir_template
self.key_template = kwargs.get('key_template', self.dir_template)
self.dedup_index = kwargs.get('dedup_index')
self.filename_template = filename_template
self.max_size = max_size
if max_idle_secs > 0:
self.max_idle_time = datetime.timedelta(seconds=max_idle_secs)
else:
self.max_idle_time = None
self.fh_cache = {}
def _check_revisit(self, record, params):
if not self.dedup_index or record.rec_type != 'response':
return record
try:
url = record.rec_headers.get_header('WARC-Target-URI')
digest = record.rec_headers.get_header('WARC-Payload-Digest')
iso_dt = record.rec_headers.get_header('WARC-Date')
result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
except Exception as e:
traceback.print_exc()
result = None
if result == 'skip':
return None
if isinstance(result, tuple) and result[0] == 'revisit':
record = self.create_revisit_record(url, digest, result[1], result[2],
http_headers=record.http_headers)
return record
def _set_header_buff(self, record):
exclude_list = None
if self.header_filter:
exclude_list = self.header_filter(record)
buff = record.http_headers.to_bytes(exclude_list)
record.http_headers.headers_buff = buff
def get_new_filename(self, dir_, params):
timestamp = timestamp20_now()
randstr = base64.b32encode(os.urandom(5)).decode('utf-8')
filename = dir_ + res_template(self.filename_template, params,
hostname=self.hostname,
timestamp=timestamp,
random=randstr)
return filename
def allow_new_file(self, filename, params):
return True
def _open_file(self, filename, params):
path, name = os.path.split(filename)
try:
os.makedirs(path)
except:
pass
fh = open(filename, 'a+b')
if self.dedup_index:
self.dedup_index.add_warc_file(filename, params)
return fh
def _close_file(self, fh):
try:
portalocker.lock(fh, portalocker.LOCK_UN)
fh.close()
except Exception as e:
print(e)
def get_dir_key(self, params):
return res_template(self.key_template, params)
def close_key(self, dir_key):
if isinstance(dir_key, dict):
dir_key = self.get_dir_key(dir_key)
result = self.fh_cache.pop(dir_key, None)
if not result:
return
out, filename = result
self._close_file(out)
return filename
def close_file(self, match_filename):
for dir_key, out, filename in self.iter_open_files():
if filename == match_filename:
return self.close_key(dir_key)
def _is_write_resp(self, resp, params):
return True
def _is_write_req(self, req, params):
return True
def write_record(self, record, params=None):
params = params or {}
self._do_write_req_resp(None, record, params)
def _do_write_req_resp(self, req, resp, params):
resp = self._check_revisit(resp, params)
if not resp:
print('Skipping due to dedup')
return
def write_callback(out, filename):
#url = resp.rec_headers.get_header('WARC-Target-URI')
#print('Writing req/resp {0} to {1} '.format(url, filename))
if resp and self._is_write_resp(resp, params):
self._write_warc_record(out, resp)
if req and self._is_write_req(req, params):
self._write_warc_record(out, req)
return self._write_to_file(params, write_callback)
def write_stream_to_file(self, params, stream):
def write_callback(out, filename):
#print('Writing stream to {0}'.format(filename))
shutil.copyfileobj(stream, out)
return self._write_to_file(params, write_callback)
def _write_to_file(self, params, write_callback):
full_dir = res_template(self.dir_template, params)
dir_key = self.get_dir_key(params)
result = self.fh_cache.get(dir_key)
close_file = False
new_size = start = 0
if result:
out, filename = result
is_new = False
else:
filename = self.get_new_filename(full_dir, params)
if not self.allow_new_file(filename, params):
return False
out = self._open_file(filename, params)
is_new = True
try:
start = out.tell()
write_callback(out, filename)
out.flush()
new_size = out.tell()
out.seek(start)
if self.dedup_index:
self.dedup_index.add_urls_to_index(out, params,
filename,
new_size - start)
return True
except Exception as e:
traceback.print_exc()
close_file = True
return False
finally:
# check for rollover
if self.max_size and new_size > self.max_size:
close_file = True
if close_file:
self._close_file(out)
if not is_new:
self.fh_cache.pop(dir_key, None)
elif is_new:
portalocker.lock(out, portalocker.LOCK_EX | portalocker.LOCK_NB)
self.fh_cache[dir_key] = (out, filename)
def iter_open_files(self):
for n, v in list(self.fh_cache.items()):
out, filename = v
yield n, out, filename
def close(self):
for dir_key, out, filename in self.iter_open_files():
self._close_file(out)
self.fh_cache = {}
def close_idle_files(self):
if not self.max_idle_time:
return
now = datetime.datetime.now()
for dir_key, out, filename in self.iter_open_files():
try:
mtime = os.path.getmtime(filename)
except:
self.close_key(dir_key)
return
mtime = datetime.datetime.fromtimestamp(mtime)
if (now - mtime) > self.max_idle_time:
print('Closing idle ' + filename)
self.close_key(dir_key)
# ============================================================================
class PerRecordWARCWriter(MultiFileWARCWriter):
def __init__(self, *args, **kwargs):
kwargs['max_size'] = 1
super(PerRecordWARCWriter, self).__init__(*args, **kwargs)
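A sketch of how the new writer can be driven, based only on the constructor and methods defined above; the directory template and params are hypothetical and are resolved through res_template(), as in get_new_filename():

from io import BytesIO

writer = MultiFileWARCWriter('./warcs/{coll}/', max_size=100000000,
                             max_idle_secs=600)
params = {'coll': 'example'}  # fills the hypothetical '{coll}' placeholder

# append an already-serialized WARC record stream to the current file
writer.write_stream_to_file(params, BytesIO(b'WARC/1.0\r\n'))  # placeholder bytes

writer.close_idle_files()  # rotate files idle longer than max_idle_secs
writer.close()             # unlock and close all open files on shutdown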

View File

@@ -5,11 +5,12 @@ from pywb.webagg.inputrequest import DirectWSGIInputRequest
from pywb.recorder.filters import SkipRangeRequestFilter, CollectionFilter
from six.moves.urllib.parse import parse_qsl
import six
import json
import tempfile
from requests.structures import CaseInsensitiveDict
#from requests.structures import CaseInsensitiveDict
import requests
import traceback
@@ -67,12 +68,22 @@ class RecorderApp(object):
req_head, req_pay, resp_head, resp_pay, params = result
resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay)
#resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay)
resp_length = resp_pay.tell()
resp_pay.seek(0)
resp = self.writer.create_record_from_stream(resp_pay, resp_length)
if resp_type == 'response':
req = self.writer.create_req_record(req_head, req_pay)
if resp.rec_type == 'response':
uri = resp.rec_headers.get_header('WARC-Target-Uri')
req_length = req_pay.tell()
req_pay.seek(0)
req = self.writer.create_warc_record(uri=uri,
record_type='request',
payload=req_pay,
length=req_length,
warc_headers_dict=req_head)
self.writer.write_req_resp(req, resp, params)
self.writer.write_request_response_pair(req, resp, params)
else:
self.writer.write_record(resp, params)
@@ -127,16 +138,20 @@ class RecorderApp(object):
content_type = headers.get('Content-Type')
record = self.writer.create_custom_record(params['url'],
req_stream.out,
record_type,
content_type,
req_stream.headers)
payload_length = req_stream.out.tell()
req_stream.out.seek(0)
record = self.writer.create_warc_record(uri=params['url'],
record_type=record_type,
payload=req_stream.out,
length=payload_length,
warc_content_type=content_type,
warc_headers_dict=req_stream.headers)
self.writer.write_record(record, params)
msg = {'success': 'true',
'WARC-Date': record.rec_headers.get('WARC-Date')}
'WARC-Date': record.rec_headers.get_header('WARC-Date')}
finally:
if req_stream:
@@ -311,11 +326,11 @@ class RespWrapper(Wrapper):
class ReqWrapper(Wrapper):
def __init__(self, stream, req_headers, params, create_func):
super(ReqWrapper, self).__init__(stream, params, create_func)
self.headers = CaseInsensitiveDict(req_headers)
self.headers = {}
for n in req_headers.keys():
if not n.upper().startswith('WARC-'):
del self.headers[n]
for n in six.iterkeys(req_headers):
if n.upper().startswith('WARC-'):
self.headers[n] = req_headers[n]
def close(self):
# no need to close wsgi.input
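Both recorder paths above now go through warcio's generic create_warc_record() instead of the writer's old create_req_record()/read_resp_record() helpers. A standalone sketch of the new call shape, assuming warcio's BufferWARCWriter and an illustrative URI:

from io import BytesIO
from warcio.warcwriter import BufferWARCWriter

writer = BufferWARCWriter(gzip=False)

payload = BytesIO(b'GET / HTTP/1.1\r\nHost: example.com\r\n\r\n')
payload.seek(0, 2)    # measure the stream, as the app does with tell()
length = payload.tell()
payload.seek(0)

record = writer.create_warc_record(uri='http://example.com/',
                                   record_type='request',
                                   payload=payload,
                                   length=length)
writer.write_record(record)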

View File

@@ -1,7 +1,8 @@
from pywb.utils.canonicalize import calc_search_range
from pywb.cdx.cdxobject import CDXObject
from pywb.warc.cdxindexer import write_cdx_index
from pywb.utils.timeutils import iso_date_to_timestamp
from warcio.timeutils import iso_date_to_timestamp
from io import BytesIO
import os

View File

@@ -3,7 +3,7 @@ from gevent import monkey; monkey.patch_all()
from pywb.recorder.recorderapp import RecorderApp
from pywb.recorder.redisindexer import WritableRedisIndexer
from pywb.recorder.warcwriter import MultiFileWARCWriter
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
from pywb.recorder.filters import SkipDupePolicy
import atexit

View File

@@ -13,18 +13,20 @@ from fakeredis import FakeStrictRedis
from pywb.recorder.recorderapp import RecorderApp
from pywb.recorder.redisindexer import WritableRedisIndexer
from pywb.recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter, SimpleTempWARCWriter
from pywb.recorder.multifilewarcwriter import PerRecordWARCWriter, MultiFileWARCWriter
from pywb.recorder.filters import ExcludeSpecificHeaders
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
from pywb.webagg.utils import MementoUtils
from pywb.cdx.cdxobject import CDXObject
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.warc.recordloader import ArcWarcRecordLoader
from warcio.statusandheaders import StatusAndHeadersParser
from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from warcio.archiveiterator import ArchiveIterator
from pywb.warc.cdxindexer import write_cdx_index
from pywb.warc.archiveiterator import ArchiveIterator
from six.moves.urllib.parse import quote, unquote, urlencode
from io import BytesIO
@@ -94,6 +96,8 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
files = [x for x in os.listdir(coll_dir) if os.path.isfile(os.path.join(coll_dir, x))]
assert len(files) == num
assert all(x.endswith('.warc.gz') for x in files)
self._verify_content_len(coll_dir, files)
return files, coll_dir
def _load_resp_req(self, base_path):
@@ -105,7 +109,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
stored_req = None
with open(os.path.join(base_path, warc), 'rb') as fh:
for rec in ArchiveIterator(fh)():
for rec in ArchiveIterator(fh):
if rec.rec_type == 'response':
stored_resp = rec
elif rec.rec_type == 'request':
@@ -115,6 +119,15 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert stored_req is not None
return stored_req, stored_resp
def _verify_content_len(self, base_dir, files):
for filename in files:
filename = os.path.join(base_dir, filename)
with open(filename, 'rb') as fh:
for record in ArchiveIterator(fh, no_record_parse=True):
assert record.http_headers == None
assert int(record.rec_headers.get_header('Content-Length')) == record.length
assert record.length == len(record.raw_stream.read())
def test_record_warc_1(self):
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')))
@@ -157,16 +170,18 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff)
assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers
assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers
stored_req, stored_resp = self._load_resp_req(base_path)
assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.status_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.status_headers.headers
assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.http_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.http_headers.headers
assert ('X-Other', 'foo') in stored_req.status_headers.headers
assert ('Cookie', 'boo=far') in stored_req.status_headers.headers
assert ('X-Other', 'foo') in stored_req.http_headers.headers
assert ('Cookie', 'boo=far') in stored_req.http_headers.headers
self._test_all_warcs('/warcs/cookiecheck/', 1)
def test_record_cookies_skip_header(self):
warc_path = to_path(self.root_dir + '/warcs/cookieskip/')
@@ -180,16 +195,18 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff)
assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers
assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers
stored_req, stored_resp = self._load_resp_req(warc_path)
assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.status_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.status_headers.headers
assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.http_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.http_headers.headers
assert ('X-Other', 'foo') in stored_req.status_headers.headers
assert ('Cookie', 'boo=far') not in stored_req.status_headers.headers
assert ('X-Other', 'foo') in stored_req.http_headers.headers
assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers
self._test_all_warcs('/warcs/cookieskip/', 1)
def test_record_skip_wrong_coll(self):
recorder_app = RecorderApp(self.upstream_url,
@@ -470,34 +487,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
self._test_all_warcs('/warcs/GOO/', 2)
def test_warcinfo_record(self):
simplewriter = SimpleTempWARCWriter(gzip=False)
params = {'software': 'recorder test',
'format': 'WARC File Format 1.0',
'json-metadata': json.dumps({'foo': 'bar'})}
record = simplewriter.create_warcinfo_record('testfile.warc.gz', params)
simplewriter.write_record(record)
buff = simplewriter.get_buffer()
assert isinstance(buff, bytes)
buff = BytesIO(buff)
parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)
assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
assert parsed_record.rec_headers.get_header('Content-Type') == 'application/warc-fields'
assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'
buff = parsed_record.stream.read().decode('utf-8')
length = parsed_record.rec_headers.get_header('Content-Length')
assert len(buff) == int(length)
assert 'json-metadata: {"foo": "bar"}\r\n' in buff
assert 'format: WARC File Format 1.0\r\n' in buff
assert 'json-metadata: {"foo": "bar"}\r\n' in buff
def test_record_custom_record(self):
dedup_index = self._get_dedup_index(user=False)
@@ -543,10 +532,10 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert status_headers.get_header('Content-Length') == str(len(buff))
assert status_headers.get_header('WARC-Custom') == 'foo'
assert record.stream.read() == buff
assert record.raw_stream.read() == buff
status_headers = record.status_headers
assert len(record.status_headers.headers) == 2
status_headers = record.http_headers
assert len(record.http_headers.headers) == 2
assert status_headers.get_header('Content-Type') == 'text/plain'
assert status_headers.get_header('Content-Length') == str(len(buff))
@@ -584,4 +573,3 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
assert status_headers.get_header('WARC-Block-Digest') != ''
assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
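The updated tests also show warcio's iterator API: ArchiveIterator(fh) is iterated directly (the old pywb class needed an extra call, ArchiveIterator(fh)()), and records expose http_headers and raw_stream where pywb used status_headers and stream. A minimal reading sketch with an illustrative path:

from warcio.archiveiterator import ArchiveIterator

with open('example.warc.gz', 'rb') as fh:
    for record in ArchiveIterator(fh):
        if record.rec_type == 'response':
            print(record.rec_headers.get_header('WARC-Target-URI'))
            body = record.raw_stream.read()  # raw record payload bytes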

View File

@@ -1,559 +0,0 @@
import tempfile
import uuid
import base64
import hashlib
import datetime
import zlib
import sys
import os
import six
import shutil
import traceback
from collections import OrderedDict
from socket import gethostname
from io import BytesIO
import portalocker
from pywb.utils.loaders import LimitReader, to_native_str
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.timeutils import timestamp20_now, datetime_to_iso_date
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.warc.recordloader import ArcWarcRecord
from pywb.warc.recordloader import ArcWarcRecordLoader
from requests.structures import CaseInsensitiveDict
from pywb.webagg.utils import res_template, BUFF_SIZE
from pywb.recorder.filters import ExcludeNone
# ============================================================================
class BaseWARCWriter(object):
WARC_RECORDS = {'warcinfo': 'application/warc-fields',
'response': 'application/http; msgtype=response',
'revisit': 'application/http; msgtype=response',
'request': 'application/http; msgtype=request',
'metadata': 'application/warc-fields',
}
REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'
FILE_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz'
def __init__(self, gzip=True, dedup_index=None,
header_filter=ExcludeNone(), *args, **kwargs):
self.gzip = gzip
self.dedup_index = dedup_index
self.header_filter = header_filter
self.hostname = gethostname()
self.parser = StatusAndHeadersParser([], verify=False)
@staticmethod
def _iter_stream(stream):
while True:
buf = stream.read(BUFF_SIZE)
if not buf:
return
yield buf
def ensure_digest(self, record):
block_digest = record.rec_headers.get('WARC-Block-Digest')
payload_digest = record.rec_headers.get('WARC-Payload-Digest')
if block_digest and payload_digest:
return
block_digester = self._create_digester()
payload_digester = self._create_digester()
pos = record.stream.tell()
if record.status_headers and hasattr(record.status_headers, 'headers_buff'):
block_digester.update(record.status_headers.headers_buff)
for buf in self._iter_stream(record.stream):
block_digester.update(buf)
payload_digester.update(buf)
record.stream.seek(pos)
record.rec_headers['WARC-Block-Digest'] = str(block_digester)
record.rec_headers['WARC-Payload-Digest'] = str(payload_digester)
def _create_digester(self):
return Digester('sha1')
def _set_header_buff(self, record):
exclude_list = self.header_filter(record)
buff = record.status_headers.to_bytes(exclude_list)
record.status_headers.headers_buff = buff
def write_req_resp(self, req, resp, params):
url = resp.rec_headers.get('WARC-Target-URI')
dt = resp.rec_headers.get('WARC-Date')
#req.rec_headers['Content-Type'] = req.content_type
req.rec_headers['WARC-Target-URI'] = url
req.rec_headers['WARC-Date'] = dt
resp_id = resp.rec_headers.get('WARC-Record-ID')
if resp_id:
req.rec_headers['WARC-Concurrent-To'] = resp_id
resp = self._check_revisit(resp, params)
if not resp:
print('Skipping due to dedup')
return
self._do_write_req_resp(req, resp, params)
def create_req_record(self, req_headers, payload):
len_ = payload.tell()
payload.seek(0)
warc_headers = req_headers
warc_headers['WARC-Type'] = 'request'
if not warc_headers.get('WARC-Record-ID'):
warc_headers['WARC-Record-ID'] = self._make_warc_id()
status_headers = self.parser.parse(payload)
record = ArcWarcRecord('warc', 'request', warc_headers, payload,
status_headers, '', len_)
self._set_header_buff(record)
return record
def read_resp_record(self, resp_headers, payload):
len_ = payload.tell()
payload.seek(0)
warc_headers = self.parser.parse(payload)
warc_headers = CaseInsensitiveDict(warc_headers.headers)
record_type = warc_headers.get('WARC-Type', 'response')
if record_type == 'response':
status_headers = self.parser.parse(payload)
else:
status_headers = None
record = ArcWarcRecord('warc', record_type, warc_headers, payload,
status_headers, '', len_)
if record_type == 'response':
self._set_header_buff(record)
self.ensure_digest(record)
return record_type, record
def create_warcinfo_record(self, filename, info):
warc_headers = {}
warc_headers['WARC-Record-ID'] = self._make_warc_id()
warc_headers['WARC-Type'] = 'warcinfo'
if filename:
warc_headers['WARC-Filename'] = filename
warc_headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow())
warcinfo = BytesIO()
for n, v in six.iteritems(info):
self._header(warcinfo, n, v)
warcinfo.seek(0)
record = ArcWarcRecord('warc', 'warcinfo', warc_headers, warcinfo,
None, '', len(warcinfo.getvalue()))
return record
def create_custom_record(self, uri, payload, record_type, content_type,
warc_headers=None):
len_ = payload.tell()
payload.seek(0)
warc_headers = warc_headers or {}
warc_headers['WARC-Record-ID'] = self._make_warc_id()
warc_headers['WARC-Type'] = record_type
warc_headers['WARC-Target-URI'] = uri
if 'WARC-Date' not in warc_headers:
warc_headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow())
record = ArcWarcRecord('warc', record_type, warc_headers, payload,
None, content_type, len_)
self.ensure_digest(record)
return record
def _check_revisit(self, record, params):
if not self.dedup_index:
return record
try:
url = record.rec_headers.get('WARC-Target-URI')
digest = record.rec_headers.get('WARC-Payload-Digest')
iso_dt = record.rec_headers.get('WARC-Date')
result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
except Exception as e:
traceback.print_exc()
result = None
if result == 'skip':
return None
if isinstance(result, tuple) and result[0] == 'revisit':
record.rec_headers['WARC-Type'] = 'revisit'
record.rec_headers['WARC-Profile'] = self.REVISIT_PROFILE
record.rec_headers['WARC-Refers-To-Target-URI'] = result[1]
record.rec_headers['WARC-Refers-To-Date'] = result[2]
return record
def _write_warc_record(self, out, record, adjust_cl=True):
if self.gzip:
out = GzippingWrapper(out)
self._line(out, b'WARC/1.0')
for n, v in six.iteritems(record.rec_headers):
if n.lower() in ('content-length', 'content-type'):
continue
self._header(out, n, v)
content_type = record.rec_headers.get('Content-Type')
if not content_type:
content_type = record.content_type
if not content_type:
content_type = self.WARC_RECORDS.get(record.rec_headers['WARC-Type'])
if content_type:
self._header(out, 'Content-Type', content_type)
if record.rec_headers['WARC-Type'] == 'revisit':
http_headers_only = True
else:
http_headers_only = False
if record.length:
actual_len = 0
if record.status_headers:
actual_len = len(record.status_headers.headers_buff)
if not http_headers_only:
if adjust_cl:
diff = record.stream.tell() - actual_len
else:
diff = 0
actual_len = record.length - diff
self._header(out, 'Content-Length', str(actual_len))
# add empty line
self._line(out, b'')
# write headers buffer, if any
if record.status_headers:
out.write(record.status_headers.headers_buff)
if not http_headers_only:
for buf in self._iter_stream(record.stream):
out.write(buf)
#out.write(record.stream.read())
# add two lines
self._line(out, b'\r\n')
else:
# add three lines (1 for end of header, 2 for end of record)
self._line(out, b'Content-Length: 0\r\n\r\n')
out.flush()
def _header(self, out, name, value):
if not value:
return
self._line(out, (name + ': ' + str(value)).encode('latin-1'))
def _line(self, out, line):
out.write(line + b'\r\n')
@staticmethod
def _make_warc_id(id_=None):
if not id_:
id_ = uuid.uuid1()
return '<urn:uuid:{0}>'.format(id_)
# ============================================================================
class GzippingWrapper(object):
def __init__(self, out):
self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
self.out = out
def write(self, buff):
#if isinstance(buff, str):
# buff = buff.encode('utf-8')
buff = self.compressor.compress(buff)
self.out.write(buff)
def flush(self):
buff = self.compressor.flush()
self.out.write(buff)
self.out.flush()
# ============================================================================
class Digester(object):
def __init__(self, type_='sha1'):
self.type_ = type_
self.digester = hashlib.new(type_)
def update(self, buff):
self.digester.update(buff)
def __str__(self):
return self.type_ + ':' + to_native_str(base64.b32encode(self.digester.digest()))
# ============================================================================
class MultiFileWARCWriter(BaseWARCWriter):
def __init__(self, dir_template, filename_template=None, max_size=0,
max_idle_secs=1800, *args, **kwargs):
super(MultiFileWARCWriter, self).__init__(*args, **kwargs)
if not filename_template:
dir_template, filename_template = os.path.split(dir_template)
dir_template += os.path.sep
if not filename_template:
filename_template = self.FILE_TEMPLATE
self.dir_template = dir_template
self.key_template = kwargs.get('key_template', self.dir_template)
self.filename_template = filename_template
self.max_size = max_size
if max_idle_secs > 0:
self.max_idle_time = datetime.timedelta(seconds=max_idle_secs)
else:
self.max_idle_time = None
self.fh_cache = {}
def get_new_filename(self, dir_, params):
timestamp = timestamp20_now()
randstr = base64.b32encode(os.urandom(5)).decode('utf-8')
filename = dir_ + res_template(self.filename_template, params,
hostname=self.hostname,
timestamp=timestamp,
random=randstr)
return filename
def allow_new_file(self, filename, params):
return True
def _open_file(self, filename, params):
path, name = os.path.split(filename)
try:
os.makedirs(path)
except:
pass
fh = open(filename, 'a+b')
if self.dedup_index:
self.dedup_index.add_warc_file(filename, params)
return fh
def _close_file(self, fh):
try:
portalocker.lock(fh, portalocker.LOCK_UN)
fh.close()
except Exception as e:
print(e)
def get_dir_key(self, params):
return res_template(self.key_template, params)
def close_key(self, dir_key):
if isinstance(dir_key, dict):
dir_key = self.get_dir_key(dir_key)
result = self.fh_cache.pop(dir_key, None)
if not result:
return
out, filename = result
self._close_file(out)
return filename
def close_file(self, match_filename):
for dir_key, out, filename in self.iter_open_files():
if filename == match_filename:
return self.close_key(dir_key)
def _is_write_resp(self, resp, params):
return True
def _is_write_req(self, req, params):
return True
def write_record(self, record, params=None):
params = params or {}
self._do_write_req_resp(None, record, params)
def _do_write_req_resp(self, req, resp, params):
def write_callback(out, filename):
url = resp.rec_headers.get('WARC-Target-URI')
#print('Writing req/resp {0} to {1} '.format(url, filename))
if resp and self._is_write_resp(resp, params):
self._write_warc_record(out, resp)
if req and self._is_write_req(req, params):
self._write_warc_record(out, req)
return self._write_to_file(params, write_callback)
def write_stream_to_file(self, params, stream):
def write_callback(out, filename):
#print('Writing stream to {0}'.format(filename))
shutil.copyfileobj(stream, out)
return self._write_to_file(params, write_callback)
def _write_to_file(self, params, write_callback):
full_dir = res_template(self.dir_template, params)
dir_key = self.get_dir_key(params)
result = self.fh_cache.get(dir_key)
close_file = False
if result:
out, filename = result
is_new = False
else:
filename = self.get_new_filename(full_dir, params)
if not self.allow_new_file(filename, params):
return False
out = self._open_file(filename, params)
is_new = True
try:
start = out.tell()
write_callback(out, filename)
out.flush()
new_size = out.tell()
out.seek(start)
if self.dedup_index:
self.dedup_index.add_urls_to_index(out, params,
filename,
new_size - start)
return True
except Exception as e:
traceback.print_exc()
close_file = True
return False
finally:
# check for rollover
if self.max_size and new_size > self.max_size:
close_file = True
if close_file:
self._close_file(out)
if not is_new:
self.fh_cache.pop(dir_key, None)
elif is_new:
portalocker.lock(out, portalocker.LOCK_EX | portalocker.LOCK_NB)
self.fh_cache[dir_key] = (out, filename)
def iter_open_files(self):
for n, v in list(self.fh_cache.items()):
out, filename = v
yield n, out, filename
def close(self):
for dir_key, out, filename in self.iter_open_files():
self._close_file(out)
self.fh_cache = {}
def close_idle_files(self):
if not self.max_idle_time:
return
now = datetime.datetime.now()
for dir_key, out, filename in self.iter_open_files():
try:
mtime = os.path.getmtime(filename)
except:
self.close_key(dir_key)
return
mtime = datetime.datetime.fromtimestamp(mtime)
if (now - mtime) > self.max_idle_time:
print('Closing idle ' + filename)
self.close_key(dir_key)
# ============================================================================
class PerRecordWARCWriter(MultiFileWARCWriter):
def __init__(self, *args, **kwargs):
kwargs['max_size'] = 1
super(PerRecordWARCWriter, self).__init__(*args, **kwargs)
# ============================================================================
class SimpleTempWARCWriter(BaseWARCWriter):
def __init__(self, *args, **kwargs):
super(SimpleTempWARCWriter, self).__init__(*args, **kwargs)
self.out = self._create_buffer()
def _create_buffer(self):
return tempfile.SpooledTemporaryFile(max_size=512*1024)
def _do_write_req_resp(self, req, resp, params):
self._write_warc_record(self.out, resp)
self._write_warc_record(self.out, req)
def write_record(self, record, params=None):
self._write_warc_record(self.out, record)
def get_buffer(self):
pos = self.out.tell()
self.out.seek(0)
buff = self.out.read()
self.out.seek(pos)
return buff
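Digest computation moves into warcio along with the rest of this deleted module. For reference, a self-contained equivalent of the removed Digester, producing the same '<type>:<base32>' form used in WARC-Payload-Digest headers:

import base64
import hashlib

def payload_digest(data, type_='sha1'):
    # same output shape as the deleted Digester.__str__ above
    digester = hashlib.new(type_)
    digester.update(data)
    return type_ + ':' + base64.b32encode(digester.digest()).decode('ascii')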

View File

@@ -1,5 +1,5 @@
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.timeutils import datetime_to_http_date
from warcio.statusandheaders import StatusAndHeaders
from warcio.timeutils import datetime_to_http_date
from datetime import datetime, timedelta
import six

View File

@@ -12,10 +12,11 @@ from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
from pywb.rewrite.rewriterules import RewriteRules
from pywb.utils.dsrules import RuleSet
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader
from pywb.utils.loaders import to_native_str
from warcio.statusandheaders import StatusAndHeaders
from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.bufferedreaders import ChunkedDataReader, BufferedReader
from warcio.utils import to_native_str
from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter

View File

@@ -11,10 +11,11 @@ import os
from six.moves.urllib.parse import urlsplit
import six
from warcio.timeutils import timestamp_now
from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.timeutils import timestamp_now
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize
from pywb.rewrite.rewrite_content import RewriteContent

View File

@@ -50,9 +50,9 @@ HTTP Headers Rewriting
from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.timeutils import datetime_to_http_date
from warcio.timeutils import datetime_to_http_date
from datetime import datetime
import pprint

View File

@@ -1,5 +1,5 @@
from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter, HostScopeCookieRewriter
from pywb.utils.timeutils import datetime_to_http_date
from warcio.timeutils import datetime_to_http_date
from six.moves import zip
import redis

View File

@@ -6,10 +6,10 @@ from pywb.framework.archivalrouter import Route
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl
from pywb.warc.recordloader import ArcWarcRecordLoader
from warcio.recordloader import ArcWarcRecordLoader
from pywb.webapp.live_rewrite_handler import RewriteHandler
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from warcio.timeutils import http_date_to_timestamp
from pywb.cdx.cdxobject import CDXObject
from io import BytesIO
@@ -81,7 +81,7 @@ class PlatformHandler(RewriteHandler):
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
record.status_headers,
record.http_headers,
record.stream,
head_insert_func,
urlkey,

View File

@@ -6,14 +6,15 @@ from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.wbexception import WbException
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.bufferedreaders import BufferedReader
from warcio.timeutils import http_date_to_timestamp
from warcio.bufferedreaders import BufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from pywb.webagg.utils import BUFF_SIZE
from pywb.cdx.cdxobject import CDXObject
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.framework.wbrequestresponse import WbResponse
from pywb.webagg.utils import MementoUtils, buffer_iter
@@ -200,11 +201,11 @@ class RewriterApp(object):
self._add_custom_params(cdx, r.headers, kwargs)
if readd_range:
content_length = (record.status_headers.
content_length = (record.http_headers.
get_header('Content-Length'))
try:
content_length = int(content_length)
record.status_headers.add_range(0, content_length,
record.http_headers.add_range(0, content_length,
content_length)
except (ValueError, TypeError):
pass
@@ -229,8 +230,8 @@ class RewriterApp(object):
cookie_key)
result = self.content_rewriter.rewrite_content(urlrewriter,
record.status_headers,
record.stream,
record.http_headers,
record.raw_stream,
head_insert_func,
urlkey,
cdx,

View File

@@ -1,5 +1,6 @@
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.utils.timeutils import timestamp_now
from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
from warcio.timeutils import timestamp_now
from pywb.utils.loaders import load
from six.moves.urllib.parse import urlsplit

View File

@@ -1,336 +0,0 @@
from io import BytesIO
import zlib
import brotli
#=================================================================
def gzip_decompressor():
"""
Decompressor which can decompress a gzip stream
"""
return zlib.decompressobj(16 + zlib.MAX_WBITS)
def deflate_decompressor():
return zlib.decompressobj()
def deflate_decompressor_alt():
return zlib.decompressobj(-zlib.MAX_WBITS)
def brotli_decompressor():
decomp = brotli.Decompressor()
decomp.unused_data = None
return decomp
#=================================================================
class BufferedReader(object):
"""
A wrapping line reader which wraps an existing reader.
Read operations operate on underlying buffer, which is filled to
block_size (1024 default)
If an optional decompress type is specified,
data is fed through the decompressor when read from the buffer.
Currently supported decompression: gzip
If unspecified, default decompression is None
If decompression is specified, and decompress fails on first try,
data is assumed to not be compressed and no exception is thrown.
If a failure occurs after data has been
partially decompressed, the exception is propagated.
"""
DECOMPRESSORS = {'gzip': gzip_decompressor,
'deflate': deflate_decompressor,
'deflate_alt': deflate_decompressor_alt,
'br': brotli_decompressor
}
def __init__(self, stream, block_size=1024,
decomp_type=None,
starting_data=None):
self.stream = stream
self.block_size = block_size
self._init_decomp(decomp_type)
self.buff = None
self.starting_data = starting_data
self.num_read = 0
self.buff_size = 0
def set_decomp(self, decomp_type):
self._init_decomp(decomp_type)
def _init_decomp(self, decomp_type):
self.num_block_read = 0
if decomp_type:
try:
self.decomp_type = decomp_type
self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]()
except KeyError:
raise Exception('Decompression type not supported: ' +
decomp_type)
else:
self.decomp_type = None
self.decompressor = None
def _fillbuff(self, block_size=None):
if not self.empty():
return
# can't read past next member
if self.rem_length() > 0:
return
if self.starting_data:
data = self.starting_data
self.starting_data = None
else:
if not block_size:
block_size = self.block_size
data = self.stream.read(block_size)
self._process_read(data)
def _process_read(self, data):
data = self._decompress(data)
self.buff_size = len(data)
self.num_read += self.buff_size
self.num_block_read += self.buff_size
self.buff = BytesIO(data)
def _decompress(self, data):
if self.decompressor and data:
try:
data = self.decompressor.decompress(data)
except Exception as e:
# if first read attempt, assume non-gzipped stream
if self.num_block_read == 0:
if self.decomp_type == 'deflate':
self._init_decomp('deflate_alt')
data = self._decompress(data)
else:
self.decompressor = None
# otherwise (partly decompressed), something is wrong
else:
print(str(e))
return b''
return data
def read(self, length=None):
"""
Fill bytes and read some number of bytes
(up to length if specified)
<= length bytes may be read if reached the end of input
if at buffer boundary, will attempt to read again until
specified length is read
"""
all_buffs = []
while length is None or length > 0:
self._fillbuff()
buff = self.buff.read(length)
if not buff:
break
all_buffs.append(buff)
if length:
length -= len(buff)
return b''.join(all_buffs)
def readline(self, length=None):
"""
Fill buffer and read a full line from the buffer
(up to specified length, if provided)
If no newline found at end, try filling buffer again in case
at buffer boundary.
"""
if length == 0:
return b''
self._fillbuff()
linebuff = self.buff.readline(length)
# we may be at a boundary
while not linebuff.endswith(b'\n'):
if length:
length -= len(linebuff)
if length <= 0:
break
self._fillbuff()
if self.empty():
break
linebuff += self.buff.readline(length)
return linebuff
def empty(self):
return not self.buff or self.buff.tell() >= self.buff_size
def read_next_member(self):
if not self.decompressor or not self.decompressor.unused_data:
return False
self.starting_data = self.decompressor.unused_data
self._init_decomp(self.decomp_type)
return True
def rem_length(self):
rem = 0
if self.buff:
rem = self.buff_size - self.buff.tell()
if self.decompressor and self.decompressor.unused_data:
rem += len(self.decompressor.unused_data)
return rem
def close(self):
if self.stream:
self.stream.close()
self.stream = None
@classmethod
def get_supported_decompressors(cls):
return cls.DECOMPRESSORS.keys()
#=================================================================
class DecompressingBufferedReader(BufferedReader):
"""
A BufferedReader which defaults to gzip decompression,
(unless different type specified)
"""
def __init__(self, *args, **kwargs):
if 'decomp_type' not in kwargs:
kwargs['decomp_type'] = 'gzip'
super(DecompressingBufferedReader, self).__init__(*args, **kwargs)
#=================================================================
class ChunkedDataException(Exception):
def __init__(self, msg, data=b''):
Exception.__init__(self, msg)
self.data = data
#=================================================================
class ChunkedDataReader(BufferedReader):
r"""
A ChunkedDataReader is a DecompressingBufferedReader
which also supports de-chunking of the data if it happens
to be http 'chunk-encoded'.
If at any point the chunked header is not available, the stream is
assumed to not be chunked and no more dechunking occurs.
"""
def __init__(self, stream, raise_exceptions=False, **kwargs):
super(ChunkedDataReader, self).__init__(stream, **kwargs)
self.all_chunks_read = False
self.not_chunked = False
# if False, we'll use best-guess fallback for parse errors
self.raise_chunked_data_exceptions = raise_exceptions
def _fillbuff(self, block_size=None):
if self.not_chunked:
return super(ChunkedDataReader, self)._fillbuff(block_size)
# Loop over chunks until there is some data (not empty())
# In particular, gzipped data may require multiple chunks to
# return any decompressed result
while (self.empty() and
not self.all_chunks_read and
not self.not_chunked):
try:
length_header = self.stream.readline(64)
self._try_decode(length_header)
except ChunkedDataException as e:
if self.raise_chunked_data_exceptions:
raise
# Can't parse the data as chunked.
# It's possible that non-chunked data is served
# with a Transfer-Encoding: chunked.
# Treat this as non-chunk encoded from here on.
self._process_read(length_header + e.data)
self.not_chunked = True
# parse as block as non-chunked
return super(ChunkedDataReader, self)._fillbuff(block_size)
def _try_decode(self, length_header):
# decode length header
try:
chunk_size = int(length_header.strip().split(b';')[0], 16)
except ValueError:
raise ChunkedDataException(b"Couldn't decode length header " +
length_header)
if not chunk_size:
# chunk_size 0 indicates end of file
self.all_chunks_read = True
self._process_read(b'')
return
data_len = 0
data = b''
# read chunk
while data_len < chunk_size:
new_data = self.stream.read(chunk_size - data_len)
# if we unexpectedly run out of data,
# either raise an exception or just stop reading,
# assuming file was cut off
if not new_data:
if self.raise_chunked_data_exceptions:
msg = 'Ran out of data before end of chunk'
raise ChunkedDataException(msg, data)
else:
chunk_size = data_len
self.all_chunks_read = True
data += new_data
data_len = len(data)
# if we successfully read a block without running out,
# it should end in \r\n
if not self.all_chunks_read:
clrf = self.stream.read(2)
if clrf != b'\r\n':
raise ChunkedDataException(b"Chunk terminator not found.",
data)
# hand to base class for further processing
self._process_read(data)
def read(self, length=None):
""" read bytes from stream, if length specified,
may read across multiple chunks to get exact length
"""
buf = super(ChunkedDataReader, self).read(length)
if not length:
return buf
# if length specified, attempt to read exact length
rem = length - len(buf)
while rem > 0:
new_buf = super(ChunkedDataReader, self).read(rem)
if not new_buf:
break
buf += new_buf
rem -= len(new_buf)
return buf
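This reader family now ships as warcio.bufferedreaders, as the import changes earlier in the diff show. A quick round-trip sketch using the gzip default:

import zlib
from io import BytesIO
from warcio.bufferedreaders import DecompressingBufferedReader

# gzip-compress a small payload in memory
comp = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
data = comp.compress(b'ABC\n1234\n') + comp.flush()

# decomp_type defaults to 'gzip'; uncompressed input falls through unchanged
reader = DecompressingBufferedReader(BytesIO(data))
assert reader.read() == b'ABC\n1234\n'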

View File

@@ -17,6 +17,7 @@ import base64
import cgi
from io import open, BytesIO
from warcio.limitreader import LimitReader
try:
from boto import connect_s3
@@ -500,78 +501,6 @@ class HMACCookieMaker(object):
return cookie
#=================================================================
# Limit Reader
#=================================================================
class LimitReader(object):
"""
A reader which will not read more than specified limit
"""
def __init__(self, stream, limit):
self.stream = stream
self.limit = limit
if hasattr(stream, 'tell'):
self.tell = self._tell
def _update(self, buff):
length = len(buff)
self.limit -= length
return buff
def read(self, length=None):
if length is not None:
length = min(length, self.limit)
else:
length = self.limit
if length == 0:
return b''
buff = self.stream.read(length)
return self._update(buff)
def readline(self, length=None):
if length is not None:
length = min(length, self.limit)
else:
length = self.limit
if length == 0:
return b''
buff = self.stream.readline(length)
return self._update(buff)
def close(self):
self.stream.close()
def _tell(self):
return self.stream.tell()
@staticmethod
def wrap_stream(stream, content_length):
"""
If given content_length is an int > 0, wrap the stream
in a LimitReader. Otherwise, return the stream unaltered
"""
try:
content_length = int(content_length)
if content_length >= 0:
# optimize: if already a LimitStream, set limit to
# the smaller of the two limits
if isinstance(stream, LimitReader):
stream.limit = min(stream.limit, content_length)
else:
stream = LimitReader(stream, content_length)
except (ValueError, TypeError):
pass
return stream
# ============================================================================
BlockLoader.init_default_loaders()
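LimitReader likewise moves to warcio.limitreader (imported at the top of this hunk), keeping the wrap_stream() helper documented in the deleted code. A short sketch:

from io import BytesIO
from warcio.limitreader import LimitReader

stream = BytesIO(b'0123456789')

# wrap_stream() wraps only when content_length parses as an int >= 0
limited = LimitReader.wrap_stream(stream, '4')
assert limited.read() == b'0123'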

View File

@@ -1,280 +0,0 @@
"""
Representation and parsing of HTTP-style status + headers
"""
from pprint import pformat
from copy import copy
from six.moves import range
from six import iteritems
from pywb.utils.loaders import to_native_str
import uuid
WRAP_WIDTH = 80
#=================================================================
class StatusAndHeaders(object):
"""
Representation of parsed http-style status line and headers
Status Line is the first line of a request/response
Headers is a list of (name, value) tuples
An optional protocol which appears on first line may be specified
"""
def __init__(self, statusline, headers, protocol='', total_len=0):
self.statusline = statusline
self.headers = headers
self.protocol = protocol
self.total_len = total_len
def get_header(self, name):
"""
return header (name, value)
if found
"""
name_lower = name.lower()
for value in self.headers:
if value[0].lower() == name_lower:
return value[1]
def replace_header(self, name, value):
"""
replace header with new value or add new header
return old header value, if any
"""
name_lower = name.lower()
for index in range(len(self.headers) - 1, -1, -1):
curr_name, curr_value = self.headers[index]
if curr_name.lower() == name_lower:
self.headers[index] = (curr_name, value)
return curr_value
self.headers.append((name, value))
return None
def replace_headers(self, header_dict):
"""
replace all headers in header_dict that already exist
add any remaining headers
"""
header_dict = copy(header_dict)
for index in range(len(self.headers) - 1, -1, -1):
curr_name, curr_value = self.headers[index]
name_lower = curr_name.lower()
if name_lower in header_dict:
self.headers[index] = (curr_name, header_dict[name_lower])
del header_dict[name_lower]
for name, value in iteritems(header_dict):
self.headers.append((name, value))
def remove_header(self, name):
"""
Remove header (case-insensitive)
return True if header removed, False otherwise
"""
name_lower = name.lower()
for index in range(len(self.headers) - 1, -1, -1):
if self.headers[index][0].lower() == name_lower:
del self.headers[index]
return True
return False
def get_statuscode(self):
"""
Return the statuscode part of the status response line
(Assumes no protocol in the statusline)
"""
code = self.statusline.split(' ', 1)[0]
return code
def validate_statusline(self, valid_statusline):
"""
Check that the statusline is valid, eg. starts with a numeric
code. If not, replace with passed in valid_statusline
"""
code = self.get_statuscode()
try:
code = int(code)
assert(code > 0)
return True
except(ValueError, AssertionError):
self.statusline = valid_statusline
return False
def add_range(self, start, part_len, total_len):
"""
Add range headers indicating that this a partial response
"""
content_range = 'bytes {0}-{1}/{2}'.format(start,
start + part_len - 1,
total_len)
self.statusline = '206 Partial Content'
self.replace_header('Content-Range', content_range)
self.replace_header('Accept-Ranges', 'bytes')
return self
def __repr__(self):
headers_str = pformat(self.headers, indent=2, width=WRAP_WIDTH)
return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
headers = {2})".format(self.protocol, self.statusline, headers_str)
def __eq__(self, other):
return (self.statusline == other.statusline and
self.headers == other.headers and
self.protocol == other.protocol)
def __str__(self, exclude_list=None):
return self.to_str(exclude_list)
def __bool__(self):
return bool(self.statusline or self.headers)
__nonzero__ = __bool__
def to_str(self, exclude_list):
string = self.protocol
if string and self.statusline:
string += ' '
if self.statusline:
string += self.statusline
if string:
string += '\r\n'
for h in self.headers:
if exclude_list and h[0].lower() in exclude_list:
continue
string += ': '.join(h) + '\r\n'
return string
def to_bytes(self, exclude_list=None):
return self.to_str(exclude_list).encode('iso-8859-1') + b'\r\n'
#=================================================================
def _strip_count(string, total_read):
length = len(string)
return string.rstrip(), total_read + length
#=================================================================
class StatusAndHeadersParser(object):
"""
Parser which consumes a stream supporting readline() to read
status and headers and return a StatusAndHeaders object
"""
def __init__(self, statuslist, verify=True):
self.statuslist = statuslist
self.verify = verify
def parse(self, stream, full_statusline=None):
"""
parse stream for status line and headers
return a StatusAndHeaders object
support continuation headers starting with space or tab
"""
def readline():
return to_native_str(stream.readline())
# status line w newlines intact
if full_statusline is None:
full_statusline = readline()
else:
full_statusline = to_native_str(full_statusline)
statusline, total_read = _strip_count(full_statusline, 0)
headers = []
# at end of stream
if total_read == 0:
raise EOFError()
elif not statusline:
return StatusAndHeaders(statusline=statusline,
headers=headers,
protocol='',
total_len=total_read)
# validate only if verify is set
if self.verify:
protocol_status = self.split_prefix(statusline, self.statuslist)
if not protocol_status:
msg = 'Expected Status Line starting with {0} - Found: {1}'
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, full_statusline)
else:
protocol_status = statusline.split(' ', 1)
line, total_read = _strip_count(readline(), total_read)
while line:
result = line.split(':', 1)
if len(result) == 2:
name = result[0].rstrip(' \t')
value = result[1].lstrip()
else:
name = result[0]
value = None
next_line, total_read = _strip_count(readline(),
total_read)
# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
if value is not None:
value += next_line
next_line, total_read = _strip_count(readline(),
total_read)
if value is not None:
header = (name, value)
headers.append(header)
line = next_line
if len(protocol_status) > 1:
statusline = protocol_status[1].strip()
else:
statusline = ''
return StatusAndHeaders(statusline=statusline,
headers=headers,
protocol=protocol_status[0],
total_len=total_read)
@staticmethod
def split_prefix(key, prefixs):
"""
split key string into prefix and remainder
for first matching prefix from a list
"""
key_upper = key.upper()
for prefix in prefixs:
if key_upper.startswith(prefix):
plen = len(prefix)
return (key_upper[:plen], key[plen:])
@staticmethod
def make_warc_id(id_=None):
if not id_:
id_ = uuid.uuid1()
return '<urn:uuid:{0}>'.format(id_)
#=================================================================
class StatusAndHeadersParserException(Exception):
"""
status + headers parsing exception
"""
def __init__(self, msg, statusline):
super(StatusAndHeadersParserException, self).__init__(msg)
self.statusline = statusline
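The replacement in warcio.statusandheaders keeps essentially this interface. A small usage sketch of construction, case-insensitive lookup, and serialization:

from warcio.statusandheaders import StatusAndHeaders

headers = StatusAndHeaders('200 OK',
                           [('Content-Type', 'text/plain'),
                            ('Content-Length', '5')],
                           protocol='HTTP/1.1')

headers.replace_header('content-type', 'text/html')  # matches case-insensitively
assert headers.get_header('Content-Length') == '5'

raw = headers.to_bytes()  # b'HTTP/1.1 200 OK\r\n' + headers + blank line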

View File

@@ -1,174 +0,0 @@
r"""
# DecompressingBufferedReader Tests
#=================================================================
# DecompressingBufferedReader readline()
>>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline())
' CDX N b a m s k r M S V g\n'
# detect not compressed
>>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline())
' CDX N b a m s k r M S V g\n'
# decompress with on the fly compression, default gzip compression
>>> print_str(DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n'))).read())
'ABC\n1234\n'
# decompress with on the fly compression, default 'inflate' compression
>>> print_str(DecompressingBufferedReader(BytesIO(compress_alt('ABC\n1234\n')), decomp_type='deflate').read())
'ABC\n1234\n'
# error: invalid compress type
>>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = 'bzip2').read()
Traceback (most recent call last):
Exception: Decompression type not supported: bzip2
# invalid output when reading compressed data as not compressed
>>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = None).read() != b'ABC'
True
# DecompressingBufferedReader readline() with decompression (zipnum file, no header)
>>> print_str(DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline())
'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\n'
# test very small block size
>>> dbr = DecompressingBufferedReader(BytesIO(b'ABCDEFG\nHIJKLMN\nOPQR\nXYZ'), block_size = 3)
>>> print_str(dbr.readline()); print_str(dbr.readline(4)); print_str(dbr.readline()); print_str(dbr.readline()); print_str(dbr.readline(2)); print_str(dbr.readline()); print_str(dbr.readline())
'ABCDEFG\n'
'HIJK'
'LMN\n'
'OPQR\n'
'XY'
'Z'
''
# test zero length reads
>>> x = DecompressingBufferedReader(LimitReader(BytesIO(b'\r\n'), 1))
>>> print_str(x.readline(0)); print_str(x.read(0))
''
''
# Chunk-Decoding Buffered Reader Tests
#=================================================================
Properly formatted chunked data:
>>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n0\r\n\r\n"));
>>> print_str(c.read() + c.read() + c.read())
'1234'
Non-chunked data:
>>> print_str(ChunkedDataReader(BytesIO(b"xyz123!@#")).read())
'xyz123!@#'
Non-chunked, compressed data, specify decomp_type
>>> print_str(ChunkedDataReader(BytesIO(compress('ABCDEF')), decomp_type='gzip').read())
'ABCDEF'
Non-chunked, compressed data, specify compression separately
>>> c = ChunkedDataReader(BytesIO(compress('ABCDEF'))); c.set_decomp('gzip'); print_str(c.read())
'ABCDEF'
Non-chunked, compressed data, wrap in DecompressingBufferedReader
>>> print_str(DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read())
'\nABCDEF\nGHIJ'
Chunked compressed data
Split compressed stream into 10-byte chunk and a remainder chunk
>>> b = compress('ABCDEFGHIJKLMNOP')
>>> l = len(b)
>>> in_ = format(10, 'x').encode('utf-8') + b"\r\n" + b[:10] + b"\r\n" + format(l - 10, 'x').encode('utf-8') + b"\r\n" + b[10:] + b"\r\n0\r\n\r\n"
>>> c = ChunkedDataReader(BytesIO(in_), decomp_type='gzip')
>>> print_str(c.read())
'ABCDEFGHIJKLMNOP'
Starts like chunked data, but isn't:
>>> c = ChunkedDataReader(BytesIO(b"1\r\nxyz123!@#"));
>>> print_str(c.read() + c.read())
'1\r\nx123!@#'
Chunked data cut off part way through:
>>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"));
>>> print_str(c.read() + c.read())
'123412'
Zero-Length chunk:
>>> print_str(ChunkedDataReader(BytesIO(b"0\r\n\r\n")).read())
''
"""
from io import BytesIO
from pywb.utils.bufferedreaders import ChunkedDataReader, ChunkedDataException
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.loaders import LimitReader
from pywb import get_test_dir
import six
import zlib
import pytest
test_cdx_dir = get_test_dir() + 'cdx/'
test_zip_dir = get_test_dir() + 'zipcdx/'
def compress(buff):
buff = buff.encode('utf-8')
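# wbits of zlib.MAX_WBITS + 16 makes zlib emit a gzip container (RFC 1952) rather than a bare zlib stream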
compressobj = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
compressed = compressobj.compress(buff)
compressed += compressobj.flush()
return compressed
# plain "inflate"
def compress_alt(buff):
buff = buff.encode('utf-8')
compressobj = zlib.compressobj(6, zlib.DEFLATED)
compressed = compressobj.compress(buff)
compressed += compressobj.flush()
# strip the 2-byte zlib header and 4-byte Adler-32 trailer, leaving a raw deflate stream
compressed = compressed[2:-4]
return compressed
# Brotli
def test_brotli():
with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
x = DecompressingBufferedReader(fh, decomp_type='br')
assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
# Compression
def test_compress_mix():
# error: compressed member, followed by not compressed -- now allowed!
x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip')
b = x.read()
assert b == b'ABC'
x.read_next_member()
assert x.read() == b'123'
#with pytest.raises(zlib.error):
# x.read()
#error: Error -3 while decompressing: incorrect header check
# Errors
def test_err_chunk_cut_off():
# Chunked data cut off with exceptions
c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"), raise_exceptions=True)
with pytest.raises(ChunkedDataException):
c.read() + c.read()
#ChunkedDataException: Ran out of data before end of chunk
def print_str(string):
return string.decode('utf-8') if six.PY3 else string
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,27 +1,5 @@
#=================================================================
r"""
# LimitReader Tests
>>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
'abcdefghji'
>>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
'abcdefgh'
>>> LimitReader.wrap_stream(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8), 4).readline(26)
'abcd'
>>> read_multiple(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'
# zero-length read
>>> print_str(LimitReader(StringIO('a'), 0).readline(0))
''
# don't wrap if invalid length
>>> b = StringIO('b')
>>> LimitReader.wrap_stream(b, 'abc') == b
True
#=================================================================
# BlockLoader Tests (includes LimitReader)
# Ensure an attempt to read more than 100 bytes reads exactly 100 bytes
>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read(400))
@ -141,28 +119,16 @@ from io import BytesIO
import requests
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
from pywb.utils.loaders import extract_client_cookie, extract_post_query
from pywb.utils.loaders import append_post_query, read_last_line
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from warcio.bufferedreaders import DecompressingBufferedReader
from pywb import get_test_dir
test_cdx_dir = get_test_dir() + 'cdx/'
def read_multiple(reader, inc_reads):
result = None
for x in inc_reads:
result = reader.read(x)
return result
def seek_read_full(seekable_reader, offset):
seekable_reader.seek(offset)
seekable_reader.readline() #skip
return seekable_reader.readline()
def test_s3_read_1():
pytest.importorskip('boto')
@ -177,15 +143,6 @@ def test_s3_read_1():
assert reader.readline() == b'WARC/1.0\r\n'
assert reader.readline() == b'WARC-Type: response\r\n'
def test_limit_post():
reader = LimitReader(BytesIO(b'abcdefg'), 3)
r = requests.request(method='POST',
url='http://httpbin.org/post',
data=reader,
headers={'Content-Length': '3'})
assert '"abc"' in r.text
# Error
def test_err_no_such_file():
# no such file

View File

@ -1,182 +0,0 @@
"""
>>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
>>> st1
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])
# add range
>>> StatusAndHeaders(statusline = '200 OK', headers=[('Content-Type', 'text/plain')]).add_range(10, 4, 100)
StatusAndHeaders(protocol = '', statusline = '206 Partial Content', headers = [ ('Content-Type', 'text/plain'),
('Content-Range', 'bytes 10-13/100'),
('Accept-Ranges', 'bytes')])
# other protocol expected
>>> StatusAndHeadersParser(['Other']).parse(StringIO(status_headers_1)) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
>>> StatusAndHeadersParser(['Other'], verify=False).parse(StringIO(status_headers_1))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])
# verify protocol line
>>> StatusAndHeadersParser(['HTTP/1.0'], verify=True).parse(StringIO(unknown_protocol_headers)) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0'] - Found: OtherBlah
# allow unexpected/invalid protocol line
>>> StatusAndHeadersParser(['HTTP/1.0'], verify=False).parse(StringIO(unknown_protocol_headers))
StatusAndHeaders(protocol = 'OtherBlah', statusline = '', headers = [('Foo', 'Bar')])
# test equality op
>>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
True
# replace header, print new headers
>>> st1.replace_header('some', 'Another-Value'); st1
'Value'
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Another-Value'),
('Multi-Line', 'Value1 Also This')])
# remove header
>>> st1.remove_header('some')
True
# already removed
>>> st1.remove_header('Some')
False
# empty
>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_3))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
# case-insensitive match
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_4))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
"""
from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders
from six import StringIO
import pytest
status_headers_1 = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
HTTP/1.0 200 OK\r\n\
Some: Value\r\n\
Multi-Line: Value1\r\n\
Also This\r\n\
\r\n\
Body"
status_headers_2 = """
"""
status_headers_3 = "\
HTTP/1.0 204 Empty\r\n\
Content-Type: Value\r\n\
%Invalid%\r\n\
\tMultiline\r\n\
Content-Length: 0\r\n\
\r\n"
status_headers_4 = "\
http/1.0 204 empty\r\n\
Content-Type: Value\r\n\
%Invalid%\r\n\
\tMultiline\r\n\
Content-Length: 0\r\n\
\r\n"
unknown_protocol_headers = "\
OtherBlah\r\n\
Foo: Bar\r\n\
\r\n"
req_headers = "\
GET / HTTP/1.0\r\n\
Foo: Bar\r\n\
Content-Length: 0\r\n"
if __name__ == "__main__":
import doctest
doctest.testmod()
def test_to_str_1():
res = str(StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1)))
exp = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
Multi-Line: Value1 Also This\r\n\
"
assert(res == exp)
def test_to_str_exclude():
sah = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
res = sah.to_str(['multi-line'])
exp = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
"
assert(res == exp)
assert(sah.to_bytes(['multi-line']) == (exp.encode('latin-1') + b'\r\n'))
def test_to_str_2():
res = str(StatusAndHeadersParser(['GET']).parse(StringIO(req_headers)))
assert(res == req_headers)
res = str(StatusAndHeadersParser(['GET']).parse(StringIO(req_headers + '\r\n')))
assert(res == req_headers)
def test_to_str_with_remove():
res = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers))
res.remove_header('Foo')
exp = "\
GET / HTTP/1.0\r\n\
Content-Length: 0\r\n"
assert(str(res) == exp)
def test_status_empty():
with pytest.raises(EOFError):
StatusAndHeadersParser([], verify=False).parse(StringIO(''))
def test_status_one_word():
res = StatusAndHeadersParser(['GET'], verify=False).parse(StringIO('A'))
assert(str(res) == 'A\r\n')

View File

@ -1,330 +0,0 @@
"""
utility functions for converting between
datetime, iso date and 14-digit timestamp
"""
import re
import time
import datetime
import calendar
from six.moves import map
from email.utils import parsedate, formatdate
#=================================================================
# str <-> datetime conversion
#=================================================================
DATE_TIMESPLIT = re.compile(r'[^\d]')
TIMESTAMP_14 = '%Y%m%d%H%M%S'
ISO_DT = '%Y-%m-%dT%H:%M:%SZ'
PAD_14_DOWN = '10000101000000'
PAD_14_UP = '29991231235959'
PAD_6_UP = '299912'
def iso_date_to_datetime(string):
"""
>>> iso_date_to_datetime('2013-12-26T10:11:12Z')
datetime.datetime(2013, 12, 26, 10, 11, 12)
>>> iso_date_to_datetime('2013-12-26T10:11:12')
datetime.datetime(2013, 12, 26, 10, 11, 12)
"""
nums = DATE_TIMESPLIT.split(string)
if nums[-1] == '':
nums = nums[:-1]
the_datetime = datetime.datetime(*map(int, nums))
return the_datetime
def http_date_to_datetime(string):
"""
>>> http_date_to_datetime('Thu, 26 Dec 2013 09:50:10 GMT')
datetime.datetime(2013, 12, 26, 9, 50, 10)
"""
return datetime.datetime(*parsedate(string)[:6])
def datetime_to_http_date(the_datetime):
"""
>>> datetime_to_http_date(datetime.datetime(2013, 12, 26, 9, 50, 10))
'Thu, 26 Dec 2013 09:50:10 GMT'
# Verify inverses
>>> x = 'Thu, 26 Dec 2013 09:50:10 GMT'
>>> datetime_to_http_date(http_date_to_datetime(x)) == x
True
"""
timeval = calendar.timegm(the_datetime.utctimetuple())
return formatdate(timeval=timeval,
localtime=False,
usegmt=True)
def datetime_to_iso_date(the_datetime):
"""
>>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12))
'2013-12-26T10:11:12Z'
"""
return the_datetime.strftime(ISO_DT)
def datetime_to_timestamp(the_datetime):
"""
>>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
'20131226101112'
"""
return the_datetime.strftime(TIMESTAMP_14)
def timestamp_now():
"""
>>> len(timestamp_now())
14
"""
return datetime_to_timestamp(datetime.datetime.utcnow())
def timestamp20_now():
"""
Create 20-digit timestamp, useful for timestamping temp files
>>> n = timestamp20_now()
>>> timestamp20_now() >= n
True
>>> len(n)
20
"""
now = datetime.datetime.utcnow()
return now.strftime('%Y%m%d%H%M%S%f')
def iso_date_to_timestamp(string):
"""
>>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
'20131226101112'
>>> iso_date_to_timestamp('2013-12-26T10:11:12')
'20131226101112'
"""
return datetime_to_timestamp(iso_date_to_datetime(string))
def timestamp_to_iso_date(string):
"""
>>> timestamp_to_iso_date('20131226101112')
'2013-12-26T10:11:12Z'
"""
return datetime_to_iso_date(timestamp_to_datetime(string))
def http_date_to_timestamp(string):
"""
>>> http_date_to_timestamp('Thu, 26 Dec 2013 09:50:00 GMT')
'20131226095000'
>>> http_date_to_timestamp('Sun, 26 Jan 2014 20:08:04 GMT')
'20140126200804'
"""
return datetime_to_timestamp(http_date_to_datetime(string))
# pad to certain length (default 6)
def pad_timestamp(string, pad_str=PAD_6_UP):
"""
>>> pad_timestamp('20')
'209912'
>>> pad_timestamp('2014')
'201412'
>>> pad_timestamp('20141011')
'20141011'
>>> pad_timestamp('201410110010')
'201410110010'
"""
str_len = len(string)
pad_len = len(pad_str)
if str_len < pad_len:
string = string + pad_str[str_len:]
return string
def timestamp_to_datetime(string):
"""
# >14-digit -- rest ignored
>>> timestamp_to_datetime('2014122609501011')
datetime.datetime(2014, 12, 26, 9, 50, 10)
# 14-digit
>>> timestamp_to_datetime('20141226095010')
datetime.datetime(2014, 12, 26, 9, 50, 10)
# 13-digit padding
>>> timestamp_to_datetime('2014122609501')
datetime.datetime(2014, 12, 26, 9, 50, 59)
# 12-digit padding
>>> timestamp_to_datetime('201412260950')
datetime.datetime(2014, 12, 26, 9, 50, 59)
# 11-digit padding
>>> timestamp_to_datetime('20141226095')
datetime.datetime(2014, 12, 26, 9, 59, 59)
# 10-digit padding
>>> timestamp_to_datetime('2014122609')
datetime.datetime(2014, 12, 26, 9, 59, 59)
# 9-digit padding
>>> timestamp_to_datetime('201412260')
datetime.datetime(2014, 12, 26, 23, 59, 59)
# 8-digit padding
>>> timestamp_to_datetime('20141226')
datetime.datetime(2014, 12, 26, 23, 59, 59)
# 7-digit padding
>>> timestamp_to_datetime('2014122')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 6-digit padding
>>> timestamp_to_datetime('201410')
datetime.datetime(2014, 10, 31, 23, 59, 59)
# 5-digit padding
>>> timestamp_to_datetime('20141')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 4-digit padding
>>> timestamp_to_datetime('2014')
datetime.datetime(2014, 12, 31, 23, 59, 59)
# 3-digit padding
>>> timestamp_to_datetime('201')
datetime.datetime(2019, 12, 31, 23, 59, 59)
# 2-digit padding
>>> timestamp_to_datetime('20')
datetime.datetime(2099, 12, 31, 23, 59, 59)
# 1-digit padding
>>> timestamp_to_datetime('2')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# 1-digit out-of-range padding
>>> timestamp_to_datetime('3')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# 0-digit padding
>>> timestamp_to_datetime('')
datetime.datetime(2999, 12, 31, 23, 59, 59)
# bad month
>>> timestamp_to_datetime('20131709005601')
datetime.datetime(2013, 12, 9, 0, 56, 1)
# all out of range except minutes
>>> timestamp_to_datetime('40001965252477')
datetime.datetime(2999, 12, 31, 23, 24, 59)
# not a number!
>>> timestamp_to_datetime('2010abc')
datetime.datetime(2010, 12, 31, 23, 59, 59)
"""
# pad to 6 digits
string = pad_timestamp(string, PAD_6_UP)
def clamp(val, min_, max_):
try:
val = int(val)
val = max(min_, min(val, max_))
return val
except (ValueError, TypeError):
return max_
def extract(string, start, end, min_, max_):
if len(string) >= end:
return clamp(string[start:end], min_, max_)
else:
return max_
# now parse, clamp to boundary
year = extract(string, 0, 4, 1900, 2999)
month = extract(string, 4, 6, 1, 12)
day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
hour = extract(string, 8, 10, 0, 23)
minute = extract(string, 10, 12, 0, 59)
second = extract(string, 12, 14, 0, 59)
return datetime.datetime(year=year,
month=month,
day=day,
hour=hour,
minute=minute,
second=second)
#return time.strptime(pad_timestamp(string), TIMESTAMP_14)
def timestamp_to_sec(string):
"""
>>> timestamp_to_sec('20131226095010')
1388051410
# rounds to end of 2014
>>> timestamp_to_sec('2014')
1420070399
"""
return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
def sec_to_timestamp(secs):
"""
>>> sec_to_timestamp(1388051410)
'20131226095010'
>>> sec_to_timestamp(1420070399)
'20141231235959'
"""
return datetime_to_timestamp(datetime.datetime.utcfromtimestamp(secs))
def timestamp_to_http_date(string):
"""
>>> timestamp_to_http_date('20131226095000')
'Thu, 26 Dec 2013 09:50:00 GMT'
>>> timestamp_to_http_date('20140126200804')
'Sun, 26 Jan 2014 20:08:04 GMT'
"""
return datetime_to_http_date(timestamp_to_datetime(string))
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,12 +1,12 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.canonicalize import canonicalize
from pywb.utils.loaders import extract_post_query, append_post_query
from pywb.warc.recordloader import ArcWarcRecordLoader
from warcio.timeutils import iso_date_to_timestamp
from warcio.archiveiterator import ArchiveIterator
import hashlib
import base64
import six
import re
import sys
@ -17,224 +17,6 @@ except ImportError: # pragma: no cover
from ordereddict import OrderedDict
#=================================================================
class ArchiveIterator(object):
""" Iterate over records in WARC and ARC files, both gzip chunk
compressed and uncompressed
The indexer will automatically detect format, and decompress
if necessary.
"""
GZIP_ERR_MSG = """
ERROR: Non-chunked gzip file detected, gzip block continues
beyond single record.
This file is probably not a multi-chunk gzip but a single gzip file.
To allow seek, a gzipped {1} must have each record compressed into
a single gzip chunk and concatenated together.
This file is likely still valid and you can use it by decompressing it:
gunzip myfile.{0}.gz
You can then also use the 'warc2warc' tool from the 'warc-tools'
package which will create a properly chunked gzip file:
warc2warc -Z myfile.{0} > myfile.{0}.gz
"""
INC_RECORD = """\
WARNING: Record not followed by newline, perhaps Content-Length is invalid
Offset: {0}
Remainder: {1}
"""
def __init__(self, fileobj, no_record_parse=False,
verify_http=False, arc2warc=False):
self.fh = fileobj
self.loader = ArcWarcRecordLoader(verify_http=verify_http,
arc2warc=arc2warc)
self.reader = None
self.offset = 0
self.known_format = None
self.mixed_arc_warc = arc2warc
self.member_info = None
self.no_record_parse = no_record_parse
def __call__(self, block_size=16384):
""" iterate over each record
"""
decomp_type = 'gzip'
self.reader = DecompressingBufferedReader(self.fh,
block_size=block_size)
self.offset = self.fh.tell()
self.next_line = None
raise_invalid_gzip = False
empty_record = False
record = None
while True:
try:
curr_offset = self.fh.tell()
record = self._next_record(self.next_line)
if raise_invalid_gzip:
self._raise_invalid_gzip_err()
yield record
except EOFError:
empty_record = True
if record:
self.read_to_end(record)
if self.reader.decompressor:
# if another gzip member, continue
if self.reader.read_next_member():
continue
# if empty record, then we're done
elif empty_record:
break
# otherwise, probably a gzip
# containing multiple non-chunked records
# raise this as an error
else:
raise_invalid_gzip = True
# non-gzip, so we're done
elif empty_record:
break
def _raise_invalid_gzip_err(self):
""" A gzip file with multiple ARC/WARC records, non-chunked
has been detected. This is not valid for replay, so notify user
"""
frmt = 'warc/arc'
if self.known_format:
frmt = self.known_format
frmt_up = frmt.upper()
msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
raise Exception(msg)
def _consume_blanklines(self):
""" Consume blank lines that are between records
- For warcs, there are usually 2
- For arcs, may be 1 or 0
- For block gzipped files, these are at end of each gzip envelope
and are included in record length which is the full gzip envelope
- For uncompressed, they are between records and so are NOT part of
the record length
count empty_size so that it can be subtracted from
the record length for uncompressed
if the first line read is not blank, it is likely an error
in the WARC/ARC; display a warning
"""
empty_size = 0
first_line = True
while True:
line = self.reader.readline()
if len(line) == 0:
return None, empty_size
stripped = line.rstrip()
if len(stripped) == 0 or first_line:
empty_size += len(line)
if len(stripped) != 0:
# if first line is not blank,
# likely content-length was invalid, display warning
err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
sys.stderr.write(self.INC_RECORD.format(err_offset, line))
first_line = False
continue
return line, empty_size
def read_to_end(self, record, payload_callback=None):
""" Read remainder of the stream
If a digester is included, update it
with the data read
"""
# already at end of this record, don't read until it is consumed
if self.member_info:
return None
num = 0
curr_offset = self.offset
while True:
b = record.stream.read(8192)
if not b:
break
num += len(b)
if payload_callback:
payload_callback(b)
"""
- For compressed files, blank lines are consumed
since they are part of record length
- For uncompressed files, blank lines are read later,
and not included in the record length
"""
#if self.reader.decompressor:
self.next_line, empty_size = self._consume_blanklines()
self.offset = self.fh.tell() - self.reader.rem_length()
#if self.offset < 0:
# raise Exception('Not Gzipped Properly')
if self.next_line:
self.offset -= len(self.next_line)
length = self.offset - curr_offset
if not self.reader.decompressor:
length -= empty_size
self.member_info = (curr_offset, length)
#return self.member_info
#return next_line
def _next_record(self, next_line):
""" Use loader to parse the record from the reader stream
Supporting warc and arc records
"""
record = self.loader.parse_record_stream(self.reader,
next_line,
self.known_format,
self.no_record_parse)
self.member_info = None
# Track known format for faster parsing of other records
if not self.mixed_arc_warc:
self.known_format = record.format
return record
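# A minimal iteration sketch for the class above (the WARC filename is
# hypothetical); note that the instance is *called* to obtain the record
# generator:
#
# with open('example.warc.gz', 'rb') as fh:
#     for record in ArchiveIterator(fh)():
#         print(record.rec_type, record.rec_headers.get_header('WARC-Target-URI'))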
#=================================================================
class ArchiveIndexEntryMixin(object):
MIME_RE = re.compile('[; ]')
@ -289,7 +71,7 @@ class ArchiveIndexEntryMixin(object):
self['urlkey'] = canonicalize(url, surt_ordered)
other['urlkey'] = self['urlkey']
referer = other.record.status_headers.get_header('referer')
referer = other.record.http_headers.get_header('referer')
if referer:
self['_referer'] = referer
@ -349,7 +131,6 @@ class DefaultRecordParser(object):
def create_record_iter(self, raw_iter):
append_post = self.options.get('append_post')
include_all = self.options.get('include_all')
block_size = self.options.get('block_size', 16384)
surt_ordered = self.options.get('surt_ordered', True)
minimal = self.options.get('minimal')
@ -357,10 +138,10 @@ class DefaultRecordParser(object):
raise Exception('Sorry, minimal index option and ' +
'append POST options can not be used together')
for record in raw_iter(block_size):
for record in raw_iter:
entry = None
if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
if not include_all and not minimal and (record.http_headers.get_statuscode() == '-'):
continue
if record.rec_type == 'arc_header':
@ -394,13 +175,13 @@ class DefaultRecordParser(object):
compute_digest = True
elif not minimal and record.rec_type == 'request' and append_post:
method = record.status_headers.protocol
len_ = record.status_headers.get_header('Content-Length')
method = record.http_headers.protocol
len_ = record.http_headers.get_header('Content-Length')
post_query = extract_post_query(method,
entry.get('_content_type'),
len_,
record.stream)
record.raw_stream)
entry['_post_query'] = post_query
@ -455,7 +236,7 @@ class DefaultRecordParser(object):
if record.rec_type == 'warcinfo':
entry['url'] = record.rec_headers.get_header('WARC-Filename')
entry['urlkey'] = entry['url']
entry['_warcinfo'] = record.stream.read(record.length)
entry['_warcinfo'] = record.raw_stream.read(record.length)
return entry
entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')
@ -471,13 +252,13 @@ class DefaultRecordParser(object):
entry['mime'] = '-'
else:
def_mime = '-' if record.rec_type == 'request' else 'unk'
entry.extract_mime(record.status_headers.
entry.extract_mime(record.http_headers.
get_header('Content-Type'),
def_mime)
# status -- only for response records (by convention):
if record.rec_type == 'response' and not self.options.get('minimal'):
entry.extract_status(record.status_headers)
entry.extract_status(record.http_headers)
else:
entry['status'] = '-'
@ -523,7 +304,7 @@ class DefaultRecordParser(object):
entry.extract_mime(record.rec_headers.get_header('content-type'))
# status
entry.extract_status(record.status_headers)
entry.extract_status(record.http_headers)
# digest
entry['digest'] = '-'

View File

@ -0,0 +1,33 @@
from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from pywb.utils.loaders import BlockLoader
#=================================================================
class BlockArcWarcRecordLoader(ArcWarcRecordLoader):
def __init__(self, loader=None, cookie_maker=None, block_size=8192, *args, **kwargs):
if not loader:
loader = BlockLoader(cookie_maker=cookie_maker)
self.loader = loader
self.block_size = block_size
super(BlockArcWarcRecordLoader, self).__init__(*args, **kwargs)
def load(self, url, offset, length, no_record_parse=False):
""" Load a single record from given url at offset with length
and parse as either warc or arc record
"""
try:
length = int(length)
except (ValueError, TypeError):
length = -1
stream = self.loader.load(url, int(offset), length)
decomp_type = 'gzip'
# Create decompressing stream
stream = DecompressingBufferedReader(stream=stream,
decomp_type=decomp_type,
block_size=self.block_size)
return self.parse_record_stream(stream, no_record_parse=no_record_parse)
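# A short usage sketch (filename, offset and length are hypothetical,
# as a CDX index line would supply them):
#
# loader = BlockArcWarcRecordLoader()
# record = loader.load('example.warc.gz', offset=334, length=1046)
# print(record.rec_type, record.length)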

View File

@ -31,7 +31,7 @@ from bisect import insort
from six import StringIO
from pywb.warc.archiveiterator import DefaultRecordParser
from pywb.warc.archiveindexer import DefaultRecordParser
import codecs
import six

View File

@ -1,328 +0,0 @@
import collections
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import BlockLoader, LimitReader
from pywb.utils.loaders import to_native_str
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.wbexception import WbException
from pywb.utils.timeutils import timestamp_to_iso_date
from six.moves import zip
import six
#=================================================================
#ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
# 'format, rec_type, rec_headers, ' +
# 'stream, status_headers, ' +
# 'content_type, length')
#=================================================================
class ArcWarcRecord(object):
def __init__(self, *args):
(self.format, self.rec_type, self.rec_headers, self.stream,
self.status_headers, self.content_type, self.length) = args
#=================================================================
class ArchiveLoadFailed(WbException):
def __init__(self, reason, filename=''):
if filename:
msg = filename + ': ' + str(reason)
else:
msg = str(reason)
super(ArchiveLoadFailed, self).__init__(msg)
def status(self):
return '503 Service Unavailable'
#=================================================================
class ArcWarcRecordLoader(object):
WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']
HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']
HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
'OPTIONS', 'CONNECT', 'PATCH']
NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')
NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
HTTP_SCHEMES = ('http:', 'https:')
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
verify_http=True, arc2warc=True):
if not loader:
loader = BlockLoader(cookie_maker=cookie_maker)
self.loader = loader
self.block_size = block_size
if arc2warc:
self.arc_parser = ARC2WARCHeadersParser()
else:
self.arc_parser = ARCHeadersParser()
self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
def load(self, url, offset, length, no_record_parse=False):
""" Load a single record from given url at offset with length
and parse as either warc or arc record
"""
try:
length = int(length)
except (ValueError, TypeError):
length = -1
stream = self.loader.load(url, int(offset), length)
decomp_type = 'gzip'
# Create decompressing stream
stream = DecompressingBufferedReader(stream=stream,
decomp_type=decomp_type,
block_size=self.block_size)
return self.parse_record_stream(stream, no_record_parse=no_record_parse)
def parse_record_stream(self, stream,
statusline=None,
known_format=None,
no_record_parse=False):
""" Parse file-like stream and return an ArcWarcRecord
encapsulating the record headers, http headers (if any),
and a stream limited to the remainder of the record.
Pass statusline and known_format to _detect_type_load_headers()
to facilitate parsing.
"""
the_format, rec_headers = self._detect_type_load_headers(stream, statusline, known_format)
if the_format == 'arc':
uri = rec_headers.get_header('uri')
length = rec_headers.get_header('length')
content_type = rec_headers.get_header('content-type')
sub_len = rec_headers.total_len
if uri and uri.startswith('filedesc://'):
rec_type = 'arc_header'
else:
rec_type = 'response'
elif the_format in ('warc', 'arc2warc'):
rec_type = rec_headers.get_header('WARC-Type')
uri = rec_headers.get_header('WARC-Target-URI')
length = rec_headers.get_header('Content-Length')
content_type = rec_headers.get_header('Content-Type')
if the_format == 'warc':
sub_len = 0
else:
sub_len = rec_headers.total_len
the_format = 'warc'
is_err = False
try:
if length is not None:
length = int(length) - sub_len
if length < 0:
is_err = True
except (ValueError, TypeError):
is_err = True
# err condition
if is_err:
length = 0
# or status and headers are completely empty (blank lines found)
elif not rec_headers:
length = 0
# limit stream to the length for all valid records
if length is not None and length >= 0:
stream = LimitReader.wrap_stream(stream, length)
# don't parse the http record at all
if no_record_parse:
status_headers = None
# if empty record (error or otherwise) set status to 204
elif length == 0:
if is_err:
msg = '204 Possible Error'
else:
msg = '204 No Content'
status_headers = StatusAndHeaders(msg, [])
# response record or non-empty revisit: parse HTTP status and headers!
elif (rec_type in ('response', 'revisit')
and uri.startswith(self.HTTP_SCHEMES)):
status_headers = self.http_parser.parse(stream)
# request record: parse request
elif ((rec_type == 'request')
and uri.startswith(self.HTTP_SCHEMES)):
status_headers = self.http_req_parser.parse(stream)
# everything else: create a no-status entry, set content-type
else:
content_type_header = [('Content-Type', content_type)]
if length is not None and length >= 0:
content_type_header.append(('Content-Length', str(length)))
status_headers = StatusAndHeaders('200 OK', content_type_header)
return ArcWarcRecord(the_format, rec_type,
rec_headers, stream, status_headers,
content_type, length)
def _detect_type_load_headers(self, stream,
statusline=None, known_format=None):
""" If known_format is specified ('warc' or 'arc'),
parse only as that format.
Otherwise, try parsing record as WARC, then try parsing as ARC.
If neither one succeeds, we're out of luck.
"""
if known_format != 'arc':
# try as warc first
try:
rec_headers = self.warc_parser.parse(stream, statusline)
return 'warc', rec_headers
except StatusAndHeadersParserException as se:
if known_format == 'warc':
msg = 'Invalid WARC record, first line: '
raise ArchiveLoadFailed(msg + str(se.statusline))
statusline = se.statusline
pass
# now try as arc
try:
rec_headers = self.arc_parser.parse(stream, statusline)
return self.arc_parser.get_rec_type(), rec_headers
except StatusAndHeadersParserException as se:
if known_format == 'arc':
msg = 'Invalid ARC record, first line: '
else:
msg = 'Unknown archive format, first line: '
raise ArchiveLoadFailed(msg + str(se.statusline))
#=================================================================
class ARCHeadersParser(object):
# ARC 1.0 headers
ARC_HEADERS = ["uri", "ip-address", "archive-date",
"content-type", "length"]
def __init__(self):
self.headernames = self.get_header_names()
def get_rec_type(self):
return 'arc'
def parse(self, stream, headerline=None):
total_read = 0
def readline():
return to_native_str(stream.readline())
# if headerline passed in, use that
if headerline is None:
headerline = readline()
else:
headerline = to_native_str(headerline)
header_len = len(headerline)
if header_len == 0:
raise EOFError()
headerline = headerline.rstrip()
headernames = self.headernames
# if arc header, consume next two lines
if headerline.startswith('filedesc://'):
version = readline() # skip version
spec = readline() # skip header spec, use preset one
total_read += len(version)
total_read += len(spec)
parts = headerline.split(' ')
if len(parts) != len(headernames):
msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
msg = msg.format(headernames, parts)
raise StatusAndHeadersParserException(msg, parts)
protocol, headers = self._get_protocol_and_headers(headerline, parts)
return StatusAndHeaders(statusline='',
headers=headers,
protocol='WARC/1.0',
total_len=total_read)
@classmethod
def get_header_names(cls):
return cls.ARC_HEADERS
def _get_protocol_and_headers(self, headerline, parts):
headers = []
for name, value in zip(self.headernames, parts):
headers.append((name, value))
return ('ARC/1.0', headers)
#=================================================================
class ARC2WARCHeadersParser(ARCHeadersParser):
# Headers for converting ARC -> WARC Header
ARC_TO_WARC_HEADERS = ["WARC-Target-URI",
"WARC-IP-Address",
"WARC-Date",
"Content-Type",
"Content-Length"]
def get_rec_type(self):
return 'arc2warc'
@classmethod
def get_header_names(cls):
return cls.ARC_TO_WARC_HEADERS
def _get_protocol_and_headers(self, headerline, parts):
headers = []
for name, value in zip(self.headernames, parts):
if name == 'WARC-Date':
value = timestamp_to_iso_date(value)
headers.append((name, value))
if headerline.startswith('filedesc://'):
rec_type = 'arc_header'
else:
rec_type = 'response'
headers.append(('WARC-Type', rec_type))
headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id()))
return ('WARC/1.0', headers)
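# A minimal parsing sketch (the record bytes are hypothetical): parse one
# empty warcinfo record from an in-memory stream.
#
# from io import BytesIO
# raw = b'WARC/1.0\r\nWARC-Type: warcinfo\r\nContent-Length: 0\r\n\r\n\r\n\r\n'
# record = ArcWarcRecordLoader().parse_record_stream(BytesIO(raw))
# print(record.rec_type)   # 'warcinfo'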

View File

@ -1,5 +1,7 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from warcio.recordloader import ArchiveLoadFailed
from warcio.timeutils import iso_date_to_timestamp
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.utils.wbexception import NotFoundException
import six
@ -9,7 +11,7 @@ import six
class ResolvingLoader(object):
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
def __init__(self, path_resolvers, record_loader=ArcWarcRecordLoader(), no_record_parse=False):
def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False):
self.path_resolvers = path_resolvers
self.record_loader = record_loader
self.no_record_parse = no_record_parse
@ -26,20 +28,20 @@ class ResolvingLoader(object):
elif headers_record != payload_record:
# close remainder of stream as this record only used for
# (already parsed) headers
headers_record.stream.close()
headers_record.raw_stream.close()
# special case: check if headers record is actually empty
# (eg empty revisit), then use headers from revisit
if not headers_record.status_headers.headers:
if not headers_record.http_headers.headers:
headers_record = payload_record
if not headers_record or not payload_record:
raise ArchiveLoadFailed('Could not load ' + str(cdx))
# ensure status line is valid from here
headers_record.status_headers.validate_statusline('204 No Content')
headers_record.http_headers.validate_statusline('204 No Content')
return (headers_record.status_headers, payload_record.stream)
return (headers_record.http_headers, payload_record.raw_stream)
def load_headers_and_payload(self, cdx, failed_files, cdx_loader):
"""
@ -106,7 +108,7 @@ class ResolvingLoader(object):
# optimization: if same file already failed this request,
# don't try again
if failed_files is not None and filename in failed_files:
raise ArchiveLoadFailed('Skipping Already Failed', filename)
raise ArchiveLoadFailed('Skipping Already Failed: ' + filename)
any_found = False
last_exc = None
@ -143,7 +145,7 @@ class ResolvingLoader(object):
msg = 'Archive File Not Found'
#raise ArchiveLoadFailed(msg, filename), None, last_traceback
six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(msg, filename), last_traceback)
six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(filename + ': ' + msg), last_traceback)
def _load_different_url_payload(self, cdx, headers_record,
failed_files, cdx_loader):

View File

@ -370,6 +370,30 @@ def test_cdxj_empty():
assert buff.getvalue() == b''
def test_cdxj_middle_empty_records():
empty_gzip_record = b'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'
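# (The 20 bytes above are one complete, empty gzip member: the 10-byte
# gzip header, a deflate block encoding zero bytes, and the CRC32/ISIZE
# trailer.)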
new_warc = BytesIO()
with open(TEST_WARC_DIR + 'example2.warc.gz', 'rb') as fh:
new_warc.write(empty_gzip_record)
new_warc.write(fh.read())
new_warc.write(empty_gzip_record)
new_warc.write(empty_gzip_record)
fh.seek(0)
new_warc.write(fh.read())
options = dict(cdxj=True)
buff = BytesIO()
new_warc.seek(0)
write_cdx_index(buff, new_warc, 'empty.warc.gz', **options)
lines = buff.getvalue().rstrip().split(b'\n')
assert len(lines) == 2, lines
if __name__ == "__main__":
import doctest

View File

@ -292,12 +292,13 @@ import sys
import pprint
import six
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from warcio.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper
from pywb.cdx.cdxobject import CDXObject
import pywb.utils.statusandheaders
import warcio.statusandheaders
from pywb import get_test_dir
@ -326,14 +327,16 @@ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
def load_test_archive(test_file, offset, length):
path = test_warc_dir + test_file
testloader = ArcWarcRecordLoader()
testloader = BlockArcWarcRecordLoader()
archive = testloader.load(path, offset, length)
pywb.utils.statusandheaders.WRAP_WIDTH = 160
warcio.statusandheaders.WRAP_WIDTH = 160
pprint.pprint(((archive.format, archive.rec_type),
archive.rec_headers, archive.status_headers), indent=1, width=160)
archive.rec_headers, archive.http_headers), indent=1, width=160)
warcio.statusandheaders.WRAP_WIDTH = 80
#==============================================================================

View File

@ -5,7 +5,8 @@ import json
import time
import os
from pywb.utils.timeutils import timestamp_now
from warcio.timeutils import timestamp_now
from pywb.cdx.cdxops import process_cdx
from pywb.cdx.query import CDXQuery

View File

@ -107,7 +107,11 @@ class ResAggApp(object):
if self.debug:
traceback.print_exc()
status = last_exc.status()
if not hasattr(last_exc, 'status'):
status = '503 Archive Not Available'
else:
status = last_exc.status()
message = last_exc.msg
res = {'message': message}

View File

@ -2,6 +2,7 @@ from pywb.webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoad
from pywb.webagg.utils import MementoUtils
from pywb.utils.wbexception import BadRequestException, WbException
from pywb.utils.wbexception import NotFoundException
from warcio.recordloader import ArchiveLoadFailed
from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
@ -151,7 +152,7 @@ class ResourceHandler(IndexHandler):
out_headers, resp = loader(cdx, params)
if resp is not None:
return out_headers, resp, errs
except WbException as e:
except (WbException, ArchiveLoadFailed) as e:
last_exc = e
errs[str(loader)] = str(e)

View File

@ -1,6 +1,6 @@
from pywb.utils.binsearch import iter_range
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
from pywb.utils.timeutils import timestamp_now
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
from warcio.timeutils import timestamp_now
from pywb.utils.canonicalize import canonicalize
from pywb.utils.wbexception import NotFoundException

View File

@ -1,6 +1,7 @@
from warcio.limitreader import LimitReader
from warcio.statusandheaders import StatusAndHeadersParser
from pywb.utils.loaders import extract_post_query, append_post_query
from pywb.utils.loaders import LimitReader
from pywb.utils.statusandheaders import StatusAndHeadersParser
from six.moves.urllib.parse import urlsplit, quote
from six import iteritems, StringIO

View File

@ -3,7 +3,7 @@ from pywb.utils.wbexception import NotFoundException
from pywb.webagg.indexsource import BaseIndexSource, RemoteIndexSource
from pywb.webagg.responseloader import LiveWebLoader
from pywb.webagg.utils import ParamFormatter, res_template
from pywb.utils.timeutils import timestamp_now
from warcio.timeutils import timestamp_now
#=============================================================================

View File

@ -2,12 +2,13 @@ from pywb.webagg.utils import MementoUtils, StreamIter, compress_gzip_iter
from pywb.webagg.utils import ParamFormatter
from pywb.webagg.indexsource import RedisIndexSource
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date
from warcio.timeutils import timestamp_to_datetime, datetime_to_timestamp
from warcio.timeutils import iso_date_to_datetime, datetime_to_iso_date
from warcio.timeutils import http_date_to_datetime, datetime_to_http_date
from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from pywb.utils.wbexception import LiveResourceException, WbException
from pywb.utils.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from pywb.warc.resolvingloader import ResolvingLoader
@ -243,11 +244,11 @@ class WARCPathLoader(BaseLoader):
status = cdx.get('status')
if not status or status.startswith('3'):
status_headers = self.headers_parser.parse(payload.stream)
http_headers = self.headers_parser.parse(payload.raw_stream)
self.raise_on_self_redirect(params, cdx,
status_headers.get_statuscode(),
status_headers.get_header('Location'))
http_headers_buff = status_headers.to_bytes()
http_headers.get_statuscode(),
http_headers.get_header('Location'))
http_headers_buff = http_headers.to_bytes()
else:
http_headers_buff = None
@ -266,9 +267,9 @@ class WARCPathLoader(BaseLoader):
warc_headers.replace_header('WARC-Date',
headers.rec_headers.get_header('WARC-Date'))
headers.stream.close()
headers.raw_stream.close()
return (warc_headers, http_headers_buff, payload.stream)
return (warc_headers, http_headers_buff, payload.raw_stream)
def __str__(self):
return 'WARCPathLoader'

View File

@ -13,10 +13,10 @@ from pywb.webagg.aggregator import DirectoryIndexSource
from pywb.webagg.app import ResAggApp
from pywb.webagg.utils import MementoUtils
from pywb.warc.recordloader import ArcWarcRecordLoader
from warcio.recordloader import ArcWarcRecordLoader
from warcio.statusandheaders import StatusAndHeadersParser
from warcio.bufferedreaders import ChunkedDataReader
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader
from io import BytesIO
from six.moves.urllib.parse import urlencode
@ -215,9 +215,9 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False)
print(record.status_headers)
assert record.status_headers.get_statuscode() == '302'
assert record.status_headers.get_header('Location') == 'https://www.iana.org/'
print(record.http_headers)
assert record.http_headers.get_statuscode() == '302'
assert record.http_headers.get_header('Location') == 'https://www.iana.org/'
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
def test_agg_select_live(self):

View File

@ -3,7 +3,7 @@ from pywb.webagg.indexsource import LiveIndexSource
from pywb.webagg.aggregator import SimpleAggregator
from pywb.utils.timeutils import timestamp_now
from warcio.timeutils import timestamp_now
from .testutils import key_ts_res, TEST_CDX_PATH

View File

@ -8,7 +8,7 @@ from pywb.webagg.handlers import DefaultResourceHandler
from pywb.webagg.aggregator import SimpleAggregator
from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
from pywb.warc.recordloader import ArcWarcRecordLoader
from warcio.recordloader import ArcWarcRecordLoader
from .testutils import LiveServerTests, BaseTestClass
@ -48,7 +48,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
assert record.status_headers.get_header('Date') != ''
assert record.http_headers.get_header('Date') != ''
def test_upstream_1(self):
resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get')
@ -58,7 +58,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
assert record.status_headers.get_header('Date') != ''
assert record.http_headers.get_header('Date') != ''
def test_upstream_2(self):
resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get')
@ -68,7 +68,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
assert record.status_headers.get_header('Date') != ''
assert record.http_headers.get_header('Date') != ''

View File

@ -7,7 +7,9 @@ import zlib
from contextlib import closing
from pywb.utils.timeutils import timestamp_to_http_date
from warcio.timeutils import timestamp_to_http_date
from warcio.utils import BUFF_SIZE
from pywb.utils.wbexception import BadRequestException
from pywb.utils.loaders import load_yaml_config
@ -20,9 +22,6 @@ LINK_SEG_SPLIT = re.compile(';\s*')
LINK_URL = re.compile('<(.*)>')
LINK_PROP = re.compile('([\w]+)="([^"]+)')
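# e.g. a Link header entry these regexes pick apart (hypothetical value):
#   <http://example.com/>; rel="original"; datetime="Thu, 26 Dec 2013 09:50:00 GMT"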
BUFF_SIZE = 16384
#=============================================================================
class MementoException(BadRequestException):
pass

View File

@ -15,7 +15,7 @@ from pywb.cdx.cdxobject import IDXObject, CDXException, CDXObject
from pywb.cdx.query import CDXQuery
from pywb.utils.loaders import BlockLoader, read_last_line
from pywb.utils.bufferedreaders import gzip_decompressor
from warcio.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch, search

View File

@ -5,21 +5,22 @@ import logging
from datetime import datetime
from warcio.statusandheaders import StatusAndHeaders
from warcio.timeutils import datetime_to_timestamp
from pywb.utils.wbexception import NotFoundException
from pywb.utils.loaders import LocalFileLoader
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper
from pywb.webapp.views import J2TemplateView, init_view
from pywb.webapp.replay_views import ReplayView
from pywb.framework.memento import MementoResponse
from pywb.utils.timeutils import datetime_to_timestamp
#=================================================================
@ -134,7 +135,7 @@ class WBHandler(SearchPageWbUrlHandler):
def _init_replay_view(self, config):
cookie_maker = config.get('cookie_maker')
record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)
record_loader = BlockArcWarcRecordLoader(cookie_maker=cookie_maker)
paths = config.get('archive_paths')

View File

@ -1,5 +1,6 @@
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.loaders import LimitReader
from warcio.statusandheaders import StatusAndHeaders
from warcio.limitreader import LimitReader
from pywb.framework.cache import create_cache
from tempfile import NamedTemporaryFile, mkdtemp

View File

@ -5,16 +5,17 @@ from io import BytesIO
from six.moves.urllib.parse import urlsplit
from itertools import chain
from pywb.utils.statusandheaders import StatusAndHeaders
from warcio.statusandheaders import StatusAndHeaders
from warcio.limitreader import LimitReader
from warcio.timeutils import timestamp_now
from warcio.recordloader import ArchiveLoadFailed
from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import LimitReader
from pywb.utils.timeutils import timestamp_now
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.webapp.views import HeadInsertView

View File

@ -1,4 +1,4 @@
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import make_timemap, LINK_FORMAT

View File

@ -71,6 +71,7 @@ setup(
],
install_requires=[
'six',
'warcio',
'chardet',
'requests',
'redis',