mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
spin-off warcio!
update imports to point to warcio warcio rename fixes: - ArcWarcRecord.stream -> raw_stream - ArcWarcRecord.status_headers -> http_headers - ArchiveLoadFailed single param init
This commit is contained in:
parent
4a94699a65
commit
0784e4e5aa
@ -3,8 +3,9 @@ from pywb.cdx.cdxobject import TIMESTAMP, STATUSCODE, MIMETYPE, DIGEST
|
|||||||
from pywb.cdx.cdxobject import OFFSET, LENGTH, FILENAME
|
from pywb.cdx.cdxobject import OFFSET, LENGTH, FILENAME
|
||||||
|
|
||||||
from pywb.cdx.query import CDXQuery
|
from pywb.cdx.query import CDXQuery
|
||||||
from pywb.utils.timeutils import timestamp_to_sec, pad_timestamp
|
|
||||||
from pywb.utils.timeutils import PAD_14_DOWN, PAD_14_UP
|
from warcio.timeutils import timestamp_to_sec, pad_timestamp
|
||||||
|
from warcio.timeutils import PAD_14_DOWN, PAD_14_UP
|
||||||
|
|
||||||
import bisect
|
import bisect
|
||||||
|
|
||||||
|
@ -14,7 +14,7 @@ com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ
|
|||||||
from fakeredis import FakeStrictRedis
|
from fakeredis import FakeStrictRedis
|
||||||
from mock import patch
|
from mock import patch
|
||||||
|
|
||||||
from pywb.utils.timeutils import timestamp_to_sec
|
from warcio.timeutils import timestamp_to_sec
|
||||||
from pywb.cdx.cdxsource import RedisCDXSource
|
from pywb.cdx.cdxsource import RedisCDXSource
|
||||||
from pywb.cdx.cdxserver import CDXServer
|
from pywb.cdx.cdxserver import CDXServer
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@ from pywb.cdx.cdxsource import CDXSource
|
|||||||
from pywb.cdx.cdxobject import IDXObject, CDXException
|
from pywb.cdx.cdxobject import IDXObject, CDXException
|
||||||
|
|
||||||
from pywb.utils.loaders import BlockLoader, read_last_line
|
from pywb.utils.loaders import BlockLoader, read_last_line
|
||||||
from pywb.utils.bufferedreaders import gzip_decompressor
|
from warcio.bufferedreaders import gzip_decompressor
|
||||||
from pywb.utils.binsearch import iter_range, linearsearch, search
|
from pywb.utils.binsearch import iter_range, linearsearch, search
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from pywb.utils.wbexception import BadRequestException
|
from pywb.utils.wbexception import BadRequestException
|
||||||
from pywb.utils.timeutils import http_date_to_timestamp
|
from warcio.timeutils import http_date_to_timestamp
|
||||||
from pywb.utils.timeutils import timestamp_to_http_date
|
from warcio.timeutils import timestamp_to_http_date
|
||||||
|
|
||||||
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
|
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
|
||||||
from pywb.rewrite.wburl import WbUrl
|
from pywb.rewrite.wburl import WbUrl
|
||||||
|
@ -4,6 +4,7 @@ from pywb.framework.wbrequestresponse import WbResponse, WbRequest
|
|||||||
from pywb.framework.archivalrouter import ArchivalRouter
|
from pywb.framework.archivalrouter import ArchivalRouter
|
||||||
|
|
||||||
from six.moves.urllib.parse import urlsplit
|
from six.moves.urllib.parse import urlsplit
|
||||||
|
from six import iteritems
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
import socket
|
import socket
|
||||||
@ -15,8 +16,8 @@ from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter, UrlRewriter
|
|||||||
from pywb.rewrite.rewrite_content import RewriteContent
|
from pywb.rewrite.rewrite_content import RewriteContent
|
||||||
from pywb.utils.wbexception import BadRequestException
|
from pywb.utils.wbexception import BadRequestException
|
||||||
|
|
||||||
from pywb.utils.bufferedreaders import BufferedReader
|
from warcio.bufferedreaders import BufferedReader
|
||||||
from pywb.utils.loaders import to_native_str
|
from warcio.utils import to_native_str
|
||||||
|
|
||||||
from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver
|
from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver
|
||||||
|
|
||||||
@ -250,7 +251,8 @@ class ProxyRouter(object):
|
|||||||
|
|
||||||
# add extra headers for replay responses
|
# add extra headers for replay responses
|
||||||
if wbrequest.wb_url and wbrequest.wb_url.is_replay():
|
if wbrequest.wb_url and wbrequest.wb_url.is_replay():
|
||||||
response.status_headers.replace_headers(self.extra_headers)
|
for name, value in iteritems(self.extra_headers):
|
||||||
|
response.status_headers.replace_header(name, value)
|
||||||
|
|
||||||
# check for content-length
|
# check for content-length
|
||||||
res = response.status_headers.get_header('content-length')
|
res = response.status_headers.get_header('content-length')
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
from pywb.framework.wbrequestresponse import WbResponse
|
from pywb.framework.wbrequestresponse import WbResponse
|
||||||
from pywb.utils.loaders import extract_client_cookie
|
from pywb.utils.loaders import extract_client_cookie
|
||||||
from pywb.utils.wbexception import WbException
|
from pywb.utils.wbexception import WbException
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
|
||||||
from pywb.rewrite.wburl import WbUrl
|
from pywb.rewrite.wburl import WbUrl
|
||||||
|
|
||||||
from pywb.framework.cache import create_cache
|
from pywb.framework.cache import create_cache
|
||||||
@ -10,7 +9,8 @@ from pywb.framework.basehandlers import WbUrlHandler
|
|||||||
from six.moves.urllib.parse import parse_qs, urlsplit
|
from six.moves.urllib.parse import parse_qs, urlsplit
|
||||||
import six
|
import six
|
||||||
|
|
||||||
from pywb.utils.loaders import to_native_str
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
|
from warcio.utils import to_native_str
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
import os
|
import os
|
||||||
|
@ -61,7 +61,7 @@
|
|||||||
|
|
||||||
from pywb.rewrite.wburl import WbUrl
|
from pywb.rewrite.wburl import WbUrl
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
|
|
||||||
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
|
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
from pywb.utils.loaders import extract_post_query, append_post_query
|
from pywb.utils.loaders import extract_post_query, append_post_query
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
@ -1,7 +1,10 @@
|
|||||||
from pywb.utils.wbexception import WbException, NotFoundException
|
from pywb.utils.wbexception import WbException, NotFoundException
|
||||||
from pywb.utils.loaders import load_yaml_config, to_native_str
|
from pywb.utils.loaders import load_yaml_config
|
||||||
|
from pywb.utils.loaders import load_yaml_config
|
||||||
|
from warcio.utils import to_native_str
|
||||||
|
|
||||||
from pywb.framework.wbrequestresponse import WbResponse, StatusAndHeaders
|
from pywb.framework.wbrequestresponse import WbResponse
|
||||||
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
@ -13,7 +13,7 @@ from pkg_resources import resource_string
|
|||||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||||
|
|
||||||
from pywb.utils.loaders import load_yaml_config
|
from pywb.utils.loaders import load_yaml_config
|
||||||
from pywb.utils.timeutils import timestamp20_now
|
from warcio.timeutils import timestamp20_now
|
||||||
|
|
||||||
from pywb import DEFAULT_CONFIG
|
from pywb import DEFAULT_CONFIG
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_iso_date
|
from warcio.timeutils import timestamp_to_datetime, datetime_to_iso_date
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
@ -7,12 +7,11 @@ import traceback
|
|||||||
|
|
||||||
import portalocker
|
import portalocker
|
||||||
|
|
||||||
from pywb.utils.timeutils import timestamp20_now
|
from warcio.timeutils import timestamp20_now
|
||||||
|
from warcio.warcwriter import BaseWARCWriter
|
||||||
|
|
||||||
from pywb.webagg.utils import res_template
|
from pywb.webagg.utils import res_template
|
||||||
|
|
||||||
from pywb.warc.warcwriter import BaseWARCWriter
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class MultiFileWARCWriter(BaseWARCWriter):
|
class MultiFileWARCWriter(BaseWARCWriter):
|
||||||
@ -22,6 +21,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
max_idle_secs=1800, *args, **kwargs):
|
max_idle_secs=1800, *args, **kwargs):
|
||||||
super(MultiFileWARCWriter, self).__init__(*args, **kwargs)
|
super(MultiFileWARCWriter, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
self.header_filter = kwargs.get('header_filter')
|
||||||
|
|
||||||
if not filename_template:
|
if not filename_template:
|
||||||
dir_template, filename_template = os.path.split(dir_template)
|
dir_template, filename_template = os.path.split(dir_template)
|
||||||
dir_template += os.path.sep
|
dir_template += os.path.sep
|
||||||
@ -41,27 +42,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
|
|
||||||
self.fh_cache = {}
|
self.fh_cache = {}
|
||||||
|
|
||||||
def write_req_resp(self, req, resp, params):
|
|
||||||
url = resp.rec_headers.get_header('WARC-Target-URI')
|
|
||||||
dt = resp.rec_headers.get_header('WARC-Date')
|
|
||||||
|
|
||||||
#req.rec_headers['Content-Type'] = req.content_type
|
|
||||||
req.rec_headers.replace_header('WARC-Target-URI', url)
|
|
||||||
req.rec_headers.replace_header('WARC-Date', dt)
|
|
||||||
|
|
||||||
resp_id = resp.rec_headers.get_header('WARC-Record-ID')
|
|
||||||
if resp_id:
|
|
||||||
req.rec_headers.add_header('WARC-Concurrent-To', resp_id)
|
|
||||||
|
|
||||||
resp = self._check_revisit(resp, params)
|
|
||||||
if not resp:
|
|
||||||
print('Skipping due to dedup')
|
|
||||||
return
|
|
||||||
|
|
||||||
self._do_write_req_resp(req, resp, params)
|
|
||||||
|
|
||||||
def _check_revisit(self, record, params):
|
def _check_revisit(self, record, params):
|
||||||
if not self.dedup_index:
|
if not self.dedup_index or record.rec_type != 'response':
|
||||||
return record
|
return record
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -77,14 +59,18 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
if isinstance(result, tuple) and result[0] == 'revisit':
|
if isinstance(result, tuple) and result[0] == 'revisit':
|
||||||
record.rec_headers.replace_header('WARC-Type', 'revisit')
|
record = self.create_revisit_record(url, digest, result[1], result[2],
|
||||||
record.rec_headers.add_header('WARC-Profile', self.REVISIT_PROFILE)
|
http_headers=record.http_headers)
|
||||||
|
|
||||||
record.rec_headers.add_header('WARC-Refers-To-Target-URI', result[1])
|
|
||||||
record.rec_headers.add_header('WARC-Refers-To-Date', result[2])
|
|
||||||
|
|
||||||
return record
|
return record
|
||||||
|
|
||||||
|
def _set_header_buff(self, record):
|
||||||
|
exclude_list = None
|
||||||
|
if self.header_filter:
|
||||||
|
exclude_list = self.header_filter(record)
|
||||||
|
buff = record.http_headers.to_bytes(exclude_list)
|
||||||
|
record.http_headers.headers_buff = buff
|
||||||
|
|
||||||
def get_new_filename(self, dir_, params):
|
def get_new_filename(self, dir_, params):
|
||||||
timestamp = timestamp20_now()
|
timestamp = timestamp20_now()
|
||||||
|
|
||||||
@ -153,6 +139,11 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
self._do_write_req_resp(None, record, params)
|
self._do_write_req_resp(None, record, params)
|
||||||
|
|
||||||
def _do_write_req_resp(self, req, resp, params):
|
def _do_write_req_resp(self, req, resp, params):
|
||||||
|
resp = self._check_revisit(resp, params)
|
||||||
|
if not resp:
|
||||||
|
print('Skipping due to dedup')
|
||||||
|
return
|
||||||
|
|
||||||
def write_callback(out, filename):
|
def write_callback(out, filename):
|
||||||
#url = resp.rec_headers.get_header('WARC-Target-URI')
|
#url = resp.rec_headers.get_header('WARC-Target-URI')
|
||||||
#print('Writing req/resp {0} to {1} '.format(url, filename))
|
#print('Writing req/resp {0} to {1} '.format(url, filename))
|
||||||
@ -180,6 +171,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
|
|
||||||
close_file = False
|
close_file = False
|
||||||
|
|
||||||
|
new_size = start = 0
|
||||||
|
|
||||||
if result:
|
if result:
|
||||||
out, filename = result
|
out, filename = result
|
||||||
is_new = False
|
is_new = False
|
||||||
|
@ -69,16 +69,21 @@ class RecorderApp(object):
|
|||||||
req_head, req_pay, resp_head, resp_pay, params = result
|
req_head, req_pay, resp_head, resp_pay, params = result
|
||||||
|
|
||||||
#resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay)
|
#resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay)
|
||||||
resp = self.writer.copy_warc_record(resp_pay)
|
resp_length = resp_pay.tell()
|
||||||
|
resp_pay.seek(0)
|
||||||
|
resp = self.writer.create_record_from_stream(resp_pay, resp_length)
|
||||||
|
|
||||||
if resp.rec_type == 'response':
|
if resp.rec_type == 'response':
|
||||||
uri = resp.rec_headers.get_header('WARC-Target-Uri')
|
uri = resp.rec_headers.get_header('WARC-Target-Uri')
|
||||||
|
req_length = req_pay.tell()
|
||||||
|
req_pay.seek(0)
|
||||||
req = self.writer.create_warc_record(uri=uri,
|
req = self.writer.create_warc_record(uri=uri,
|
||||||
record_type='request',
|
record_type='request',
|
||||||
payload=req_pay,
|
payload=req_pay,
|
||||||
|
length=req_length,
|
||||||
warc_headers_dict=req_head)
|
warc_headers_dict=req_head)
|
||||||
|
|
||||||
self.writer.write_req_resp(req, resp, params)
|
self.writer.write_request_response_pair(req, resp, params)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.writer.write_record(resp, params)
|
self.writer.write_record(resp, params)
|
||||||
@ -133,9 +138,13 @@ class RecorderApp(object):
|
|||||||
|
|
||||||
content_type = headers.get('Content-Type')
|
content_type = headers.get('Content-Type')
|
||||||
|
|
||||||
|
payload_length = req_stream.out.tell()
|
||||||
|
req_stream.out.seek(0)
|
||||||
|
|
||||||
record = self.writer.create_warc_record(uri=params['url'],
|
record = self.writer.create_warc_record(uri=params['url'],
|
||||||
record_type=record_type,
|
record_type=record_type,
|
||||||
payload=req_stream.out,
|
payload=req_stream.out,
|
||||||
|
length=payload_length,
|
||||||
warc_content_type=content_type,
|
warc_content_type=content_type,
|
||||||
warc_headers_dict=req_stream.headers)
|
warc_headers_dict=req_stream.headers)
|
||||||
|
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
from pywb.utils.canonicalize import calc_search_range
|
from pywb.utils.canonicalize import calc_search_range
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.cdx.cdxobject import CDXObject
|
||||||
from pywb.warc.cdxindexer import write_cdx_index
|
from pywb.warc.cdxindexer import write_cdx_index
|
||||||
from pywb.utils.timeutils import iso_date_to_timestamp
|
|
||||||
|
from warcio.timeutils import iso_date_to_timestamp
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import os
|
import os
|
||||||
|
@ -20,11 +20,13 @@ from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitD
|
|||||||
from pywb.webagg.utils import MementoUtils
|
from pywb.webagg.utils import MementoUtils
|
||||||
|
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.cdx.cdxobject import CDXObject
|
||||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
|
||||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
from warcio.statusandheaders import StatusAndHeadersParser
|
||||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
from warcio.bufferedreaders import DecompressingBufferedReader
|
||||||
|
from warcio.recordloader import ArcWarcRecordLoader
|
||||||
|
from warcio.archiveiterator import ArchiveIterator
|
||||||
|
|
||||||
from pywb.warc.cdxindexer import write_cdx_index
|
from pywb.warc.cdxindexer import write_cdx_index
|
||||||
from pywb.warc.archiveiterator import ArchiveIterator
|
|
||||||
|
|
||||||
from six.moves.urllib.parse import quote, unquote, urlencode
|
from six.moves.urllib.parse import quote, unquote, urlencode
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@ -122,9 +124,9 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
|||||||
filename = os.path.join(base_dir, filename)
|
filename = os.path.join(base_dir, filename)
|
||||||
with open(filename, 'rb') as fh:
|
with open(filename, 'rb') as fh:
|
||||||
for record in ArchiveIterator(fh, no_record_parse=True):
|
for record in ArchiveIterator(fh, no_record_parse=True):
|
||||||
assert record.status_headers == None
|
assert record.http_headers == None
|
||||||
assert int(record.rec_headers.get_header('Content-Length')) == record.length
|
assert int(record.rec_headers.get_header('Content-Length')) == record.length
|
||||||
assert record.length == len(record.stream.read())
|
assert record.length == len(record.raw_stream.read())
|
||||||
|
|
||||||
def test_record_warc_1(self):
|
def test_record_warc_1(self):
|
||||||
recorder_app = RecorderApp(self.upstream_url,
|
recorder_app = RecorderApp(self.upstream_url,
|
||||||
@ -168,16 +170,16 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
|||||||
|
|
||||||
buff = BytesIO(resp.body)
|
buff = BytesIO(resp.body)
|
||||||
record = ArcWarcRecordLoader().parse_record_stream(buff)
|
record = ArcWarcRecordLoader().parse_record_stream(buff)
|
||||||
assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers
|
assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers
|
||||||
assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers
|
assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers
|
||||||
|
|
||||||
stored_req, stored_resp = self._load_resp_req(base_path)
|
stored_req, stored_resp = self._load_resp_req(base_path)
|
||||||
|
|
||||||
assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.status_headers.headers
|
assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.http_headers.headers
|
||||||
assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.status_headers.headers
|
assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.http_headers.headers
|
||||||
|
|
||||||
assert ('X-Other', 'foo') in stored_req.status_headers.headers
|
assert ('X-Other', 'foo') in stored_req.http_headers.headers
|
||||||
assert ('Cookie', 'boo=far') in stored_req.status_headers.headers
|
assert ('Cookie', 'boo=far') in stored_req.http_headers.headers
|
||||||
|
|
||||||
self._test_all_warcs('/warcs/cookiecheck/', 1)
|
self._test_all_warcs('/warcs/cookiecheck/', 1)
|
||||||
|
|
||||||
@ -193,16 +195,16 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
|||||||
|
|
||||||
buff = BytesIO(resp.body)
|
buff = BytesIO(resp.body)
|
||||||
record = ArcWarcRecordLoader().parse_record_stream(buff)
|
record = ArcWarcRecordLoader().parse_record_stream(buff)
|
||||||
assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers
|
assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers
|
||||||
assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers
|
assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers
|
||||||
|
|
||||||
stored_req, stored_resp = self._load_resp_req(warc_path)
|
stored_req, stored_resp = self._load_resp_req(warc_path)
|
||||||
|
|
||||||
assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.status_headers.headers
|
assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.http_headers.headers
|
||||||
assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.status_headers.headers
|
assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.http_headers.headers
|
||||||
|
|
||||||
assert ('X-Other', 'foo') in stored_req.status_headers.headers
|
assert ('X-Other', 'foo') in stored_req.http_headers.headers
|
||||||
assert ('Cookie', 'boo=far') not in stored_req.status_headers.headers
|
assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers
|
||||||
|
|
||||||
self._test_all_warcs('/warcs/cookieskip/', 1)
|
self._test_all_warcs('/warcs/cookieskip/', 1)
|
||||||
|
|
||||||
@ -530,10 +532,10 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
|||||||
assert status_headers.get_header('Content-Length') == str(len(buff))
|
assert status_headers.get_header('Content-Length') == str(len(buff))
|
||||||
assert status_headers.get_header('WARC-Custom') == 'foo'
|
assert status_headers.get_header('WARC-Custom') == 'foo'
|
||||||
|
|
||||||
assert record.stream.read() == buff
|
assert record.raw_stream.read() == buff
|
||||||
|
|
||||||
status_headers = record.status_headers
|
status_headers = record.http_headers
|
||||||
assert len(record.status_headers.headers) == 2
|
assert len(record.http_headers.headers) == 2
|
||||||
|
|
||||||
assert status_headers.get_header('Content-Type') == 'text/plain'
|
assert status_headers.get_header('Content-Type') == 'text/plain'
|
||||||
assert status_headers.get_header('Content-Length') == str(len(buff))
|
assert status_headers.get_header('Content-Length') == str(len(buff))
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
from pywb.utils.timeutils import datetime_to_http_date
|
from warcio.timeutils import datetime_to_http_date
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
import six
|
import six
|
||||||
|
|
||||||
|
@ -12,10 +12,11 @@ from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
|
|||||||
from pywb.rewrite.rewriterules import RewriteRules
|
from pywb.rewrite.rewriterules import RewriteRules
|
||||||
|
|
||||||
from pywb.utils.dsrules import RuleSet
|
from pywb.utils.dsrules import RuleSet
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
|
||||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader
|
from warcio.bufferedreaders import DecompressingBufferedReader
|
||||||
from pywb.utils.loaders import to_native_str
|
from warcio.bufferedreaders import ChunkedDataReader, BufferedReader
|
||||||
|
from warcio.utils import to_native_str
|
||||||
|
|
||||||
from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
|
from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
|
||||||
|
|
||||||
|
@ -11,10 +11,11 @@ import os
|
|||||||
from six.moves.urllib.parse import urlsplit
|
from six.moves.urllib.parse import urlsplit
|
||||||
import six
|
import six
|
||||||
|
|
||||||
|
from warcio.timeutils import timestamp_now
|
||||||
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
|
|
||||||
from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url
|
from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url
|
||||||
from pywb.utils.loaders import extract_client_cookie
|
from pywb.utils.loaders import extract_client_cookie
|
||||||
from pywb.utils.timeutils import timestamp_now
|
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
|
||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
|
||||||
from pywb.rewrite.rewrite_content import RewriteContent
|
from pywb.rewrite.rewrite_content import RewriteContent
|
||||||
|
@ -50,9 +50,9 @@ HTTP Headers Rewriting
|
|||||||
|
|
||||||
from pywb.rewrite.header_rewriter import HeaderRewriter
|
from pywb.rewrite.header_rewriter import HeaderRewriter
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
|
|
||||||
from pywb.utils.timeutils import datetime_to_http_date
|
from warcio.timeutils import datetime_to_http_date
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
import pprint
|
import pprint
|
||||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,5 +1,5 @@
|
|||||||
from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter, HostScopeCookieRewriter
|
from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter, HostScopeCookieRewriter
|
||||||
from pywb.utils.timeutils import datetime_to_http_date
|
from warcio.timeutils import datetime_to_http_date
|
||||||
from six.moves import zip
|
from six.moves import zip
|
||||||
|
|
||||||
import redis
|
import redis
|
||||||
|
@ -6,10 +6,10 @@ from pywb.framework.archivalrouter import Route
|
|||||||
|
|
||||||
from pywb.rewrite.rewrite_content import RewriteContent
|
from pywb.rewrite.rewrite_content import RewriteContent
|
||||||
from pywb.rewrite.wburl import WbUrl
|
from pywb.rewrite.wburl import WbUrl
|
||||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
from warcio.recordloader import ArcWarcRecordLoader
|
||||||
from pywb.webapp.live_rewrite_handler import RewriteHandler
|
from pywb.webapp.live_rewrite_handler import RewriteHandler
|
||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
from pywb.utils.timeutils import http_date_to_timestamp
|
from warcio.timeutils import http_date_to_timestamp
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.cdx.cdxobject import CDXObject
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@ -81,7 +81,7 @@ class PlatformHandler(RewriteHandler):
|
|||||||
|
|
||||||
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
|
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
|
||||||
result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
|
result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
|
||||||
record.status_headers,
|
record.http_headers,
|
||||||
record.stream,
|
record.stream,
|
||||||
head_insert_func,
|
head_insert_func,
|
||||||
urlkey,
|
urlkey,
|
||||||
|
@ -6,14 +6,15 @@ from pywb.rewrite.url_rewriter import UrlRewriter
|
|||||||
|
|
||||||
from pywb.utils.wbexception import WbException
|
from pywb.utils.wbexception import WbException
|
||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
from pywb.utils.timeutils import http_date_to_timestamp
|
|
||||||
from pywb.utils.loaders import extract_client_cookie
|
from pywb.utils.loaders import extract_client_cookie
|
||||||
from pywb.utils.bufferedreaders import BufferedReader
|
|
||||||
|
from warcio.timeutils import http_date_to_timestamp
|
||||||
|
from warcio.bufferedreaders import BufferedReader
|
||||||
|
from warcio.recordloader import ArcWarcRecordLoader
|
||||||
|
|
||||||
from pywb.webagg.utils import BUFF_SIZE
|
from pywb.webagg.utils import BUFF_SIZE
|
||||||
|
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.cdx.cdxobject import CDXObject
|
||||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
|
||||||
from pywb.framework.wbrequestresponse import WbResponse
|
from pywb.framework.wbrequestresponse import WbResponse
|
||||||
|
|
||||||
from pywb.webagg.utils import MementoUtils, buffer_iter
|
from pywb.webagg.utils import MementoUtils, buffer_iter
|
||||||
@ -200,11 +201,11 @@ class RewriterApp(object):
|
|||||||
self._add_custom_params(cdx, r.headers, kwargs)
|
self._add_custom_params(cdx, r.headers, kwargs)
|
||||||
|
|
||||||
if readd_range:
|
if readd_range:
|
||||||
content_length = (record.status_headers.
|
content_length = (record.http_headers.
|
||||||
get_header('Content-Length'))
|
get_header('Content-Length'))
|
||||||
try:
|
try:
|
||||||
content_length = int(content_length)
|
content_length = int(content_length)
|
||||||
record.status_headers.add_range(0, content_length,
|
record.http_headers.add_range(0, content_length,
|
||||||
content_length)
|
content_length)
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
pass
|
pass
|
||||||
@ -228,8 +229,8 @@ class RewriterApp(object):
|
|||||||
cookie_key)
|
cookie_key)
|
||||||
|
|
||||||
result = self.content_rewriter.rewrite_content(urlrewriter,
|
result = self.content_rewriter.rewrite_content(urlrewriter,
|
||||||
record.status_headers,
|
record.http_headers,
|
||||||
record.stream,
|
record.raw_stream,
|
||||||
head_insert_func,
|
head_insert_func,
|
||||||
urlkey,
|
urlkey,
|
||||||
cdx,
|
cdx,
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
|
from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
|
||||||
from pywb.utils.timeutils import timestamp_now
|
from warcio.timeutils import timestamp_now
|
||||||
|
|
||||||
from pywb.utils.loaders import load
|
from pywb.utils.loaders import load
|
||||||
|
|
||||||
from six.moves.urllib.parse import urlsplit
|
from six.moves.urllib.parse import urlsplit
|
||||||
|
@ -1,336 +0,0 @@
|
|||||||
from io import BytesIO
|
|
||||||
import zlib
|
|
||||||
import brotli
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def gzip_decompressor():
|
|
||||||
"""
|
|
||||||
Decompressor which can handle decompress gzip stream
|
|
||||||
"""
|
|
||||||
return zlib.decompressobj(16 + zlib.MAX_WBITS)
|
|
||||||
|
|
||||||
|
|
||||||
def deflate_decompressor():
|
|
||||||
return zlib.decompressobj()
|
|
||||||
|
|
||||||
|
|
||||||
def deflate_decompressor_alt():
|
|
||||||
return zlib.decompressobj(-zlib.MAX_WBITS)
|
|
||||||
|
|
||||||
def brotli_decompressor():
|
|
||||||
decomp = brotli.Decompressor()
|
|
||||||
decomp.unused_data = None
|
|
||||||
return decomp
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class BufferedReader(object):
|
|
||||||
"""
|
|
||||||
A wrapping line reader which wraps an existing reader.
|
|
||||||
Read operations operate on underlying buffer, which is filled to
|
|
||||||
block_size (1024 default)
|
|
||||||
|
|
||||||
If an optional decompress type is specified,
|
|
||||||
data is fed through the decompressor when read from the buffer.
|
|
||||||
Currently supported decompression: gzip
|
|
||||||
If unspecified, default decompression is None
|
|
||||||
|
|
||||||
If decompression is specified, and decompress fails on first try,
|
|
||||||
data is assumed to not be compressed and no exception is thrown.
|
|
||||||
|
|
||||||
If a failure occurs after data has been
|
|
||||||
partially decompressed, the exception is propagated.
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
DECOMPRESSORS = {'gzip': gzip_decompressor,
|
|
||||||
'deflate': deflate_decompressor,
|
|
||||||
'deflate_alt': deflate_decompressor_alt,
|
|
||||||
'br': brotli_decompressor
|
|
||||||
}
|
|
||||||
|
|
||||||
def __init__(self, stream, block_size=1024,
|
|
||||||
decomp_type=None,
|
|
||||||
starting_data=None):
|
|
||||||
self.stream = stream
|
|
||||||
self.block_size = block_size
|
|
||||||
|
|
||||||
self._init_decomp(decomp_type)
|
|
||||||
|
|
||||||
self.buff = None
|
|
||||||
self.starting_data = starting_data
|
|
||||||
self.num_read = 0
|
|
||||||
self.buff_size = 0
|
|
||||||
|
|
||||||
def set_decomp(self, decomp_type):
|
|
||||||
self._init_decomp(decomp_type)
|
|
||||||
|
|
||||||
def _init_decomp(self, decomp_type):
|
|
||||||
self.num_block_read = 0
|
|
||||||
if decomp_type:
|
|
||||||
try:
|
|
||||||
self.decomp_type = decomp_type
|
|
||||||
self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]()
|
|
||||||
except KeyError:
|
|
||||||
raise Exception('Decompression type not supported: ' +
|
|
||||||
decomp_type)
|
|
||||||
else:
|
|
||||||
self.decomp_type = None
|
|
||||||
self.decompressor = None
|
|
||||||
|
|
||||||
def _fillbuff(self, block_size=None):
|
|
||||||
if not self.empty():
|
|
||||||
return
|
|
||||||
|
|
||||||
# can't read past next member
|
|
||||||
if self.rem_length() > 0:
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.starting_data:
|
|
||||||
data = self.starting_data
|
|
||||||
self.starting_data = None
|
|
||||||
else:
|
|
||||||
if not block_size:
|
|
||||||
block_size = self.block_size
|
|
||||||
data = self.stream.read(block_size)
|
|
||||||
|
|
||||||
self._process_read(data)
|
|
||||||
|
|
||||||
def _process_read(self, data):
|
|
||||||
data = self._decompress(data)
|
|
||||||
self.buff_size = len(data)
|
|
||||||
self.num_read += self.buff_size
|
|
||||||
self.num_block_read += self.buff_size
|
|
||||||
self.buff = BytesIO(data)
|
|
||||||
|
|
||||||
def _decompress(self, data):
|
|
||||||
if self.decompressor and data:
|
|
||||||
try:
|
|
||||||
data = self.decompressor.decompress(data)
|
|
||||||
except Exception as e:
|
|
||||||
# if first read attempt, assume non-gzipped stream
|
|
||||||
if self.num_block_read == 0:
|
|
||||||
if self.decomp_type == 'deflate':
|
|
||||||
self._init_decomp('deflate_alt')
|
|
||||||
data = self._decompress(data)
|
|
||||||
else:
|
|
||||||
self.decompressor = None
|
|
||||||
# otherwise (partly decompressed), something is wrong
|
|
||||||
else:
|
|
||||||
print(str(e))
|
|
||||||
return b''
|
|
||||||
return data
|
|
||||||
|
|
||||||
def read(self, length=None):
|
|
||||||
"""
|
|
||||||
Fill bytes and read some number of bytes
|
|
||||||
(up to length if specified)
|
|
||||||
<= length bytes may be read if reached the end of input
|
|
||||||
if at buffer boundary, will attempt to read again until
|
|
||||||
specified length is read
|
|
||||||
"""
|
|
||||||
all_buffs = []
|
|
||||||
while length is None or length > 0:
|
|
||||||
self._fillbuff()
|
|
||||||
buff = self.buff.read(length)
|
|
||||||
if not buff:
|
|
||||||
break
|
|
||||||
|
|
||||||
all_buffs.append(buff)
|
|
||||||
if length:
|
|
||||||
length -= len(buff)
|
|
||||||
|
|
||||||
return b''.join(all_buffs)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def readline(self, length=None):
|
|
||||||
"""
|
|
||||||
Fill buffer and read a full line from the buffer
|
|
||||||
(up to specified length, if provided)
|
|
||||||
If no newline found at end, try filling buffer again in case
|
|
||||||
at buffer boundary.
|
|
||||||
"""
|
|
||||||
if length == 0:
|
|
||||||
return b''
|
|
||||||
|
|
||||||
self._fillbuff()
|
|
||||||
linebuff = self.buff.readline(length)
|
|
||||||
|
|
||||||
# we may be at a boundary
|
|
||||||
while not linebuff.endswith(b'\n'):
|
|
||||||
if length:
|
|
||||||
length -= len(linebuff)
|
|
||||||
if length <= 0:
|
|
||||||
break
|
|
||||||
|
|
||||||
self._fillbuff()
|
|
||||||
|
|
||||||
if self.empty():
|
|
||||||
break
|
|
||||||
|
|
||||||
linebuff += self.buff.readline(length)
|
|
||||||
|
|
||||||
return linebuff
|
|
||||||
|
|
||||||
def empty(self):
|
|
||||||
return not self.buff or self.buff.tell() >= self.buff_size
|
|
||||||
|
|
||||||
def read_next_member(self):
|
|
||||||
if not self.decompressor or not self.decompressor.unused_data:
|
|
||||||
return False
|
|
||||||
|
|
||||||
self.starting_data = self.decompressor.unused_data
|
|
||||||
self._init_decomp(self.decomp_type)
|
|
||||||
return True
|
|
||||||
|
|
||||||
def rem_length(self):
|
|
||||||
rem = 0
|
|
||||||
if self.buff:
|
|
||||||
rem = self.buff_size - self.buff.tell()
|
|
||||||
|
|
||||||
if self.decompressor and self.decompressor.unused_data:
|
|
||||||
rem += len(self.decompressor.unused_data)
|
|
||||||
return rem
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
if self.stream:
|
|
||||||
self.stream.close()
|
|
||||||
self.stream = None
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_supported_decompressors(cls):
|
|
||||||
return cls.DECOMPRESSORS.keys()
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class DecompressingBufferedReader(BufferedReader):
|
|
||||||
"""
|
|
||||||
A BufferedReader which defaults to gzip decompression,
|
|
||||||
(unless different type specified)
|
|
||||||
"""
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
if 'decomp_type' not in kwargs:
|
|
||||||
kwargs['decomp_type'] = 'gzip'
|
|
||||||
super(DecompressingBufferedReader, self).__init__(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class ChunkedDataException(Exception):
|
|
||||||
def __init__(self, msg, data=b''):
|
|
||||||
Exception.__init__(self, msg)
|
|
||||||
self.data = data
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class ChunkedDataReader(BufferedReader):
|
|
||||||
r"""
|
|
||||||
A ChunkedDataReader is a DecompressingBufferedReader
|
|
||||||
which also supports de-chunking of the data if it happens
|
|
||||||
to be http 'chunk-encoded'.
|
|
||||||
|
|
||||||
If at any point the chunked header is not available, the stream is
|
|
||||||
assumed to not be chunked and no more dechunking occurs.
|
|
||||||
"""
|
|
||||||
def __init__(self, stream, raise_exceptions=False, **kwargs):
|
|
||||||
super(ChunkedDataReader, self).__init__(stream, **kwargs)
|
|
||||||
self.all_chunks_read = False
|
|
||||||
self.not_chunked = False
|
|
||||||
|
|
||||||
# if False, we'll use best-guess fallback for parse errors
|
|
||||||
self.raise_chunked_data_exceptions = raise_exceptions
|
|
||||||
|
|
||||||
def _fillbuff(self, block_size=None):
|
|
||||||
if self.not_chunked:
|
|
||||||
return super(ChunkedDataReader, self)._fillbuff(block_size)
|
|
||||||
|
|
||||||
# Loop over chunks until there is some data (not empty())
|
|
||||||
# In particular, gzipped data may require multiple chunks to
|
|
||||||
# return any decompressed result
|
|
||||||
while (self.empty() and
|
|
||||||
not self.all_chunks_read and
|
|
||||||
not self.not_chunked):
|
|
||||||
|
|
||||||
try:
|
|
||||||
length_header = self.stream.readline(64)
|
|
||||||
self._try_decode(length_header)
|
|
||||||
except ChunkedDataException as e:
|
|
||||||
if self.raise_chunked_data_exceptions:
|
|
||||||
raise
|
|
||||||
|
|
||||||
# Can't parse the data as chunked.
|
|
||||||
# It's possible that non-chunked data is served
|
|
||||||
# with a Transfer-Encoding: chunked.
|
|
||||||
# Treat this as non-chunk encoded from here on.
|
|
||||||
self._process_read(length_header + e.data)
|
|
||||||
self.not_chunked = True
|
|
||||||
|
|
||||||
# parse as block as non-chunked
|
|
||||||
return super(ChunkedDataReader, self)._fillbuff(block_size)
|
|
||||||
|
|
||||||
def _try_decode(self, length_header):
|
|
||||||
# decode length header
|
|
||||||
try:
|
|
||||||
chunk_size = int(length_header.strip().split(b';')[0], 16)
|
|
||||||
except ValueError:
|
|
||||||
raise ChunkedDataException(b"Couldn't decode length header " +
|
|
||||||
length_header)
|
|
||||||
|
|
||||||
if not chunk_size:
|
|
||||||
# chunk_size 0 indicates end of file
|
|
||||||
self.all_chunks_read = True
|
|
||||||
self._process_read(b'')
|
|
||||||
return
|
|
||||||
|
|
||||||
data_len = 0
|
|
||||||
data = b''
|
|
||||||
|
|
||||||
# read chunk
|
|
||||||
while data_len < chunk_size:
|
|
||||||
new_data = self.stream.read(chunk_size - data_len)
|
|
||||||
|
|
||||||
# if we unexpectedly run out of data,
|
|
||||||
# either raise an exception or just stop reading,
|
|
||||||
# assuming file was cut off
|
|
||||||
if not new_data:
|
|
||||||
if self.raise_chunked_data_exceptions:
|
|
||||||
msg = 'Ran out of data before end of chunk'
|
|
||||||
raise ChunkedDataException(msg, data)
|
|
||||||
else:
|
|
||||||
chunk_size = data_len
|
|
||||||
self.all_chunks_read = True
|
|
||||||
|
|
||||||
data += new_data
|
|
||||||
data_len = len(data)
|
|
||||||
|
|
||||||
# if we successfully read a block without running out,
|
|
||||||
# it should end in \r\n
|
|
||||||
if not self.all_chunks_read:
|
|
||||||
clrf = self.stream.read(2)
|
|
||||||
if clrf != b'\r\n':
|
|
||||||
raise ChunkedDataException(b"Chunk terminator not found.",
|
|
||||||
data)
|
|
||||||
|
|
||||||
# hand to base class for further processing
|
|
||||||
self._process_read(data)
|
|
||||||
|
|
||||||
def read(self, length=None):
|
|
||||||
""" read bytes from stream, if length specified,
|
|
||||||
may read across multiple chunks to get exact length
|
|
||||||
"""
|
|
||||||
buf = super(ChunkedDataReader, self).read(length)
|
|
||||||
if not length:
|
|
||||||
return buf
|
|
||||||
|
|
||||||
# if length specified, attempt to read exact length
|
|
||||||
rem = length - len(buf)
|
|
||||||
while rem > 0:
|
|
||||||
new_buf = super(ChunkedDataReader, self).read(rem)
|
|
||||||
if not new_buf:
|
|
||||||
break
|
|
||||||
|
|
||||||
buf += new_buf
|
|
||||||
rem -= len(new_buf)
|
|
||||||
|
|
||||||
return buf
|
|
@ -1,69 +0,0 @@
|
|||||||
# ============================================================================
|
|
||||||
class LimitReader(object):
|
|
||||||
"""
|
|
||||||
A reader which will not read more than specified limit
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, stream, limit):
|
|
||||||
self.stream = stream
|
|
||||||
self.limit = limit
|
|
||||||
|
|
||||||
if hasattr(stream, 'tell'):
|
|
||||||
self.tell = self._tell
|
|
||||||
|
|
||||||
def _update(self, buff):
|
|
||||||
length = len(buff)
|
|
||||||
self.limit -= length
|
|
||||||
return buff
|
|
||||||
|
|
||||||
def read(self, length=None):
|
|
||||||
if length is not None:
|
|
||||||
length = min(length, self.limit)
|
|
||||||
else:
|
|
||||||
length = self.limit
|
|
||||||
|
|
||||||
if length == 0:
|
|
||||||
return b''
|
|
||||||
|
|
||||||
buff = self.stream.read(length)
|
|
||||||
return self._update(buff)
|
|
||||||
|
|
||||||
def readline(self, length=None):
|
|
||||||
if length is not None:
|
|
||||||
length = min(length, self.limit)
|
|
||||||
else:
|
|
||||||
length = self.limit
|
|
||||||
|
|
||||||
if length == 0:
|
|
||||||
return b''
|
|
||||||
|
|
||||||
buff = self.stream.readline(length)
|
|
||||||
return self._update(buff)
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
self.stream.close()
|
|
||||||
|
|
||||||
def _tell(self):
|
|
||||||
return self.stream.tell()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def wrap_stream(stream, content_length):
|
|
||||||
"""
|
|
||||||
If given content_length is an int > 0, wrap the stream
|
|
||||||
in a LimitReader. Otherwise, return the stream unaltered
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
content_length = int(content_length)
|
|
||||||
if content_length >= 0:
|
|
||||||
# optimize: if already a LimitStream, set limit to
|
|
||||||
# the smaller of the two limits
|
|
||||||
if isinstance(stream, LimitReader):
|
|
||||||
stream.limit = min(stream.limit, content_length)
|
|
||||||
else:
|
|
||||||
stream = LimitReader(stream, content_length)
|
|
||||||
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
return stream
|
|
||||||
|
|
@ -17,7 +17,7 @@ import base64
|
|||||||
import cgi
|
import cgi
|
||||||
|
|
||||||
from io import open, BytesIO
|
from io import open, BytesIO
|
||||||
from pywb.utils.limitreader import LimitReader
|
from warcio.limitreader import LimitReader
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from boto import connect_s3
|
from boto import connect_s3
|
||||||
|
@ -1,285 +0,0 @@
|
|||||||
"""
|
|
||||||
Representation and parsing of HTTP-style status + headers
|
|
||||||
"""
|
|
||||||
|
|
||||||
from pprint import pformat
|
|
||||||
from copy import copy
|
|
||||||
from six.moves import range
|
|
||||||
from six import iteritems
|
|
||||||
from pywb.utils.loaders import to_native_str
|
|
||||||
import uuid
|
|
||||||
|
|
||||||
|
|
||||||
WRAP_WIDTH = 80
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class StatusAndHeaders(object):
|
|
||||||
"""
|
|
||||||
Representation of parsed http-style status line and headers
|
|
||||||
Status Line if first line of request/response
|
|
||||||
Headers is a list of (name, value) tuples
|
|
||||||
An optional protocol which appears on first line may be specified
|
|
||||||
"""
|
|
||||||
def __init__(self, statusline, headers, protocol='', total_len=0):
|
|
||||||
self.statusline = statusline
|
|
||||||
self.headers = headers
|
|
||||||
self.protocol = protocol
|
|
||||||
self.total_len = total_len
|
|
||||||
|
|
||||||
def get_header(self, name, default_value=None):
|
|
||||||
"""
|
|
||||||
return header (name, value)
|
|
||||||
if found
|
|
||||||
"""
|
|
||||||
name_lower = name.lower()
|
|
||||||
for value in self.headers:
|
|
||||||
if value[0].lower() == name_lower:
|
|
||||||
return value[1]
|
|
||||||
|
|
||||||
return default_value
|
|
||||||
|
|
||||||
def add_header(self, name, value):
|
|
||||||
self.headers.append((name, value))
|
|
||||||
|
|
||||||
def replace_header(self, name, value):
|
|
||||||
"""
|
|
||||||
replace header with new value or add new header
|
|
||||||
return old header value, if any
|
|
||||||
"""
|
|
||||||
name_lower = name.lower()
|
|
||||||
for index in range(len(self.headers) - 1, -1, -1):
|
|
||||||
curr_name, curr_value = self.headers[index]
|
|
||||||
if curr_name.lower() == name_lower:
|
|
||||||
self.headers[index] = (curr_name, value)
|
|
||||||
return curr_value
|
|
||||||
|
|
||||||
self.headers.append((name, value))
|
|
||||||
return None
|
|
||||||
|
|
||||||
def replace_headers(self, header_dict):
|
|
||||||
"""
|
|
||||||
replace all headers in header_dict that already exist
|
|
||||||
add any remaining headers
|
|
||||||
"""
|
|
||||||
header_dict = copy(header_dict)
|
|
||||||
|
|
||||||
for index in range(len(self.headers) - 1, -1, -1):
|
|
||||||
curr_name, curr_value = self.headers[index]
|
|
||||||
name_lower = curr_name.lower()
|
|
||||||
if name_lower in header_dict:
|
|
||||||
self.headers[index] = (curr_name, header_dict[name_lower])
|
|
||||||
del header_dict[name_lower]
|
|
||||||
|
|
||||||
for name, value in iteritems(header_dict):
|
|
||||||
self.headers.append((name, value))
|
|
||||||
|
|
||||||
def remove_header(self, name):
|
|
||||||
"""
|
|
||||||
Remove header (case-insensitive)
|
|
||||||
return True if header removed, False otherwise
|
|
||||||
"""
|
|
||||||
name_lower = name.lower()
|
|
||||||
for index in range(len(self.headers) - 1, -1, -1):
|
|
||||||
if self.headers[index][0].lower() == name_lower:
|
|
||||||
del self.headers[index]
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def get_statuscode(self):
|
|
||||||
"""
|
|
||||||
Return the statuscode part of the status response line
|
|
||||||
(Assumes no protocol in the statusline)
|
|
||||||
"""
|
|
||||||
code = self.statusline.split(' ', 1)[0]
|
|
||||||
return code
|
|
||||||
|
|
||||||
def validate_statusline(self, valid_statusline):
|
|
||||||
"""
|
|
||||||
Check that the statusline is valid, eg. starts with a numeric
|
|
||||||
code. If not, replace with passed in valid_statusline
|
|
||||||
"""
|
|
||||||
code = self.get_statuscode()
|
|
||||||
try:
|
|
||||||
code = int(code)
|
|
||||||
assert(code > 0)
|
|
||||||
return True
|
|
||||||
except(ValueError, AssertionError):
|
|
||||||
self.statusline = valid_statusline
|
|
||||||
return False
|
|
||||||
|
|
||||||
def add_range(self, start, part_len, total_len):
|
|
||||||
"""
|
|
||||||
Add range headers indicating that this a partial response
|
|
||||||
"""
|
|
||||||
content_range = 'bytes {0}-{1}/{2}'.format(start,
|
|
||||||
start + part_len - 1,
|
|
||||||
total_len)
|
|
||||||
|
|
||||||
self.statusline = '206 Partial Content'
|
|
||||||
self.replace_header('Content-Range', content_range)
|
|
||||||
self.replace_header('Accept-Ranges', 'bytes')
|
|
||||||
return self
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
headers_str = pformat(self.headers, indent=2, width=WRAP_WIDTH)
|
|
||||||
return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
|
|
||||||
headers = {2})".format(self.protocol, self.statusline, headers_str)
|
|
||||||
|
|
||||||
def __eq__(self, other):
|
|
||||||
return (self.statusline == other.statusline and
|
|
||||||
self.headers == other.headers and
|
|
||||||
self.protocol == other.protocol)
|
|
||||||
|
|
||||||
def __str__(self, exclude_list=None):
|
|
||||||
return self.to_str(exclude_list)
|
|
||||||
|
|
||||||
def __bool__(self):
|
|
||||||
return bool(self.statusline or self.headers)
|
|
||||||
|
|
||||||
__nonzero__ = __bool__
|
|
||||||
|
|
||||||
def to_str(self, exclude_list):
|
|
||||||
string = self.protocol
|
|
||||||
|
|
||||||
if string and self.statusline:
|
|
||||||
string += ' '
|
|
||||||
|
|
||||||
if self.statusline:
|
|
||||||
string += self.statusline
|
|
||||||
|
|
||||||
if string:
|
|
||||||
string += '\r\n'
|
|
||||||
|
|
||||||
for h in self.headers:
|
|
||||||
if exclude_list and h[0].lower() in exclude_list:
|
|
||||||
continue
|
|
||||||
|
|
||||||
string += ': '.join(h) + '\r\n'
|
|
||||||
|
|
||||||
return string
|
|
||||||
|
|
||||||
def to_bytes(self, exclude_list=None):
|
|
||||||
return self.to_str(exclude_list).encode('iso-8859-1') + b'\r\n'
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def _strip_count(string, total_read):
|
|
||||||
length = len(string)
|
|
||||||
return string.rstrip(), total_read + length
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class StatusAndHeadersParser(object):
|
|
||||||
"""
|
|
||||||
Parser which consumes a stream support readline() to read
|
|
||||||
status and headers and return a StatusAndHeaders object
|
|
||||||
"""
|
|
||||||
def __init__(self, statuslist, verify=True):
|
|
||||||
self.statuslist = statuslist
|
|
||||||
self.verify = verify
|
|
||||||
|
|
||||||
def parse(self, stream, full_statusline=None):
|
|
||||||
"""
|
|
||||||
parse stream for status line and headers
|
|
||||||
return a StatusAndHeaders object
|
|
||||||
|
|
||||||
support continuation headers starting with space or tab
|
|
||||||
"""
|
|
||||||
|
|
||||||
def readline():
|
|
||||||
return to_native_str(stream.readline())
|
|
||||||
|
|
||||||
# status line w newlines intact
|
|
||||||
if full_statusline is None:
|
|
||||||
full_statusline = readline()
|
|
||||||
else:
|
|
||||||
full_statusline = to_native_str(full_statusline)
|
|
||||||
|
|
||||||
statusline, total_read = _strip_count(full_statusline, 0)
|
|
||||||
|
|
||||||
headers = []
|
|
||||||
|
|
||||||
# at end of stream
|
|
||||||
if total_read == 0:
|
|
||||||
raise EOFError()
|
|
||||||
elif not statusline:
|
|
||||||
return StatusAndHeaders(statusline=statusline,
|
|
||||||
headers=headers,
|
|
||||||
protocol='',
|
|
||||||
total_len=total_read)
|
|
||||||
|
|
||||||
# validate only if verify is set
|
|
||||||
if self.verify:
|
|
||||||
protocol_status = self.split_prefix(statusline, self.statuslist)
|
|
||||||
|
|
||||||
if not protocol_status:
|
|
||||||
msg = 'Expected Status Line starting with {0} - Found: {1}'
|
|
||||||
msg = msg.format(self.statuslist, statusline)
|
|
||||||
raise StatusAndHeadersParserException(msg, full_statusline)
|
|
||||||
else:
|
|
||||||
protocol_status = statusline.split(' ', 1)
|
|
||||||
|
|
||||||
line, total_read = _strip_count(readline(), total_read)
|
|
||||||
while line:
|
|
||||||
result = line.split(':', 1)
|
|
||||||
if len(result) == 2:
|
|
||||||
name = result[0].rstrip(' \t')
|
|
||||||
value = result[1].lstrip()
|
|
||||||
else:
|
|
||||||
name = result[0]
|
|
||||||
value = None
|
|
||||||
|
|
||||||
next_line, total_read = _strip_count(readline(),
|
|
||||||
total_read)
|
|
||||||
|
|
||||||
# append continuation lines, if any
|
|
||||||
while next_line and next_line.startswith((' ', '\t')):
|
|
||||||
if value is not None:
|
|
||||||
value += next_line
|
|
||||||
next_line, total_read = _strip_count(readline(),
|
|
||||||
total_read)
|
|
||||||
|
|
||||||
if value is not None:
|
|
||||||
header = (name, value)
|
|
||||||
headers.append(header)
|
|
||||||
|
|
||||||
line = next_line
|
|
||||||
|
|
||||||
if len(protocol_status) > 1:
|
|
||||||
statusline = protocol_status[1].strip()
|
|
||||||
else:
|
|
||||||
statusline = ''
|
|
||||||
|
|
||||||
return StatusAndHeaders(statusline=statusline,
|
|
||||||
headers=headers,
|
|
||||||
protocol=protocol_status[0],
|
|
||||||
total_len=total_read)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def split_prefix(key, prefixs):
|
|
||||||
"""
|
|
||||||
split key string into prefix and remainder
|
|
||||||
for first matching prefix from a list
|
|
||||||
"""
|
|
||||||
key_upper = key.upper()
|
|
||||||
for prefix in prefixs:
|
|
||||||
if key_upper.startswith(prefix):
|
|
||||||
plen = len(prefix)
|
|
||||||
return (key_upper[:plen], key[plen:])
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def make_warc_id(id_=None):
|
|
||||||
if not id_:
|
|
||||||
id_ = uuid.uuid1()
|
|
||||||
return '<urn:uuid:{0}>'.format(id_)
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class StatusAndHeadersParserException(Exception):
|
|
||||||
"""
|
|
||||||
status + headers parsing exception
|
|
||||||
"""
|
|
||||||
def __init__(self, msg, statusline):
|
|
||||||
super(StatusAndHeadersParserException, self).__init__(msg)
|
|
||||||
self.statusline = statusline
|
|
@ -1,174 +0,0 @@
r"""
# DecompressingBufferedReader Tests
#=================================================================

# DecompressingBufferedReader readline()
>>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline())
' CDX N b a m s k r M S V g\n'

# detect not compressed
>>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline())
' CDX N b a m s k r M S V g\n'

# decompress with on the fly compression, default gzip compression
>>> print_str(DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n'))).read())
'ABC\n1234\n'

# decompress with on the fly compression, default 'inflate' compression
>>> print_str(DecompressingBufferedReader(BytesIO(compress_alt('ABC\n1234\n')), decomp_type='deflate').read())
'ABC\n1234\n'

# error: invalid compress type
>>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = 'bzip2').read()
Traceback (most recent call last):
Exception: Decompression type not supported: bzip2

# invalid output when reading compressed data as not compressed
>>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = None).read() != b'ABC'
True


# DecompressingBufferedReader readline() with decompression (zipnum file, no header)
>>> print_str(DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline())
'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\n'

# test very small block size
>>> dbr = DecompressingBufferedReader(BytesIO(b'ABCDEFG\nHIJKLMN\nOPQR\nXYZ'), block_size = 3)
>>> print_str(dbr.readline()); print_str(dbr.readline(4)); print_str(dbr.readline()); print_str(dbr.readline()); print_str(dbr.readline(2)); print_str(dbr.readline()); print_str(dbr.readline())
'ABCDEFG\n'
'HIJK'
'LMN\n'
'OPQR\n'
'XY'
'Z'
''

# test zero length reads
>>> x = DecompressingBufferedReader(LimitReader(BytesIO(b'\r\n'), 1))
>>> print_str(x.readline(0)); print_str(x.read(0))
''
''

# Chunk-Decoding Buffered Reader Tests
#=================================================================

Properly formatted chunked data:
>>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n0\r\n\r\n"));
>>> print_str(c.read() + c.read() + c.read())
'1234'

Non-chunked data:
>>> print_str(ChunkedDataReader(BytesIO(b"xyz123!@#")).read())
'xyz123!@#'

Non-chunked, compressed data, specify decomp_type
>>> print_str(ChunkedDataReader(BytesIO(compress('ABCDEF')), decomp_type='gzip').read())
'ABCDEF'

Non-chunked, compressed data, specify compression separately
>>> c = ChunkedDataReader(BytesIO(compress('ABCDEF'))); c.set_decomp('gzip'); print_str(c.read())
'ABCDEF'

Non-chunked, compressed data, wrap in DecompressingBufferedReader
>>> print_str(DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read())
'\nABCDEF\nGHIJ'

Chunked compressed data
Split compressed stream into 10-byte chunk and a remainder chunk
>>> b = compress('ABCDEFGHIJKLMNOP')
>>> l = len(b)
>>> in_ = format(10, 'x').encode('utf-8') + b"\r\n" + b[:10] + b"\r\n" + format(l - 10, 'x').encode('utf-8') + b"\r\n" + b[10:] + b"\r\n0\r\n\r\n"
>>> c = ChunkedDataReader(BytesIO(in_), decomp_type='gzip')
>>> print_str(c.read())
'ABCDEFGHIJKLMNOP'

Starts like chunked data, but isn't:
>>> c = ChunkedDataReader(BytesIO(b"1\r\nxyz123!@#"));
>>> print_str(c.read() + c.read())
'1\r\nx123!@#'

Chunked data cut off part way through:
>>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"));
>>> print_str(c.read() + c.read())
'123412'

Zero-Length chunk:
>>> print_str(ChunkedDataReader(BytesIO(b"0\r\n\r\n")).read())
''

"""

from io import BytesIO
from pywb.utils.bufferedreaders import ChunkedDataReader, ChunkedDataException
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.limitreader import LimitReader

from pywb import get_test_dir

import six

import zlib
import pytest

test_cdx_dir = get_test_dir() + 'cdx/'
test_zip_dir = get_test_dir() + 'zipcdx/'


def compress(buff):
    buff = buff.encode('utf-8')
    compressobj = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
    compressed = compressobj.compress(buff)
    compressed += compressobj.flush()

    return compressed


# plain "inflate"
def compress_alt(buff):
    buff = buff.encode('utf-8')
    compressobj = zlib.compressobj(6, zlib.DEFLATED)
    compressed = compressobj.compress(buff)
    compressed += compressobj.flush()
    # drop gzip headers/tail
    compressed = compressed[2:-4]

    return compressed


# Brotli

def test_brotli():
    with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
        x = DecompressingBufferedReader(fh, decomp_type='br')
        assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096


# Compression
def test_compress_mix():
    # error: compressed member, followed by not compressed -- now allowed!
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip')
    b = x.read()
    assert b == b'ABC'
    x.read_next_member()
    assert x.read() == b'123'
    #with pytest.raises(zlib.error):
    #    x.read()
    #error: Error -3 while decompressing: incorrect header check


# Errors

def test_err_chunk_cut_off():
    # Chunked data cut off with exceptions
    c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"), raise_exceptions=True)
    with pytest.raises(ChunkedDataException):
        c.read() + c.read()
    #ChunkedDataException: Ran out of data before end of chunk


def print_str(string):
    return string.decode('utf-8') if six.PY3 else string


if __name__ == "__main__":
    import doctest
    doctest.testmod()
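A minimal sketch of the chunked transfer framing the doctests above build by hand (the helper name is illustrative): each chunk is a hex length, CRLF, the chunk bytes, CRLF, with a zero-length chunk terminating the stream.

def chunk(data, size):
    # frame 'data' as HTTP chunked transfer encoding in 'size'-byte chunks
    out = b''
    for i in range(0, len(data), size):
        part = data[i:i + size]
        out += format(len(part), 'x').encode('ascii') + b'\r\n' + part + b'\r\n'
    return out + b'0\r\n\r\n'

assert chunk(b'1234', 4) == b'4\r\n1234\r\n0\r\n\r\n'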
@ -1,27 +1,5 @@
#=================================================================
r"""
# LimitReader Tests
#=================================================================
>>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
'abcdefghji'

>>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
'abcdefgh'

>>> LimitReader.wrap_stream(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8), 4).readline(26)
'abcd'

>>> read_multiple(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'

# zero-length read
>>> print_str(LimitReader(StringIO('a'), 0).readline(0))
''

# don't wrap if invalid length
>>> b = StringIO('b')
>>> LimitReader.wrap_stream(b, 'abc') == b
True

# BlockLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read(400))
@ -143,27 +121,14 @@ import requests
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
from pywb.utils.loaders import extract_client_cookie, extract_post_query
from pywb.utils.loaders import append_post_query, read_last_line
from pywb.utils.limitreader import LimitReader

from pywb.utils.bufferedreaders import DecompressingBufferedReader
from warcio.bufferedreaders import DecompressingBufferedReader

from pywb import get_test_dir

test_cdx_dir = get_test_dir() + 'cdx/'


def read_multiple(reader, inc_reads):
    result = None
    for x in inc_reads:
        result = reader.read(x)
    return result


def seek_read_full(seekable_reader, offset):
    seekable_reader.seek(offset)
    seekable_reader.readline()  #skip
    return seekable_reader.readline()


def test_s3_read_1():
    pytest.importorskip('boto')

@ -178,15 +143,6 @@ def test_s3_read_1():
    assert reader.readline() == b'WARC/1.0\r\n'
    assert reader.readline() == b'WARC-Type: response\r\n'


def test_limit_post():
    reader = LimitReader(BytesIO(b'abcdefg'), 3)
    r = requests.request(method='POST',
                         url='http://httpbin.org/post',
                         data=reader,
                         headers={'Content-Length': '3'})

    assert '"abc"' in r.text


# Error
def test_err_no_such_file():
    # no such file
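A simplified re-implementation of the LimitReader behavior the removed test relied on (this is a sketch for illustration, not the pywb class, which also supports readline and wrap_stream): reads are capped at the wrapped limit regardless of the requested size.

from io import BytesIO

class LimitReader(object):
    def __init__(self, stream, limit):
        self.stream = stream
        self.limit = limit

    def read(self, length):
        # never read past the remaining limit
        length = min(length, self.limit)
        buff = self.stream.read(length)
        self.limit -= len(buff)
        return buff

assert LimitReader(BytesIO(b'abcdefg'), 3).read(100) == b'abc'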
@ -1,182 +0,0 @@
"""
>>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
>>> st1
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
  ('Some', 'Value'),
  ('Multi-Line', 'Value1 Also This')])

# add range
>>> StatusAndHeaders(statusline = '200 OK', headers=[('Content-Type', 'text/plain')]).add_range(10, 4, 100)
StatusAndHeaders(protocol = '', statusline = '206 Partial Content', headers = [ ('Content-Type', 'text/plain'),
  ('Content-Range', 'bytes 10-13/100'),
  ('Accept-Ranges', 'bytes')])

# other protocol expected
>>> StatusAndHeadersParser(['Other']).parse(StringIO(status_headers_1))  # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK

>>> StatusAndHeadersParser(['Other'], verify=False).parse(StringIO(status_headers_1))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
  ('Some', 'Value'),
  ('Multi-Line', 'Value1 Also This')])


# verify protocol line
>>> StatusAndHeadersParser(['HTTP/1.0'], verify=True).parse(StringIO(unknown_protocol_headers))  # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0'] - Found: OtherBlah


# allow unexpected/invalid protocol line
>>> StatusAndHeadersParser(['HTTP/1.0'], verify=False).parse(StringIO(unknown_protocol_headers))
StatusAndHeaders(protocol = 'OtherBlah', statusline = '', headers = [('Foo', 'Bar')])


# test equality op
>>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
True

# replace header, print new headers
>>> st1.replace_header('some', 'Another-Value'); st1
'Value'
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
  ('Some', 'Another-Value'),
  ('Multi-Line', 'Value1 Also This')])


# remove header
>>> st1.remove_header('some')
True

# already removed
>>> st1.remove_header('Some')
False

# empty
>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])


>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_3))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])

# case-insensitive match
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_4))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])

"""


from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders
from six import StringIO
import pytest


status_headers_1 = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
HTTP/1.0 200 OK\r\n\
Some: Value\r\n\
Multi-Line: Value1\r\n\
    Also This\r\n\
\r\n\
Body"


status_headers_2 = """

"""

status_headers_3 = "\
HTTP/1.0 204 Empty\r\n\
Content-Type: Value\r\n\
%Invalid%\r\n\
\tMultiline\r\n\
Content-Length: 0\r\n\
\r\n"

status_headers_4 = "\
http/1.0 204 empty\r\n\
Content-Type: Value\r\n\
%Invalid%\r\n\
\tMultiline\r\n\
Content-Length: 0\r\n\
\r\n"

unknown_protocol_headers = "\
OtherBlah\r\n\
Foo: Bar\r\n\
\r\n"


req_headers = "\
GET / HTTP/1.0\r\n\
Foo: Bar\r\n\
Content-Length: 0\r\n"


if __name__ == "__main__":
    import doctest
    doctest.testmod()


def test_to_str_1():
    res = str(StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1)))

    exp = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
Multi-Line: Value1 Also This\r\n\
"
    assert(res == exp)


def test_to_str_exclude():
    sah = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
    res = sah.to_str(['multi-line'])

    exp = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
"
    assert(res == exp)

    assert(sah.to_bytes(['multi-line']) == (exp.encode('latin-1') + b'\r\n'))


def test_to_str_2():
    res = str(StatusAndHeadersParser(['GET']).parse(StringIO(req_headers)))

    assert(res == req_headers)

    res = str(StatusAndHeadersParser(['GET']).parse(StringIO(req_headers + '\r\n')))

    assert(res == req_headers)


def test_to_str_with_remove():
    res = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers))
    res.remove_header('Foo')

    exp = "\
GET / HTTP/1.0\r\n\
Content-Length: 0\r\n"

    assert(str(res) == exp)


def test_status_empty():
    with pytest.raises(EOFError):
        StatusAndHeadersParser([], verify=False).parse(StringIO(''))


def test_status_one_word():
    res = StatusAndHeadersParser(['GET'], verify=False).parse(StringIO('A'))
    assert(str(res) == 'A\r\n')
@ -1,330 +0,0 @@
"""
utility functions for converting between
datetime, iso date and 14-digit timestamp
"""

import re
import time
import datetime
import calendar

from email.utils import parsedate, formatdate

#=================================================================
# str <-> datetime conversion
#=================================================================

DATE_TIMESPLIT = re.compile(r'[^\d]')

TIMESTAMP_14 = '%Y%m%d%H%M%S'
ISO_DT = '%Y-%m-%dT%H:%M:%SZ'

PAD_14_DOWN = '10000101000000'
PAD_14_UP = '29991231235959'
PAD_6_UP = '299912'


def iso_date_to_datetime(string):
    """
    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)

    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)
    """

    nums = DATE_TIMESPLIT.split(string)
    if nums[-1] == '':
        nums = nums[:-1]

    the_datetime = datetime.datetime(*(int(num) for num in nums))
    return the_datetime


def http_date_to_datetime(string):
    """
    >>> http_date_to_datetime('Thu, 26 Dec 2013 09:50:10 GMT')
    datetime.datetime(2013, 12, 26, 9, 50, 10)
    """
    return datetime.datetime(*parsedate(string)[:6])


def datetime_to_http_date(the_datetime):
    """
    >>> datetime_to_http_date(datetime.datetime(2013, 12, 26, 9, 50, 10))
    'Thu, 26 Dec 2013 09:50:10 GMT'

    # Verify inverses
    >>> x = 'Thu, 26 Dec 2013 09:50:10 GMT'
    >>> datetime_to_http_date(http_date_to_datetime(x)) == x
    True
    """
    timeval = calendar.timegm(the_datetime.utctimetuple())
    return formatdate(timeval=timeval,
                      localtime=False,
                      usegmt=True)


def datetime_to_iso_date(the_datetime):
    """
    >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '2013-12-26T10:11:12Z'

    >>> datetime_to_iso_date( datetime.datetime(2013, 12, 26, 10, 11, 12))
    '2013-12-26T10:11:12Z'
    """

    return the_datetime.strftime(ISO_DT)


def datetime_to_timestamp(the_datetime):
    """
    >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '20131226101112'
    """

    return the_datetime.strftime(TIMESTAMP_14)


def timestamp_now():
    """
    >>> len(timestamp_now())
    14
    """
    return datetime_to_timestamp(datetime.datetime.utcnow())


def timestamp20_now():
    """
    Create 20-digit timestamp, useful for timestamping temp files

    >>> n = timestamp20_now()
    >>> timestamp20_now() >= n
    True

    >>> len(n)
    20
    """
    now = datetime.datetime.utcnow()
    return now.strftime('%Y%m%d%H%M%S%f')


def iso_date_to_timestamp(string):
    """
    >>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
    '20131226101112'

    >>> iso_date_to_timestamp('2013-12-26T10:11:12')
    '20131226101112'
    """

    return datetime_to_timestamp(iso_date_to_datetime(string))


def timestamp_to_iso_date(string):
    """
    >>> timestamp_to_iso_date('20131226101112')
    '2013-12-26T10:11:12Z'

    >>> timestamp_to_iso_date('20131226101112')
    '2013-12-26T10:11:12Z'
    """

    return datetime_to_iso_date(timestamp_to_datetime(string))


def http_date_to_timestamp(string):
    """
    >>> http_date_to_timestamp('Thu, 26 Dec 2013 09:50:00 GMT')
    '20131226095000'

    >>> http_date_to_timestamp('Sun, 26 Jan 2014 20:08:04 GMT')
    '20140126200804'
    """
    return datetime_to_timestamp(http_date_to_datetime(string))


# pad to certain length (default 6)
def pad_timestamp(string, pad_str=PAD_6_UP):
    """
    >>> pad_timestamp('20')
    '209912'

    >>> pad_timestamp('2014')
    '201412'

    >>> pad_timestamp('20141011')
    '20141011'

    >>> pad_timestamp('201410110010')
    '201410110010'
    """

    str_len = len(string)
    pad_len = len(pad_str)

    if str_len < pad_len:
        string = string + pad_str[str_len:]

    return string


def timestamp_to_datetime(string):
    """
    # >14-digit -- rest ignored
    >>> timestamp_to_datetime('2014122609501011')
    datetime.datetime(2014, 12, 26, 9, 50, 10)

    # 14-digit
    >>> timestamp_to_datetime('20141226095010')
    datetime.datetime(2014, 12, 26, 9, 50, 10)

    # 13-digit padding
    >>> timestamp_to_datetime('2014122609501')
    datetime.datetime(2014, 12, 26, 9, 50, 59)

    # 12-digit padding
    >>> timestamp_to_datetime('201412260950')
    datetime.datetime(2014, 12, 26, 9, 50, 59)

    # 11-digit padding
    >>> timestamp_to_datetime('20141226095')
    datetime.datetime(2014, 12, 26, 9, 59, 59)

    # 10-digit padding
    >>> timestamp_to_datetime('2014122609')
    datetime.datetime(2014, 12, 26, 9, 59, 59)

    # 9-digit padding
    >>> timestamp_to_datetime('201412260')
    datetime.datetime(2014, 12, 26, 23, 59, 59)

    # 8-digit padding
    >>> timestamp_to_datetime('20141226')
    datetime.datetime(2014, 12, 26, 23, 59, 59)

    # 7-digit padding
    >>> timestamp_to_datetime('2014122')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 6-digit padding
    >>> timestamp_to_datetime('201410')
    datetime.datetime(2014, 10, 31, 23, 59, 59)

    # 5-digit padding
    >>> timestamp_to_datetime('20141')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 4-digit padding
    >>> timestamp_to_datetime('2014')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 3-digit padding
    >>> timestamp_to_datetime('201')
    datetime.datetime(2019, 12, 31, 23, 59, 59)

    # 2-digit padding
    >>> timestamp_to_datetime('20')
    datetime.datetime(2099, 12, 31, 23, 59, 59)

    # 1-digit padding
    >>> timestamp_to_datetime('2')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # 1-digit out-of-range padding
    >>> timestamp_to_datetime('3')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # 0-digit padding
    >>> timestamp_to_datetime('')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # bad month
    >>> timestamp_to_datetime('20131709005601')
    datetime.datetime(2013, 12, 9, 0, 56, 1)

    # all out of range except minutes
    >>> timestamp_to_datetime('40001965252477')
    datetime.datetime(2999, 12, 31, 23, 24, 59)

    # not a number!
    >>> timestamp_to_datetime('2010abc')
    datetime.datetime(2010, 12, 31, 23, 59, 59)

    """

    # pad to 6 digits
    string = pad_timestamp(string, PAD_6_UP)

    def clamp(val, min_, max_):
        try:
            val = int(val)
            val = max(min_, min(val, max_))
            return val
        except:
            return max_

    def extract(string, start, end, min_, max_):
        if len(string) >= end:
            return clamp(string[start:end], min_, max_)
        else:
            return max_

    # now parse, clamp to boundary
    year = extract(string, 0, 4, 1900, 2999)
    month = extract(string, 4, 6, 1, 12)
    day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
    hour = extract(string, 8, 10, 0, 23)
    minute = extract(string, 10, 12, 0, 59)
    second = extract(string, 12, 14, 0, 59)

    return datetime.datetime(year=year,
                             month=month,
                             day=day,
                             hour=hour,
                             minute=minute,
                             second=second)

    #return time.strptime(pad_timestamp(string), TIMESTAMP_14)


def timestamp_to_sec(string):
    """
    >>> timestamp_to_sec('20131226095010')
    1388051410

    # rounds to end of 2014
    >>> timestamp_to_sec('2014')
    1420070399
    """

    return calendar.timegm(timestamp_to_datetime(string).utctimetuple())


def sec_to_timestamp(secs):
    """
    >>> sec_to_timestamp(1388051410)
    '20131226095010'

    >>> sec_to_timestamp(1420070399)
    '20141231235959'
    """

    return datetime_to_timestamp(datetime.datetime.utcfromtimestamp(secs))


def timestamp_to_http_date(string):
    """
    >>> timestamp_to_http_date('20131226095000')
    'Thu, 26 Dec 2013 09:50:00 GMT'

    >>> timestamp_to_http_date('20140126200804')
    'Sun, 26 Jan 2014 20:08:04 GMT'
    """
    return datetime_to_http_date(timestamp_to_datetime(string))


if __name__ == "__main__":
    import doctest
    doctest.testmod()
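A quick worked example of the padding rule above, extracted standalone: a partial timestamp is padded *upward* with PAD_6_UP, then each date field is clamped to its calendar range by timestamp_to_datetime.

PAD_6_UP = '299912'

def pad_timestamp(string, pad_str=PAD_6_UP):
    # append the tail of the pad string past the given prefix
    if len(string) < len(pad_str):
        string = string + pad_str[len(string):]
    return string

assert pad_timestamp('2014') == '201412'   # year only -> December 2014
assert pad_timestamp('20') == '209912'     # century prefix -> Dec 2099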
@ -1,8 +1,8 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from pywb.utils.canonicalize import canonicalize
from pywb.utils.loaders import extract_post_query, append_post_query

from pywb.warc.archiveiterator import ArchiveIterator
from warcio.timeutils import iso_date_to_timestamp
from warcio.archiveiterator import ArchiveIterator

import hashlib
import base64
@ -71,7 +71,7 @@ class ArchiveIndexEntryMixin(object):
        self['urlkey'] = canonicalize(url, surt_ordered)
        other['urlkey'] = self['urlkey']

        referer = other.record.status_headers.get_header('referer')
        referer = other.record.http_headers.get_header('referer')
        if referer:
            self['_referer'] = referer

@ -141,7 +141,7 @@ class DefaultRecordParser(object):
        for record in raw_iter:
            entry = None

            if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
            if not include_all and not minimal and (record.http_headers.get_statuscode() == '-'):
                continue

            if record.rec_type == 'arc_header':
@ -175,13 +175,13 @@ class DefaultRecordParser(object):
                compute_digest = True

            elif not minimal and record.rec_type == 'request' and append_post:
                method = record.status_headers.protocol
                method = record.http_headers.protocol
                len_ = record.status_headers.get_header('Content-Length')
                len_ = record.http_headers.get_header('Content-Length')

                post_query = extract_post_query(method,
                                                entry.get('_content_type'),
                                                len_,
                                                record.stream)
                                                record.raw_stream)

                entry['_post_query'] = post_query

@ -236,7 +236,7 @@ class DefaultRecordParser(object):
        if record.rec_type == 'warcinfo':
            entry['url'] = record.rec_headers.get_header('WARC-Filename')
            entry['urlkey'] = entry['url']
            entry['_warcinfo'] = record.stream.read(record.length)
            entry['_warcinfo'] = record.raw_stream.read(record.length)
            return entry

        entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')
@ -252,13 +252,13 @@ class DefaultRecordParser(object):
            entry['mime'] = '-'
        else:
            def_mime = '-' if record.rec_type == 'request' else 'unk'
            entry.extract_mime(record.status_headers.
            entry.extract_mime(record.http_headers.
                               get_header('Content-Type'),
                               def_mime)

        # status -- only for response records (by convention):
        if record.rec_type == 'response' and not self.options.get('minimal'):
            entry.extract_status(record.status_headers)
            entry.extract_status(record.http_headers)
        else:
            entry['status'] = '-'

@ -304,7 +304,7 @@ class DefaultRecordParser(object):
        entry.extract_mime(record.rec_headers.get_header('content-type'))

        # status
        entry.extract_status(record.status_headers)

        # digest
        entry['digest'] = '-'
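The hunks above are the mechanical rename described in the commit message; as a sketch, consumer code changes like this (the helper function and record object are illustrative):

def extract_status_and_payload(record):
    # formerly record.status_headers and record.stream (pywb names)
    status = record.http_headers.get_statuscode()
    payload = record.raw_stream.read()
    return status, payload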
@ -1,233 +0,0 @@
from pywb.utils.bufferedreaders import DecompressingBufferedReader

from pywb.warc.recordloader import ArcWarcRecordLoader

import six
import sys


# ============================================================================
BUFF_SIZE = 16384


class ArchiveIterator(six.Iterator):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.

    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
    """

    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
    """

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False, arc2warc=False, block_size=BUFF_SIZE):

        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.mixed_arc_warc = arc2warc

        self.member_info = None
        self.no_record_parse = no_record_parse

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        self._raise_invalid_gzip = False
        self._is_empty = False
        self._is_first = True
        self.last_record = None

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            if not self._is_first:
                self._finish_record()

            self._is_first = False

            try:
                self.last_record = self._next_record(self.next_line)
                if self._raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                return self.last_record

            except EOFError:
                self._is_empty = True

    def _finish_record(self):
        if self.last_record:
            self.read_to_end(self.last_record)

        if self.reader.decompressor:
            # if another gzip member, continue
            if self.reader.read_next_member():
                return

            # if empty record, then we're done
            elif self._is_empty:
                raise StopIteration()

            # otherwise, probably a gzip
            # containing multiple non-chunked records
            # raise this as an error
            else:
                self._raise_invalid_gzip = True

        # non-gzip, so we're done
        elif self._is_empty:
            raise StopIteration()

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be subtracted from
        the record length for uncompressed

        if first line read is not blank, likely error in WARC/ARC,
        display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            stripped = line.rstrip()

            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))

                first_line = False
                continue

            return line, empty_size

    def read_to_end(self, record, payload_callback=None):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(BUFF_SIZE)
            if not b:
                break
            num += len(b)
            if payload_callback:
                payload_callback(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        if not self.mixed_arc_warc:
            self.known_format = record.format

        return record
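The module above moves to warcio in this commit; a minimal usage sketch against the new import path (the file name is illustrative):

from warcio.archiveiterator import ArchiveIterator

with open('example.warc.gz', 'rb') as fh:
    # iterate all WARC/ARC records, printing type and target URI
    for record in ArchiveIterator(fh):
        print(record.rec_type,
              record.rec_headers.get_header('WARC-Target-URI'))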
@ -1,6 +1,6 @@
from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.warc.recordloader import ArcWarcRecordLoader


#=================================================================
@ -1,298 +0,0 @@
import collections

from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException

from pywb.utils.limitreader import LimitReader
from pywb.utils.loaders import to_native_str

#from pywb.utils.wbexception import WbException
from pywb.utils.timeutils import timestamp_to_iso_date

from six.moves import zip


#=================================================================
#ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
#                                       'format, rec_type, rec_headers, ' +
#                                       'stream, status_headers, ' +
#                                       'content_type, length')

#=================================================================
class ArcWarcRecord(object):
    def __init__(self, *args):
        (self.format, self.rec_type, self.rec_headers, self.stream,
         self.status_headers, self.content_type, self.length) = args


#=================================================================
class ArchiveLoadFailed(Exception):
    def __init__(self, reason, filename=''):
        if filename:
            msg = filename + ': ' + str(reason)
        else:
            msg = str(reason)

        super(ArchiveLoadFailed, self).__init__(msg)
        self.msg = msg


#=================================================================
class ArcWarcRecordLoader(object):
    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
                  'OPTIONS', 'CONNECT', 'PATCH']

    NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)

        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)

    def parse_record_stream(self, stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to _detect_type_load_headers()
        to facilitate parsing.
        """
        (the_format, rec_headers) = (self.
                                     _detect_type_load_headers(stream,
                                                               statusline,
                                                               known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition
        if is_err:
            length = 0
        # or status and headers are completely empty (blank lines found)
        elif not rec_headers:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            status_headers = None  #StatusAndHeaders('', [])

        # if empty record (error or otherwise) set status to 204
        elif length == 0:
            if is_err:
                msg = '204 Possible Error'
            else:
                msg = '204 No Content'

            status_headers = StatusAndHeaders(msg, [])

        # response record or non-empty revisit: parse HTTP status and headers!
        elif (rec_type in ('response', 'revisit')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif ((rec_type == 'request')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_req_parser.parse(stream)

        # everything else: create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            status_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type,
                             rec_headers, stream, status_headers,
                             content_type, length)

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.
        """

        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                statusline = se.statusline
                pass

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))


#=================================================================
class ARCHeadersParser(object):
    # ARC 1.0 headers
    ARC_HEADERS = ["uri", "ip-address", "archive-date",
                   "content-type", "length"]

    def __init__(self):
        self.headernames = self.get_header_names()

    def get_rec_type(self):
        return 'arc'

    def parse(self, stream, headerline=None):
        total_read = 0

        def readline():
            return to_native_str(stream.readline())

        # if headerline passed in, use that
        if headerline is None:
            headerline = readline()
        else:
            headerline = to_native_str(headerline)

        header_len = len(headerline)

        if header_len == 0:
            raise EOFError()

        headerline = headerline.rstrip()

        headernames = self.headernames

        # if arc header, consume next two lines
        if headerline.startswith('filedesc://'):
            version = readline()  # skip version
            spec = readline()  # skip header spec, use preset one
            total_read += len(version)
            total_read += len(spec)

        parts = headerline.split(' ')

        if len(parts) != len(headernames):
            msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
            msg = msg.format(headernames, parts)
            raise StatusAndHeadersParserException(msg, parts)

        protocol, headers = self._get_protocol_and_headers(headerline, parts)

        return StatusAndHeaders(statusline='',
                                headers=headers,
                                protocol='WARC/1.0',
                                total_len=total_read)

    @classmethod
    def get_header_names(cls):
        return cls.ARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        headers = []

        for name, value in zip(self.headernames, parts):
            headers.append((name, value))

        return ('ARC/1.0', headers)


#=================================================================
class ARC2WARCHeadersParser(ARCHeadersParser):
    # Headers for converting ARC -> WARC Header
    ARC_TO_WARC_HEADERS = ["WARC-Target-URI",
                           "WARC-IP-Address",
                           "WARC-Date",
                           "Content-Type",
                           "Content-Length"]

    def get_rec_type(self):
        return 'arc2warc'

    @classmethod
    def get_header_names(cls):
        return cls.ARC_TO_WARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        headers = []

        for name, value in zip(self.headernames, parts):
            if name == 'WARC-Date':
                value = timestamp_to_iso_date(value)

            headers.append((name, value))

        if headerline.startswith('filedesc://'):
            rec_type = 'arc_header'
        else:
            rec_type = 'response'

        headers.append(('WARC-Type', rec_type))
        headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id()))

        return ('WARC/1.0', headers)
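A minimal sketch of driving the record loader directly over a raw stream, using the warcio import path this commit adopts (the file name is illustrative):

from warcio.recordloader import ArcWarcRecordLoader

with open('example.warc', 'rb') as fh:
    # parse a single record starting at the current stream position
    record = ArcWarcRecordLoader().parse_record_stream(fh)
    print(record.rec_type, record.length)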
@ -1,6 +1,7 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from warcio.recordloader import ArchiveLoadFailed
from warcio.timeutils import iso_date_to_timestamp

from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.wbexception import NotFoundException

import six
@ -27,20 +28,20 @@ class ResolvingLoader(object):
        elif headers_record != payload_record:
            # close remainder of stream as this record only used for
            # (already parsed) headers
            headers_record.stream.close()
            headers_record.raw_stream.close()

            # special case: check if headers record is actually empty
            # (eg empty revisit), then use headers from revisit
            if not headers_record.status_headers.headers:
            if not headers_record.http_headers.headers:
                headers_record = payload_record

        if not headers_record or not payload_record:
            raise ArchiveLoadFailed('Could not load ' + str(cdx))

        # ensure status line is valid from here
        headers_record.status_headers.validate_statusline('204 No Content')
        headers_record.http_headers.validate_statusline('204 No Content')

        return (headers_record.status_headers, payload_record.stream)
        return (headers_record.http_headers, payload_record.raw_stream)

    def load_headers_and_payload(self, cdx, failed_files, cdx_loader):
        """
@ -107,7 +108,7 @@ class ResolvingLoader(object):
        # optimization: if same file already failed this request,
        # don't try again
        if failed_files is not None and filename in failed_files:
            raise ArchiveLoadFailed('Skipping Already Failed', filename)
            raise ArchiveLoadFailed('Skipping Already Failed: ' + filename)

        any_found = False
        last_exc = None
@ -144,7 +145,7 @@ class ResolvingLoader(object):
            msg = 'Archive File Not Found'

            #raise ArchiveLoadFailed(msg, filename), None, last_traceback
            six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(msg, filename), last_traceback)
            six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(filename + ': ' + msg), last_traceback)

    def _load_different_url_payload(self, cdx, headers_record,
                                    failed_files, cdx_loader):

@ -292,13 +292,13 @@ import sys
import pprint
import six

from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from warcio.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper
from pywb.cdx.cdxobject import CDXObject

import pywb.utils.statusandheaders
import warcio.statusandheaders

from pywb import get_test_dir

@ -331,10 +331,12 @@ def load_test_archive(test_file, offset, length):

    archive = testloader.load(path, offset, length)

    pywb.utils.statusandheaders.WRAP_WIDTH = 160
    warcio.statusandheaders.WRAP_WIDTH = 160

    pprint.pprint(((archive.format, archive.rec_type),
                   archive.rec_headers, archive.status_headers), indent=1, width=160)
                   archive.rec_headers, archive.http_headers), indent=1, width=160)

    warcio.statusandheaders.WRAP_WIDTH = 80


#==============================================================================
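The commit message notes ArchiveLoadFailed now takes a single message argument; a sketch of the adjusted call site seen in the hunk above (the filename is illustrative):

from warcio.recordloader import ArchiveLoadFailed

filename = 'example.warc.gz'  # illustrative
try:
    # formerly: ArchiveLoadFailed('Skipping Already Failed', filename)
    raise ArchiveLoadFailed('Skipping Already Failed: ' + filename)
except ArchiveLoadFailed as err:
    print(err)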
@ -1,121 +0,0 @@
|
|||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
|
||||||
from pywb.warc.warcwriter import BufferWARCWriter
|
|
||||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
|
||||||
from pywb.warc.archiveiterator import ArchiveIterator
|
|
||||||
|
|
||||||
from io import BytesIO
|
|
||||||
from collections import OrderedDict
|
|
||||||
import json
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
class FixedTestWARCWriter(BufferWARCWriter):
|
|
||||||
@classmethod
|
|
||||||
def _make_warc_id(cls, id_=None):
|
|
||||||
return '<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>'
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def _make_warc_date(cls):
|
|
||||||
return '2000-01-01T00:00:00Z'
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
class TestWarcWriter(object):
|
|
||||||
def _validate_record_content_len(self, stream):
|
|
||||||
for record in ArchiveIterator(stream, no_record_parse=True):
|
|
||||||
assert record.status_headers == None
|
|
||||||
assert int(record.rec_headers.get_header('Content-Length')) == record.length
|
|
||||||
assert record.length == len(record.stream.read())
|
|
||||||
|
|
||||||
|
|
||||||
def test_warcinfo_record(self):
|
|
||||||
simplewriter = FixedTestWARCWriter(gzip=False)
|
|
||||||
params = OrderedDict([('software', 'recorder test'),
|
|
||||||
('format', 'WARC File Format 1.0'),
|
|
||||||
('json-metadata', json.dumps({'foo': 'bar'}))])
|
|
||||||
|
|
||||||
record = simplewriter.create_warcinfo_record('testfile.warc.gz', params)
|
|
||||||
simplewriter.write_record(record)
|
|
||||||
buff = simplewriter.get_contents()
|
|
||||||
assert isinstance(buff, bytes)
|
|
||||||
|
|
||||||
buff = BytesIO(buff)
|
|
||||||
parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)
|
|
||||||
|
|
||||||
assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
|
|
||||||
assert parsed_record.rec_headers.get_header('Content-Type') == 'application/warc-fields'
|
|
||||||
assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'
|
|
||||||
|
|
||||||
buff = parsed_record.stream.read().decode('utf-8')
|
|
||||||
|
|
||||||
length = parsed_record.rec_headers.get_header('Content-Length')
|
|
||||||
|
|
||||||
assert len(buff) == int(length)
|
|
||||||
|
|
||||||
assert 'json-metadata: {"foo": "bar"}\r\n' in buff
|
|
||||||
assert 'format: WARC File Format 1.0\r\n' in buff
|
|
||||||
|
|
||||||
warcinfo_record = '\
|
|
||||||
WARC/1.0\r\n\
|
|
||||||
WARC-Type: warcinfo\r\n\
|
|
||||||
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
|
|
||||||
WARC-Filename: testfile.warc.gz\r\n\
|
|
||||||
WARC-Date: 2000-01-01T00:00:00Z\r\n\
|
|
||||||
Content-Type: application/warc-fields\r\n\
|
|
||||||
Content-Length: 86\r\n\
|
|
||||||
\r\n\
|
|
||||||
software: recorder test\r\n\
|
|
||||||
format: WARC File Format 1.0\r\n\
|
|
||||||
json-metadata: {"foo": "bar"}\r\n\
|
|
||||||
\r\n\
|
|
||||||
\r\n\
|
|
||||||
'
|
|
||||||
|
|
||||||
assert simplewriter.get_contents().decode('utf-8') == warcinfo_record
|
|
||||||
|
|
||||||
def test_generate_response(self):
|
|
||||||
headers_list = [('Content-Type', 'text/plain; charset="UTF-8"'),
|
|
||||||
('Custom-Header', 'somevalue')
|
|
||||||
]
|
|
||||||
|
|
||||||
payload = b'some\ntext'
|
|
||||||
|
|
||||||
status_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')
|
|
||||||
|
|
||||||
|
|
||||||
writer = FixedTestWARCWriter(gzip=False)
|
|
||||||
|
|
||||||
record = writer.create_warc_record('http://example.com/', 'response',
|
|
||||||
payload=BytesIO(payload),
|
|
||||||
length=len(payload),
|
|
||||||
status_headers=status_headers)
|
|
||||||
|
|
||||||
|
|
||||||
writer.write_record(record)
|
|
||||||
|
|
||||||
buff = writer.get_contents()
|
|
||||||
|
|
||||||
self._validate_record_content_len(BytesIO(buff))
|
|
||||||
|
|
||||||
warc_record = '\
|
|
||||||
WARC/1.0\r\n\
|
|
||||||
WARC-Type: response\r\n\
|
|
||||||
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
|
|
||||||
WARC-Target-URI: http://example.com/\r\n\
|
|
||||||
WARC-Date: 2000-01-01T00:00:00Z\r\n\
|
|
||||||
WARC-Block-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
|
|
||||||
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
|
|
||||||
Content-Type: application/http; msgtype=response\r\n\
|
|
||||||
Content-Length: 97\r\n\
|
|
||||||
\r\n\
|
|
||||||
HTTP/1.0 200 OK\r\n\
|
|
||||||
Content-Type: text/plain; charset="UTF-8"\r\n\
|
|
||||||
Custom-Header: somevalue\r\n\
|
|
||||||
\r\n\
|
|
||||||
some\n\
|
|
||||||
text\
|
|
||||||
\r\n\
|
|
||||||
\r\n\
|
|
||||||
'
|
|
||||||
assert buff.decode('utf-8') == warc_record
|
|
||||||
|
|
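
This test module is deleted here because the writer it exercises moves into warcio. A hedged sketch of how the warcinfo portion reads against warcio's own API (BufferWARCWriter and create_warcinfo_record live in warcio.warcwriter; exact behavior may vary by warcio version):

    import json
    from collections import OrderedDict

    from warcio.warcwriter import BufferWARCWriter

    writer = BufferWARCWriter(gzip=False)
    params = OrderedDict([('software', 'recorder test'),
                          ('format', 'WARC File Format 1.0'),
                          ('json-metadata', json.dumps({'foo': 'bar'}))])

    # Build and serialize a warcinfo record, then check the serialized fields.
    record = writer.create_warcinfo_record('testfile.warc.gz', params)
    writer.write_record(record)

    contents = writer.get_contents()
    assert b'WARC-Filename: testfile.warc.gz' in contents
    assert b'json-metadata: {"foo": "bar"}' in contents
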
@ -1,304 +0,0 @@
-import tempfile
-import uuid
-import base64
-import hashlib
-import datetime
-import zlib
-import six
-
-from socket import gethostname
-from io import BytesIO
-
-from pywb.utils.loaders import to_native_str
-from pywb.utils.timeutils import datetime_to_iso_date
-
-from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders
-
-from pywb.warc.recordloader import ArcWarcRecord
-from pywb.warc.recordloader import ArcWarcRecordLoader
-
-
-# ============================================================================
-class BaseWARCWriter(object):
-    BUFF_SIZE = 16384
-
-    WARC_RECORDS = {'warcinfo': 'application/warc-fields',
-                    'response': 'application/http; msgtype=response',
-                    'revisit': 'application/http; msgtype=response',
-                    'request': 'application/http; msgtype=request',
-                    'metadata': 'application/warc-fields',
-                   }
-
-    REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'
-
-    WARC_VERSION = 'WARC/1.0'
-
-    def __init__(self, gzip=True, header_filter=None, *args, **kwargs):
-        self.gzip = gzip
-        self.header_filter = header_filter
-        self.hostname = gethostname()
-
-        self.parser = StatusAndHeadersParser([], verify=False)
-        self.warc_version = kwargs.get('warc_version', self.WARC_VERSION)
-
-    @classmethod
-    def _iter_stream(cls, stream):
-        while True:
-            buf = stream.read(cls.BUFF_SIZE)
-            if not buf:
-                return
-
-            yield buf
-
-    def ensure_digest(self, record):
-        block_digest = record.rec_headers.get_header('WARC-Block-Digest')
-        payload_digest = record.rec_headers.get_header('WARC-Payload-Digest')
-        if block_digest and payload_digest:
-            return
-
-        block_digester = self._create_digester()
-        payload_digester = self._create_digester()
-
-        pos = record.stream.tell()
-
-        if record.status_headers and hasattr(record.status_headers, 'headers_buff'):
-            block_digester.update(record.status_headers.headers_buff)
-
-        for buf in self._iter_stream(record.stream):
-            block_digester.update(buf)
-            payload_digester.update(buf)
-
-        record.stream.seek(pos)
-        record.rec_headers.add_header('WARC-Block-Digest', str(block_digester))
-        record.rec_headers.add_header('WARC-Payload-Digest', str(payload_digester))
-
-    def _create_digester(self):
-        return Digester('sha1')
-
-    def _set_header_buff(self, record):
-        exclude_list = None
-        if self.header_filter:
-            exclude_list = self.header_filter(record)
-        buff = record.status_headers.to_bytes(exclude_list)
-        record.status_headers.headers_buff = buff
-
-    def create_warcinfo_record(self, filename, info):
-        warc_headers = StatusAndHeaders(self.warc_version, [])
-        warc_headers.add_header('WARC-Type', 'warcinfo')
-        warc_headers.add_header('WARC-Record-ID', self._make_warc_id())
-        if filename:
-            warc_headers.add_header('WARC-Filename', filename)
-        warc_headers.add_header('WARC-Date', self._make_warc_date())
-
-        warcinfo = BytesIO()
-        for n, v in six.iteritems(info):
-            self._header(warcinfo, n, v)
-
-        warcinfo.seek(0)
-
-        record = ArcWarcRecord('warc', 'warcinfo', warc_headers, warcinfo,
-                               None, '', len(warcinfo.getvalue()))
-
-        return record
-
-    def copy_warc_record(self, payload):
-        len_ = payload.tell()
-        payload.seek(0)
-
-        warc_headers = self.parser.parse(payload)
-
-        record_type = warc_headers.get_header('WARC-Type', 'response')
-
-        return self._fill_record(record_type, warc_headers, None, payload, '', len_)
-
-    def create_warc_record(self, uri, record_type, payload,
-                           length=None,
-                           warc_content_type='',
-                           warc_headers_dict={},
-                           status_headers=None):
-
-        if length is None:
-            length = payload.tell()
-            payload.seek(0)
-
-        warc_headers = StatusAndHeaders(self.warc_version, list(warc_headers_dict.items()))
-        warc_headers.replace_header('WARC-Type', record_type)
-        if not warc_headers.get_header('WARC-Record-ID'):
-            warc_headers.add_header('WARC-Record-ID', self._make_warc_id())
-
-        if uri:
-            warc_headers.replace_header('WARC-Target-URI', uri)
-
-        if not warc_headers.get_header('WARC-Date'):
-            warc_headers.add_header('WARC-Date', self._make_warc_date())
-
-        return self._fill_record(record_type, warc_headers, status_headers,
-                                 payload, warc_content_type, length)
-
-    def _fill_record(self, record_type, warc_headers, status_headers, payload, warc_content_type, len_):
-        has_http_headers = (record_type in ('request', 'response', 'revisit'))
-
-        if not status_headers and has_http_headers:
-            status_headers = self.parser.parse(payload)
-
-        record = ArcWarcRecord('warc', record_type, warc_headers, payload,
-                               status_headers, warc_content_type, len_)
-
-        self.ensure_digest(record)
-
-        if has_http_headers:
-            self._set_header_buff(record)
-
-        return record
-
-    def _write_warc_record(self, out, record, adjust_cl=True):
-        if self.gzip:
-            out = GzippingWrapper(out)
-
-        # compute Content-Type
-        content_type = record.rec_headers.get_header('Content-Type')
-
-        if not content_type:
-            content_type = record.content_type
-
-            if not content_type:
-                content_type = self.WARC_RECORDS.get(record.rec_headers.get_header('WARC-Type'))
-
-            if content_type:
-                record.rec_headers.replace_header('Content-Type', content_type)
-                #self._header(out, 'Content-Type', content_type)
-
-        if record.rec_headers.get_header('WARC-Type') == 'revisit':
-            http_headers_only = True
-        else:
-            http_headers_only = False
-
-        # compute Content-Length
-        if record.length or record.status_headers:
-            actual_len = 0
-            if record.status_headers:
-                actual_len = len(record.status_headers.headers_buff)
-
-            if not http_headers_only:
-                if adjust_cl:
-                    diff = record.stream.tell() - actual_len
-                else:
-                    diff = 0
-
-                actual_len = record.length - diff
-
-            record.rec_headers.replace_header('Content-Length', str(actual_len))
-            #self._header(out, 'Content-Length', str(actual_len))
-
-            # add empty line
-            #self._line(out, b'')
-
-            # write record headers
-            out.write(record.rec_headers.to_bytes())
-
-            # write headers buffer, if any
-            if record.status_headers:
-                out.write(record.status_headers.headers_buff)
-
-            if not http_headers_only:
-                for buf in self._iter_stream(record.stream):
-                    out.write(buf)
-                #out.write(record.stream.read())
-
-                # add two lines
-                self._line(out, b'\r\n')
-            else:
-                # add three lines (1 for end of header, 2 for end of record)
-                self._line(out, b'Content-Length: 0\r\n\r\n')
-
-        out.flush()
-
-    def _header(self, out, name, value):
-        if not value:
-            return
-
-        self._line(out, (name + ': ' + str(value)).encode('latin-1'))
-
-    def _line(self, out, line):
-        out.write(line + b'\r\n')
-
-    @classmethod
-    def _make_warc_id(cls, id_=None):
-        if not id_:
-            id_ = uuid.uuid1()
-        return '<urn:uuid:{0}>'.format(id_)
-
-    @classmethod
-    def _make_warc_date(cls):
-        return datetime_to_iso_date(datetime.datetime.utcnow())
-
-
-# ============================================================================
-class GzippingWrapper(object):
-    def __init__(self, out):
-        self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-        self.out = out
-
-    def write(self, buff):
-        #if isinstance(buff, str):
-        #    buff = buff.encode('utf-8')
-        buff = self.compressor.compress(buff)
-        self.out.write(buff)
-
-    def flush(self):
-        buff = self.compressor.flush()
-        self.out.write(buff)
-        self.out.flush()
-
-
-# ============================================================================
-class Digester(object):
-    def __init__(self, type_='sha1'):
-        self.type_ = type_
-        self.digester = hashlib.new(type_)
-
-    def update(self, buff):
-        self.digester.update(buff)
-
-    def __str__(self):
-        return self.type_ + ':' + to_native_str(base64.b32encode(self.digester.digest()))
-
-
-# ============================================================================
-class BufferWARCWriter(BaseWARCWriter):
-    def __init__(self, *args, **kwargs):
-        super(BufferWARCWriter, self).__init__(*args, **kwargs)
-        self.out = self._create_buffer()
-
-    def _create_buffer(self):
-        return tempfile.SpooledTemporaryFile(max_size=512*1024)
-
-    def write_record(self, record):
-        self._write_warc_record(self.out, record)
-
-    def get_contents(self):
-        pos = self.out.tell()
-        self.out.seek(0)
-        buff = self.out.read()
-        self.out.seek(pos)
-        return buff
-
-
-# ============================================================================
-class FileWARCWriter(BufferWARCWriter):
-    def __init__(self, *args, **kwargs):
-        file_or_buff = None
-        if len(args) > 0:
-            file_or_buff = args[0]
-        else:
-            file_or_buff = kwargs.get('file')
-
-        if isinstance(file_or_buff, str):
-            self.out = open(file_or_buff, 'rb')
-        elif hasattr(file_or_buff, 'read'):
-            self.out = file_or_buff
-        else:
-            raise Exception('file must be a readable or valid filename')
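
With the writer gone from pywb, the remaining hunks below apply the attribute renames on ArcWarcRecord: status_headers becomes http_headers, and stream becomes raw_stream. A minimal sketch of reading a record through warcio with the new names (read_record is a hypothetical helper):

    from io import BytesIO

    from warcio.recordloader import ArcWarcRecordLoader


    def read_record(buff):
        record = ArcWarcRecordLoader().parse_record_stream(BytesIO(buff))
        # record.http_headers was record.status_headers; it may be None for
        # non-HTTP records such as warcinfo.
        status = record.http_headers.get_statuscode() if record.http_headers else None
        # record.raw_stream was record.stream: the raw record body stream.
        body = record.raw_stream.read()
        return status, body
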
@ -5,7 +5,8 @@ import json
 import time
 import os
 
-from pywb.utils.timeutils import timestamp_now
+from warcio.timeutils import timestamp_now
 
 from pywb.cdx.cdxops import process_cdx
 from pywb.cdx.query import CDXQuery
 
@ -2,7 +2,7 @@ from pywb.webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoad
 from pywb.webagg.utils import MementoUtils
 from pywb.utils.wbexception import BadRequestException, WbException
 from pywb.utils.wbexception import NotFoundException
-from pywb.warc.recordloader import ArchiveLoadFailed
+from warcio.recordloader import ArchiveLoadFailed
 
 from pywb.cdx.query import CDXQuery
 from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
@ -1,6 +1,6 @@
 from pywb.utils.binsearch import iter_range
-from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
-from pywb.utils.timeutils import timestamp_now
+from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
+from warcio.timeutils import timestamp_now
 from pywb.utils.canonicalize import canonicalize
 from pywb.utils.wbexception import NotFoundException
 
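
For reference, a hedged round-trip sketch of the two warcio.timeutils helpers imported above, converting between 14-digit timestamps and RFC 1123 HTTP dates (example values assumed):

    from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date

    http_date = timestamp_to_http_date('20140126101112')
    assert http_date == 'Sun, 26 Jan 2014 10:11:12 GMT'
    assert http_date_to_timestamp(http_date) == '20140126101112'
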
@ -1,6 +1,7 @@
+from warcio.limitreader import LimitReader
+from warcio.statusandheaders import StatusAndHeadersParser
+
 from pywb.utils.loaders import extract_post_query, append_post_query
-from pywb.utils.limitreader import LimitReader
-from pywb.utils.statusandheaders import StatusAndHeadersParser
 
 from six.moves.urllib.parse import urlsplit, quote
 from six import iteritems, StringIO
@ -3,7 +3,7 @@ from pywb.utils.wbexception import NotFoundException
 from pywb.webagg.indexsource import BaseIndexSource, RemoteIndexSource
 from pywb.webagg.responseloader import LiveWebLoader
 from pywb.webagg.utils import ParamFormatter, res_template
-from pywb.utils.timeutils import timestamp_now
+from warcio.timeutils import timestamp_now
 
 
 #=============================================================================
@ -2,12 +2,13 @@ from pywb.webagg.utils import MementoUtils, StreamIter, compress_gzip_iter
 from pywb.webagg.utils import ParamFormatter
 from pywb.webagg.indexsource import RedisIndexSource
 
-from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
-from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
-from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date
+from warcio.timeutils import timestamp_to_datetime, datetime_to_timestamp
+from warcio.timeutils import iso_date_to_datetime, datetime_to_iso_date
+from warcio.timeutils import http_date_to_datetime, datetime_to_http_date
+from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
 
 from pywb.utils.wbexception import LiveResourceException, WbException
-from pywb.utils.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
 
 from pywb.warc.resolvingloader import ResolvingLoader
 
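
A hedged sketch of the datetime conversions imported above: a 14-digit timestamp parses to a UTC datetime, which serializes to a WARC-Date style ISO string (example values assumed):

    from warcio.timeutils import datetime_to_iso_date, timestamp_to_datetime

    dt = timestamp_to_datetime('20140127171251')
    assert datetime_to_iso_date(dt) == '2014-01-27T17:12:51Z'
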
@ -243,11 +244,11 @@ class WARCPathLoader(BaseLoader):
 
         status = cdx.get('status')
         if not status or status.startswith('3'):
-            status_headers = self.headers_parser.parse(payload.stream)
+            http_headers = self.headers_parser.parse(payload.raw_stream)
             self.raise_on_self_redirect(params, cdx,
-                                        status_headers.get_statuscode(),
-                                        status_headers.get_header('Location'))
-            http_headers_buff = status_headers.to_bytes()
+                                        http_headers.get_statuscode(),
+                                        http_headers.get_header('Location'))
+            http_headers_buff = http_headers.to_bytes()
         else:
             http_headers_buff = None
 
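
The hunk above parses HTTP headers from payload.raw_stream only for status-less or 3xx records, so the loader can reject a redirect that points back at the requested URL. A minimal sketch of that guard under stated assumptions (is_self_redirect is a hypothetical helper, not the pywb method):

    def is_self_redirect(request_url, statuscode, location):
        # Only redirects can self-redirect; anything else replays as-is.
        if not statuscode or not statuscode.startswith('3'):
            return False
        # Treat a Location header that resolves to the requested URL itself
        # (modulo a trailing slash) as a self-redirect.
        return bool(location) and location.rstrip('/') == request_url.rstrip('/')
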
@ -266,9 +267,9 @@ class WARCPathLoader(BaseLoader):
             warc_headers.replace_header('WARC-Date',
                                         headers.rec_headers.get_header('WARC-Date'))
 
-            headers.stream.close()
+            headers.raw_stream.close()
 
-        return (warc_headers, http_headers_buff, payload.stream)
+        return (warc_headers, http_headers_buff, payload.raw_stream)
 
     def __str__(self):
         return 'WARCPathLoader'
@ -13,10 +13,10 @@ from pywb.webagg.aggregator import DirectoryIndexSource
 from pywb.webagg.app import ResAggApp
 from pywb.webagg.utils import MementoUtils
 
-from pywb.warc.recordloader import ArcWarcRecordLoader
+from warcio.recordloader import ArcWarcRecordLoader
+from warcio.statusandheaders import StatusAndHeadersParser
+from warcio.bufferedreaders import ChunkedDataReader
 
-from pywb.utils.statusandheaders import StatusAndHeadersParser
-from pywb.utils.bufferedreaders import ChunkedDataReader
 
 from io import BytesIO
 from six.moves.urllib.parse import urlencode
 
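
A hedged usage sketch for the ChunkedDataReader import above: it transparently de-chunks an HTTP chunked transfer-encoded stream (example data assumed):

    from io import BytesIO

    from warcio.bufferedreaders import ChunkedDataReader

    chunked = b'4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n'
    assert ChunkedDataReader(BytesIO(chunked)).read() == b'Wikipedia'
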
@ -215,9 +215,9 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
 
         buff = BytesIO(resp.body)
         record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False)
-        print(record.status_headers)
-        assert record.status_headers.get_statuscode() == '302'
-        assert record.status_headers.get_header('Location') == 'https://www.iana.org/'
+        print(record.http_headers)
+        assert record.http_headers.get_statuscode() == '302'
+        assert record.http_headers.get_header('Location') == 'https://www.iana.org/'
 
     @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
     def test_agg_select_live(self):
@ -3,7 +3,7 @@ from pywb.webagg.indexsource import LiveIndexSource
 
 from pywb.webagg.aggregator import SimpleAggregator
 
-from pywb.utils.timeutils import timestamp_now
+from warcio.timeutils import timestamp_now
 
 from .testutils import key_ts_res, TEST_CDX_PATH
 
@ -8,7 +8,7 @@ from pywb.webagg.handlers import DefaultResourceHandler
 from pywb.webagg.aggregator import SimpleAggregator
 from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
 
-from pywb.warc.recordloader import ArcWarcRecordLoader
+from warcio.recordloader import ArcWarcRecordLoader
 
 from .testutils import LiveServerTests, BaseTestClass
 
@ -48,7 +48,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
 
         record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False)
         assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
-        assert record.status_headers.get_header('Date') != ''
+        assert record.http_headers.get_header('Date') != ''
 
     def test_upstream_1(self):
         resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get')
@ -58,7 +58,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
 
         record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
         assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
-        assert record.status_headers.get_header('Date') != ''
+        assert record.http_headers.get_header('Date') != ''
 
     def test_upstream_2(self):
         resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get')
@ -68,7 +68,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
 
         record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
         assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
-        assert record.status_headers.get_header('Date') != ''
+        assert record.http_headers.get_header('Date') != ''
 
 
@ -7,7 +7,9 @@ import zlib
 
 from contextlib import closing
 
-from pywb.utils.timeutils import timestamp_to_http_date
+from warcio.timeutils import timestamp_to_http_date
+from warcio.utils import BUFF_SIZE
 
 from pywb.utils.wbexception import BadRequestException
 from pywb.utils.loaders import load_yaml_config
@ -20,9 +22,6 @@ LINK_SEG_SPLIT = re.compile(';\s*')
 LINK_URL = re.compile('<(.*)>')
 LINK_PROP = re.compile('([\w]+)="([^"]+)')
 
-BUFF_SIZE = 16384
-
-
 #=============================================================================
 class MementoException(BadRequestException):
     pass
@ -15,7 +15,7 @@ from pywb.cdx.cdxobject import IDXObject, CDXException, CDXObject
 from pywb.cdx.query import CDXQuery
 
 from pywb.utils.loaders import BlockLoader, read_last_line
-from pywb.utils.bufferedreaders import gzip_decompressor
+from warcio.bufferedreaders import gzip_decompressor
 from pywb.utils.binsearch import iter_range, linearsearch, search
 
 
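
A hedged sketch for the gzip_decompressor import above: it returns a zlib decompressobj preconfigured for gzip members, which the block-index code uses to inflate compressed blocks (example data and the zero-argument call are assumptions):

    import zlib

    from warcio.bufferedreaders import gzip_decompressor

    # Compress one line the same way the writer's GzippingWrapper does.
    comp = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
    data = comp.compress(b'com,example)/ 20140127171251\n') + comp.flush()

    assert gzip_decompressor().decompress(data) == b'com,example)/ 20140127171251\n'
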
@ -5,9 +5,11 @@ import logging
 
 from datetime import datetime
 
+from warcio.statusandheaders import StatusAndHeaders
+from warcio.timeutils import datetime_to_timestamp
+
 from pywb.utils.wbexception import NotFoundException
 from pywb.utils.loaders import LocalFileLoader
-from pywb.utils.statusandheaders import StatusAndHeaders
 
 from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
 from pywb.framework.wbrequestresponse import WbResponse
@ -19,7 +21,6 @@ from pywb.warc.pathresolvers import PathResolverMapper
 from pywb.webapp.views import J2TemplateView, init_view
 from pywb.webapp.replay_views import ReplayView
 from pywb.framework.memento import MementoResponse
-from pywb.utils.timeutils import datetime_to_timestamp
 
 
 #=================================================================
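
A hedged sketch for the datetime_to_timestamp import above, which collapses a datetime into the 14-digit form used throughout the CDX code (example value assumed):

    from datetime import datetime

    from warcio.timeutils import datetime_to_timestamp

    assert datetime_to_timestamp(datetime(2014, 1, 27, 17, 12, 51)) == '20140127171251'
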
@ -1,5 +1,6 @@
-from pywb.utils.statusandheaders import StatusAndHeaders
-from pywb.utils.limitreader import LimitReader
+from warcio.statusandheaders import StatusAndHeaders
+from warcio.limitreader import LimitReader
 
 from pywb.framework.cache import create_cache
 
 from tempfile import NamedTemporaryFile, mkdtemp
@ -5,16 +5,17 @@ from io import BytesIO
 from six.moves.urllib.parse import urlsplit
 from itertools import chain
 
-from pywb.utils.statusandheaders import StatusAndHeaders
+from warcio.statusandheaders import StatusAndHeaders
+from warcio.limitreader import LimitReader
+from warcio.timeutils import timestamp_now
+from warcio.recordloader import ArchiveLoadFailed
 
 from pywb.utils.wbexception import WbException, NotFoundException
-from pywb.utils.limitreader import LimitReader
-from pywb.utils.timeutils import timestamp_now
 
 from pywb.framework.wbrequestresponse import WbResponse
 from pywb.framework.memento import MementoResponse
 
 from pywb.rewrite.rewrite_content import RewriteContent
-from pywb.warc.recordloader import ArchiveLoadFailed
 
 from pywb.webapp.views import HeadInsertView
 
@ -1,4 +1,4 @@
-from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
+from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
 from pywb.framework.wbrequestresponse import WbResponse
 from pywb.framework.memento import make_timemap, LINK_FORMAT
 
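
And a hedged sketch for timestamp_to_sec, imported above alongside timestamp_to_datetime: it maps a 14-digit timestamp to UTC epoch seconds (the epoch example is chosen for certainty):

    from warcio.timeutils import timestamp_to_sec

    # The Unix epoch itself: 1970-01-01 00:00:00 UTC -> 0 seconds.
    assert timestamp_to_sec('19700101000000') == 0
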