mirror of https://github.com/webrecorder/pywb.git
spin-off warcio!

update imports to point to warcio

warcio rename fixes:
- ArcWarcRecord.stream -> raw_stream
- ArcWarcRecord.status_headers -> http_headers
- ArchiveLoadFailed single param init

This commit is contained in: parent 4a94699a65, commit 0784e4e5aa
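The renames above affect both import paths and record attributes. The sketch below is illustrative only (not part of the diff); the old-to-new mappings are taken from the hunks that follow.

    # module moves (old pywb path -> new warcio path)
    #   pywb.utils.timeutils             -> warcio.timeutils
    #   pywb.utils.bufferedreaders       -> warcio.bufferedreaders
    #   pywb.utils.limitreader           -> warcio.limitreader
    #   pywb.utils.statusandheaders      -> warcio.statusandheaders
    #   pywb.utils.loaders.to_native_str -> warcio.utils.to_native_str
    #   pywb.warc.recordloader           -> warcio.recordloader
    #   pywb.warc.archiveiterator        -> warcio.archiveiterator
    from warcio.recordloader import ArcWarcRecordLoader

    def read_record(stream):
        # stream: an uncompressed WARC record stream (assumption for this sketch)
        record = ArcWarcRecordLoader().parse_record_stream(stream)
        headers = record.http_headers      # was: record.status_headers
        body = record.raw_stream.read()    # was: record.stream.read()
        return headers, body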
@@ -3,8 +3,9 @@ from pywb.cdx.cdxobject import TIMESTAMP, STATUSCODE, MIMETYPE, DIGEST
 from pywb.cdx.cdxobject import OFFSET, LENGTH, FILENAME

 from pywb.cdx.query import CDXQuery
-from pywb.utils.timeutils import timestamp_to_sec, pad_timestamp
-from pywb.utils.timeutils import PAD_14_DOWN, PAD_14_UP
+from warcio.timeutils import timestamp_to_sec, pad_timestamp
+from warcio.timeutils import PAD_14_DOWN, PAD_14_UP

 import bisect
@@ -14,7 +14,7 @@ com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ
 from fakeredis import FakeStrictRedis
 from mock import patch

-from pywb.utils.timeutils import timestamp_to_sec
+from warcio.timeutils import timestamp_to_sec
 from pywb.cdx.cdxsource import RedisCDXSource
 from pywb.cdx.cdxserver import CDXServer
@@ -13,7 +13,7 @@ from pywb.cdx.cdxsource import CDXSource
 from pywb.cdx.cdxobject import IDXObject, CDXException

 from pywb.utils.loaders import BlockLoader, read_last_line
-from pywb.utils.bufferedreaders import gzip_decompressor
+from warcio.bufferedreaders import gzip_decompressor
 from pywb.utils.binsearch import iter_range, linearsearch, search
@@ -1,6 +1,6 @@
 from pywb.utils.wbexception import BadRequestException
-from pywb.utils.timeutils import http_date_to_timestamp
-from pywb.utils.timeutils import timestamp_to_http_date
+from warcio.timeutils import http_date_to_timestamp
+from warcio.timeutils import timestamp_to_http_date

 from pywb.framework.wbrequestresponse import WbRequest, WbResponse
 from pywb.rewrite.wburl import WbUrl
@@ -4,6 +4,7 @@ from pywb.framework.wbrequestresponse import WbResponse, WbRequest
 from pywb.framework.archivalrouter import ArchivalRouter

 from six.moves.urllib.parse import urlsplit
+from six import iteritems
 import base64

 import socket
@@ -15,8 +16,8 @@ from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter, UrlRewriter
 from pywb.rewrite.rewrite_content import RewriteContent
 from pywb.utils.wbexception import BadRequestException

-from pywb.utils.bufferedreaders import BufferedReader
-from pywb.utils.loaders import to_native_str
+from warcio.bufferedreaders import BufferedReader
+from warcio.utils import to_native_str

 from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver
@@ -250,7 +251,8 @@ class ProxyRouter(object):

         # add extra headers for replay responses
         if wbrequest.wb_url and wbrequest.wb_url.is_replay():
-            response.status_headers.replace_headers(self.extra_headers)
+            for name, value in iteritems(self.extra_headers):
+                response.status_headers.replace_header(name, value)

         # check for content-length
         res = response.status_headers.get_header('content-length')
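This hunk swaps pywb's dict-based replace_headers() helper for a loop over replace_header(), which warcio's StatusAndHeaders does provide. A self-contained sketch of the same pattern (header values invented for illustration):

    from warcio.statusandheaders import StatusAndHeaders

    sah = StatusAndHeaders('200 OK', [('Content-Type', 'text/html')],
                           protocol='HTTP/1.1')
    extra_headers = {'X-Archive-Orig-Date': 'Mon, 27 Jan 2014 17:12:51 GMT'}
    for name, value in extra_headers.items():
        sah.replace_header(name, value)   # replaces in place, or appends if absent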
@@ -1,7 +1,6 @@
 from pywb.framework.wbrequestresponse import WbResponse
 from pywb.utils.loaders import extract_client_cookie
 from pywb.utils.wbexception import WbException
-from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.rewrite.wburl import WbUrl

 from pywb.framework.cache import create_cache
@@ -10,7 +9,8 @@ from pywb.framework.basehandlers import WbUrlHandler
 from six.moves.urllib.parse import parse_qs, urlsplit
 import six

-from pywb.utils.loaders import to_native_str
+from warcio.statusandheaders import StatusAndHeaders
+from warcio.utils import to_native_str

 import base64
 import os
@@ -61,7 +61,7 @@

 from pywb.rewrite.wburl import WbUrl
 from pywb.rewrite.url_rewriter import UrlRewriter
-from pywb.utils.statusandheaders import StatusAndHeaders
+from warcio.statusandheaders import StatusAndHeaders

 from pywb.framework.wbrequestresponse import WbRequest, WbResponse
@@ -1,4 +1,4 @@
-from pywb.utils.statusandheaders import StatusAndHeaders
+from warcio.statusandheaders import StatusAndHeaders
 from pywb.utils.loaders import extract_post_query, append_post_query

 from io import BytesIO
@@ -1,7 +1,10 @@
 from pywb.utils.wbexception import WbException, NotFoundException
-from pywb.utils.loaders import load_yaml_config, to_native_str
+from pywb.utils.loaders import load_yaml_config
+from warcio.utils import to_native_str

-from pywb.framework.wbrequestresponse import WbResponse, StatusAndHeaders
+from pywb.framework.wbrequestresponse import WbResponse
+from warcio.statusandheaders import StatusAndHeaders


 import os
@@ -13,7 +13,7 @@ from pkg_resources import resource_string
 from argparse import ArgumentParser, RawTextHelpFormatter

 from pywb.utils.loaders import load_yaml_config
-from pywb.utils.timeutils import timestamp20_now
+from warcio.timeutils import timestamp20_now

 from pywb import DEFAULT_CONFIG
@@ -1,4 +1,4 @@
-from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_iso_date
+from warcio.timeutils import timestamp_to_datetime, datetime_to_iso_date
 import re

@@ -7,12 +7,11 @@ import traceback

 import portalocker

-from pywb.utils.timeutils import timestamp20_now
+from warcio.timeutils import timestamp20_now
+from warcio.warcwriter import BaseWARCWriter

 from pywb.webagg.utils import res_template

-from pywb.warc.warcwriter import BaseWARCWriter


 # ============================================================================
 class MultiFileWARCWriter(BaseWARCWriter):
@@ -22,6 +21,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
                  max_idle_secs=1800, *args, **kwargs):
         super(MultiFileWARCWriter, self).__init__(*args, **kwargs)

+        self.header_filter = kwargs.get('header_filter')
+
         if not filename_template:
             dir_template, filename_template = os.path.split(dir_template)
             dir_template += os.path.sep
@@ -41,27 +42,8 @@ class MultiFileWARCWriter(BaseWARCWriter):

         self.fh_cache = {}

-    def write_req_resp(self, req, resp, params):
-        url = resp.rec_headers.get_header('WARC-Target-URI')
-        dt = resp.rec_headers.get_header('WARC-Date')
-
-        #req.rec_headers['Content-Type'] = req.content_type
-        req.rec_headers.replace_header('WARC-Target-URI', url)
-        req.rec_headers.replace_header('WARC-Date', dt)
-
-        resp_id = resp.rec_headers.get_header('WARC-Record-ID')
-        if resp_id:
-            req.rec_headers.add_header('WARC-Concurrent-To', resp_id)
-
-        resp = self._check_revisit(resp, params)
-        if not resp:
-            print('Skipping due to dedup')
-            return
-
-        self._do_write_req_resp(req, resp, params)
-
     def _check_revisit(self, record, params):
-        if not self.dedup_index:
+        if not self.dedup_index or record.rec_type != 'response':
             return record

         try:
@@ -77,14 +59,18 @@ class MultiFileWARCWriter(BaseWARCWriter):
             return None

         if isinstance(result, tuple) and result[0] == 'revisit':
-            record.rec_headers.replace_header('WARC-Type', 'revisit')
-            record.rec_headers.add_header('WARC-Profile', self.REVISIT_PROFILE)
-
-            record.rec_headers.add_header('WARC-Refers-To-Target-URI', result[1])
-            record.rec_headers.add_header('WARC-Refers-To-Date', result[2])
+            record = self.create_revisit_record(url, digest, result[1], result[2],
+                                                http_headers=record.http_headers)

         return record

+    def _set_header_buff(self, record):
+        exclude_list = None
+        if self.header_filter:
+            exclude_list = self.header_filter(record)
+        buff = record.http_headers.to_bytes(exclude_list)
+        record.http_headers.headers_buff = buff
+
     def get_new_filename(self, dir_, params):
         timestamp = timestamp20_now()

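_check_revisit now delegates to warcio's create_revisit_record instead of rewriting the headers by hand. A hedged, self-contained sketch of that call (WARCWriter is assumed here since the diff itself only shows BaseWARCWriter, and all values are invented):

    from io import BytesIO
    from warcio.warcwriter import WARCWriter

    writer = WARCWriter(BytesIO(), gzip=False)
    revisit = writer.create_revisit_record(
        'http://example.com/',                     # uri of the new capture
        'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A',   # payload digest (made up)
        'http://example.com/',                     # refers-to uri (original capture)
        '2014-01-27T17:12:51Z')                    # refers-to date (original capture)
    assert revisit.rec_headers.get_header('WARC-Type') == 'revisit'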
@@ -153,6 +139,11 @@ class MultiFileWARCWriter(BaseWARCWriter):
         self._do_write_req_resp(None, record, params)

     def _do_write_req_resp(self, req, resp, params):
+        resp = self._check_revisit(resp, params)
+        if not resp:
+            print('Skipping due to dedup')
+            return
+
         def write_callback(out, filename):
             #url = resp.rec_headers.get_header('WARC-Target-URI')
             #print('Writing req/resp {0} to {1} '.format(url, filename))
@@ -180,6 +171,8 @@ class MultiFileWARCWriter(BaseWARCWriter):

         close_file = False

+        new_size = start = 0
+
         if result:
             out, filename = result
             is_new = False
@@ -69,16 +69,21 @@ class RecorderApp(object):
         req_head, req_pay, resp_head, resp_pay, params = result

         #resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay)
-        resp = self.writer.copy_warc_record(resp_pay)
+        resp_length = resp_pay.tell()
+        resp_pay.seek(0)
+        resp = self.writer.create_record_from_stream(resp_pay, resp_length)

         if resp.rec_type == 'response':
             uri = resp.rec_headers.get_header('WARC-Target-Uri')
+            req_length = req_pay.tell()
+            req_pay.seek(0)
             req = self.writer.create_warc_record(uri=uri,
                                                  record_type='request',
                                                  payload=req_pay,
+                                                 length=req_length,
                                                  warc_headers_dict=req_head)

-            self.writer.write_req_resp(req, resp, params)
+            self.writer.write_request_response_pair(req, resp, params)

         else:
             self.writer.write_record(resp, params)
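The recorder now measures the buffered payload and passes an explicit length to create_warc_record. A hedged sketch of the same pattern in isolation (WARCWriter assumed, payload invented):

    from io import BytesIO
    from warcio.warcwriter import WARCWriter

    payload = BytesIO(b'GET / HTTP/1.1\r\nHost: example.com\r\n\r\n')
    length = payload.seek(0, 2)   # measure, like req_pay.tell() above
    payload.seek(0)               # rewind before handing the stream off

    writer = WARCWriter(BytesIO(), gzip=False)
    record = writer.create_warc_record(uri='http://example.com/',
                                       record_type='request',
                                       payload=payload,
                                       length=length)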
@@ -133,9 +138,13 @@ class RecorderApp(object):

         content_type = headers.get('Content-Type')

+        payload_length = req_stream.out.tell()
+        req_stream.out.seek(0)
+
         record = self.writer.create_warc_record(uri=params['url'],
                                                 record_type=record_type,
                                                 payload=req_stream.out,
+                                                length=payload_length,
                                                 warc_content_type=content_type,
                                                 warc_headers_dict=req_stream.headers)
@@ -1,7 +1,8 @@
 from pywb.utils.canonicalize import calc_search_range
 from pywb.cdx.cdxobject import CDXObject
 from pywb.warc.cdxindexer import write_cdx_index
-from pywb.utils.timeutils import iso_date_to_timestamp
+
+from warcio.timeutils import iso_date_to_timestamp

 from io import BytesIO
 import os
@@ -20,11 +20,13 @@ from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitD
 from pywb.webagg.utils import MementoUtils

 from pywb.cdx.cdxobject import CDXObject
-from pywb.utils.statusandheaders import StatusAndHeadersParser
-from pywb.utils.bufferedreaders import DecompressingBufferedReader
-from pywb.warc.recordloader import ArcWarcRecordLoader
+
+from warcio.statusandheaders import StatusAndHeadersParser
+from warcio.bufferedreaders import DecompressingBufferedReader
+from warcio.recordloader import ArcWarcRecordLoader
+from warcio.archiveiterator import ArchiveIterator

 from pywb.warc.cdxindexer import write_cdx_index
-from pywb.warc.archiveiterator import ArchiveIterator

 from six.moves.urllib.parse import quote, unquote, urlencode
 from io import BytesIO
@@ -122,9 +124,9 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
         filename = os.path.join(base_dir, filename)
         with open(filename, 'rb') as fh:
             for record in ArchiveIterator(fh, no_record_parse=True):
-                assert record.status_headers == None
+                assert record.http_headers == None
                 assert int(record.rec_headers.get_header('Content-Length')) == record.length
-                assert record.length == len(record.stream.read())
+                assert record.length == len(record.raw_stream.read())

     def test_record_warc_1(self):
         recorder_app = RecorderApp(self.upstream_url,
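The updated assertions reflect the warcio names: with no_record_parse=True the HTTP block is left unparsed, so http_headers is None and the body comes from raw_stream (formerly stream). An isolated sketch (the path is a placeholder):

    from warcio.archiveiterator import ArchiveIterator

    with open('example.warc.gz', 'rb') as fh:       # placeholder path
        for record in ArchiveIterator(fh, no_record_parse=True):
            assert record.http_headers is None
            body = record.raw_stream.read()         # was: record.stream.read()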
@@ -168,16 +170,16 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)

         buff = BytesIO(resp.body)
         record = ArcWarcRecordLoader().parse_record_stream(buff)
-        assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers
-        assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers
+        assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers
+        assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers

         stored_req, stored_resp = self._load_resp_req(base_path)

-        assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.status_headers.headers
-        assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.status_headers.headers
+        assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.http_headers.headers
+        assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.http_headers.headers

-        assert ('X-Other', 'foo') in stored_req.status_headers.headers
-        assert ('Cookie', 'boo=far') in stored_req.status_headers.headers
+        assert ('X-Other', 'foo') in stored_req.http_headers.headers
+        assert ('Cookie', 'boo=far') in stored_req.http_headers.headers

         self._test_all_warcs('/warcs/cookiecheck/', 1)

@@ -193,16 +195,16 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)

         buff = BytesIO(resp.body)
         record = ArcWarcRecordLoader().parse_record_stream(buff)
-        assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers
-        assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers
+        assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers
+        assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers

         stored_req, stored_resp = self._load_resp_req(warc_path)

-        assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.status_headers.headers
-        assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.status_headers.headers
+        assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.http_headers.headers
+        assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.http_headers.headers

-        assert ('X-Other', 'foo') in stored_req.status_headers.headers
-        assert ('Cookie', 'boo=far') not in stored_req.status_headers.headers
+        assert ('X-Other', 'foo') in stored_req.http_headers.headers
+        assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

         self._test_all_warcs('/warcs/cookieskip/', 1)

@@ -530,10 +532,10 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
         assert status_headers.get_header('Content-Length') == str(len(buff))
         assert status_headers.get_header('WARC-Custom') == 'foo'

-        assert record.stream.read() == buff
+        assert record.raw_stream.read() == buff

-        status_headers = record.status_headers
-        assert len(record.status_headers.headers) == 2
+        status_headers = record.http_headers
+        assert len(record.http_headers.headers) == 2

         assert status_headers.get_header('Content-Type') == 'text/plain'
         assert status_headers.get_header('Content-Length') == str(len(buff))
@@ -1,5 +1,5 @@
-from pywb.utils.statusandheaders import StatusAndHeaders
-from pywb.utils.timeutils import datetime_to_http_date
+from warcio.statusandheaders import StatusAndHeaders
+from warcio.timeutils import datetime_to_http_date
 from datetime import datetime, timedelta
 import six

@@ -12,10 +12,11 @@ from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
 from pywb.rewrite.rewriterules import RewriteRules

 from pywb.utils.dsrules import RuleSet
-from pywb.utils.statusandheaders import StatusAndHeaders
-from pywb.utils.bufferedreaders import DecompressingBufferedReader
-from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader
-from pywb.utils.loaders import to_native_str
+
+from warcio.statusandheaders import StatusAndHeaders
+from warcio.bufferedreaders import DecompressingBufferedReader
+from warcio.bufferedreaders import ChunkedDataReader, BufferedReader
+from warcio.utils import to_native_str

 from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
@@ -11,10 +11,11 @@ import os
 from six.moves.urllib.parse import urlsplit
 import six

+from warcio.timeutils import timestamp_now
+from warcio.statusandheaders import StatusAndHeaders
+
 from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url
 from pywb.utils.loaders import extract_client_cookie
-from pywb.utils.timeutils import timestamp_now
-from pywb.utils.statusandheaders import StatusAndHeaders
 from pywb.utils.canonicalize import canonicalize

 from pywb.rewrite.rewrite_content import RewriteContent
@@ -50,9 +50,9 @@ HTTP Headers Rewriting

 from pywb.rewrite.header_rewriter import HeaderRewriter
 from pywb.rewrite.url_rewriter import UrlRewriter
-from pywb.utils.statusandheaders import StatusAndHeaders
+from warcio.statusandheaders import StatusAndHeaders

-from pywb.utils.timeutils import datetime_to_http_date
+from warcio.timeutils import datetime_to_http_date
 from datetime import datetime

 import pprint
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,5 +1,5 @@
 from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter, HostScopeCookieRewriter
-from pywb.utils.timeutils import datetime_to_http_date
+from warcio.timeutils import datetime_to_http_date
 from six.moves import zip

 import redis
@@ -6,10 +6,10 @@ from pywb.framework.archivalrouter import Route

 from pywb.rewrite.rewrite_content import RewriteContent
 from pywb.rewrite.wburl import WbUrl
-from pywb.warc.recordloader import ArcWarcRecordLoader
+from warcio.recordloader import ArcWarcRecordLoader
 from pywb.webapp.live_rewrite_handler import RewriteHandler
 from pywb.utils.canonicalize import canonicalize
-from pywb.utils.timeutils import http_date_to_timestamp
+from warcio.timeutils import http_date_to_timestamp
 from pywb.cdx.cdxobject import CDXObject

 from io import BytesIO
@@ -81,7 +81,7 @@ class PlatformHandler(RewriteHandler):

         head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
         result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
-                                                       record.status_headers,
+                                                       record.http_headers,
                                                        record.stream,
                                                        head_insert_func,
                                                        urlkey,
@@ -6,14 +6,15 @@ from pywb.rewrite.url_rewriter import UrlRewriter

 from pywb.utils.wbexception import WbException
 from pywb.utils.canonicalize import canonicalize
-from pywb.utils.timeutils import http_date_to_timestamp
 from pywb.utils.loaders import extract_client_cookie
-from pywb.utils.bufferedreaders import BufferedReader

+from warcio.timeutils import http_date_to_timestamp
+from warcio.bufferedreaders import BufferedReader
+from warcio.recordloader import ArcWarcRecordLoader
+
 from pywb.webagg.utils import BUFF_SIZE

 from pywb.cdx.cdxobject import CDXObject
-from pywb.warc.recordloader import ArcWarcRecordLoader
 from pywb.framework.wbrequestresponse import WbResponse

 from pywb.webagg.utils import MementoUtils, buffer_iter
@@ -200,11 +201,11 @@ class RewriterApp(object):
             self._add_custom_params(cdx, r.headers, kwargs)

             if readd_range:
-                content_length = (record.status_headers.
+                content_length = (record.http_headers.
                                   get_header('Content-Length'))
                 try:
                     content_length = int(content_length)
-                    record.status_headers.add_range(0, content_length,
+                    record.http_headers.add_range(0, content_length,
                                                     content_length)
                 except (ValueError, TypeError):
                     pass
@@ -228,8 +229,8 @@ class RewriterApp(object):
                                                cookie_key)

             result = self.content_rewriter.rewrite_content(urlrewriter,
-                                                           record.status_headers,
-                                                           record.stream,
+                                                           record.http_headers,
+                                                           record.raw_stream,
                                                            head_insert_func,
                                                            urlkey,
                                                            cdx,
@@ -1,5 +1,6 @@
-from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
-from pywb.utils.timeutils import timestamp_now
+from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
+from warcio.timeutils import timestamp_now

 from pywb.utils.loaders import load

 from six.moves.urllib.parse import urlsplit
@@ -1,336 +0,0 @@
-from io import BytesIO
-import zlib
-import brotli
-
-
-#=================================================================
-def gzip_decompressor():
-    """
-    Decompressor which can handle decompress gzip stream
-    """
-    return zlib.decompressobj(16 + zlib.MAX_WBITS)
-
-
-def deflate_decompressor():
-    return zlib.decompressobj()
-
-
-def deflate_decompressor_alt():
-    return zlib.decompressobj(-zlib.MAX_WBITS)
-
-def brotli_decompressor():
-    decomp = brotli.Decompressor()
-    decomp.unused_data = None
-    return decomp
-
-
-#=================================================================
-class BufferedReader(object):
-    """
-    A wrapping line reader which wraps an existing reader.
-    Read operations operate on underlying buffer, which is filled to
-    block_size (1024 default)
-
-    If an optional decompress type is specified,
-    data is fed through the decompressor when read from the buffer.
-    Currently supported decompression: gzip
-    If unspecified, default decompression is None
-
-    If decompression is specified, and decompress fails on first try,
-    data is assumed to not be compressed and no exception is thrown.
-
-    If a failure occurs after data has been
-    partially decompressed, the exception is propagated.
-
-    """
-
-    DECOMPRESSORS = {'gzip': gzip_decompressor,
-                     'deflate': deflate_decompressor,
-                     'deflate_alt': deflate_decompressor_alt,
-                     'br': brotli_decompressor
-                    }
-
-    def __init__(self, stream, block_size=1024,
-                 decomp_type=None,
-                 starting_data=None):
-        self.stream = stream
-        self.block_size = block_size
-
-        self._init_decomp(decomp_type)
-
-        self.buff = None
-        self.starting_data = starting_data
-        self.num_read = 0
-        self.buff_size = 0
-
-    def set_decomp(self, decomp_type):
-        self._init_decomp(decomp_type)
-
-    def _init_decomp(self, decomp_type):
-        self.num_block_read = 0
-        if decomp_type:
-            try:
-                self.decomp_type = decomp_type
-                self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]()
-            except KeyError:
-                raise Exception('Decompression type not supported: ' +
-                                decomp_type)
-        else:
-            self.decomp_type = None
-            self.decompressor = None
-
-    def _fillbuff(self, block_size=None):
-        if not self.empty():
-            return
-
-        # can't read past next member
-        if self.rem_length() > 0:
-            return
-
-        if self.starting_data:
-            data = self.starting_data
-            self.starting_data = None
-        else:
-            if not block_size:
-                block_size = self.block_size
-            data = self.stream.read(block_size)
-
-        self._process_read(data)
-
-    def _process_read(self, data):
-        data = self._decompress(data)
-        self.buff_size = len(data)
-        self.num_read += self.buff_size
-        self.num_block_read += self.buff_size
-        self.buff = BytesIO(data)
-
-    def _decompress(self, data):
-        if self.decompressor and data:
-            try:
-                data = self.decompressor.decompress(data)
-            except Exception as e:
-                # if first read attempt, assume non-gzipped stream
-                if self.num_block_read == 0:
-                    if self.decomp_type == 'deflate':
-                        self._init_decomp('deflate_alt')
-                        data = self._decompress(data)
-                    else:
-                        self.decompressor = None
-                # otherwise (partly decompressed), something is wrong
-                else:
-                    print(str(e))
-                    return b''
-        return data
-
-    def read(self, length=None):
-        """
-        Fill bytes and read some number of bytes
-        (up to length if specified)
-        <= length bytes may be read if reached the end of input
-        if at buffer boundary, will attempt to read again until
-        specified length is read
-        """
-        all_buffs = []
-        while length is None or length > 0:
-            self._fillbuff()
-            buff = self.buff.read(length)
-            if not buff:
-                break
-
-            all_buffs.append(buff)
-            if length:
-                length -= len(buff)
-
-        return b''.join(all_buffs)
-
-
-    def readline(self, length=None):
-        """
-        Fill buffer and read a full line from the buffer
-        (up to specified length, if provided)
-        If no newline found at end, try filling buffer again in case
-        at buffer boundary.
-        """
-        if length == 0:
-            return b''
-
-        self._fillbuff()
-        linebuff = self.buff.readline(length)
-
-        # we may be at a boundary
-        while not linebuff.endswith(b'\n'):
-            if length:
-                length -= len(linebuff)
-                if length <= 0:
-                    break
-
-            self._fillbuff()
-
-            if self.empty():
-                break
-
-            linebuff += self.buff.readline(length)
-
-        return linebuff
-
-    def empty(self):
-        return not self.buff or self.buff.tell() >= self.buff_size
-
-    def read_next_member(self):
-        if not self.decompressor or not self.decompressor.unused_data:
-            return False
-
-        self.starting_data = self.decompressor.unused_data
-        self._init_decomp(self.decomp_type)
-        return True
-
-    def rem_length(self):
-        rem = 0
-        if self.buff:
-            rem = self.buff_size - self.buff.tell()
-
-        if self.decompressor and self.decompressor.unused_data:
-            rem += len(self.decompressor.unused_data)
-        return rem
-
-    def close(self):
-        if self.stream:
-            self.stream.close()
-            self.stream = None
-
-    @classmethod
-    def get_supported_decompressors(cls):
-        return cls.DECOMPRESSORS.keys()
-
-
-#=================================================================
-class DecompressingBufferedReader(BufferedReader):
-    """
-    A BufferedReader which defaults to gzip decompression,
-    (unless different type specified)
-    """
-    def __init__(self, *args, **kwargs):
-        if 'decomp_type' not in kwargs:
-            kwargs['decomp_type'] = 'gzip'
-        super(DecompressingBufferedReader, self).__init__(*args, **kwargs)
-
-
-#=================================================================
-class ChunkedDataException(Exception):
-    def __init__(self, msg, data=b''):
-        Exception.__init__(self, msg)
-        self.data = data
-
-
-#=================================================================
-class ChunkedDataReader(BufferedReader):
-    r"""
-    A ChunkedDataReader is a DecompressingBufferedReader
-    which also supports de-chunking of the data if it happens
-    to be http 'chunk-encoded'.
-
-    If at any point the chunked header is not available, the stream is
-    assumed to not be chunked and no more dechunking occurs.
-    """
-    def __init__(self, stream, raise_exceptions=False, **kwargs):
-        super(ChunkedDataReader, self).__init__(stream, **kwargs)
-        self.all_chunks_read = False
-        self.not_chunked = False
-
-        # if False, we'll use best-guess fallback for parse errors
-        self.raise_chunked_data_exceptions = raise_exceptions
-
-    def _fillbuff(self, block_size=None):
-        if self.not_chunked:
-            return super(ChunkedDataReader, self)._fillbuff(block_size)
-
-        # Loop over chunks until there is some data (not empty())
-        # In particular, gzipped data may require multiple chunks to
-        # return any decompressed result
-        while (self.empty() and
-               not self.all_chunks_read and
-               not self.not_chunked):
-
-            try:
-                length_header = self.stream.readline(64)
-                self._try_decode(length_header)
-            except ChunkedDataException as e:
-                if self.raise_chunked_data_exceptions:
-                    raise
-
-                # Can't parse the data as chunked.
-                # It's possible that non-chunked data is served
-                # with a Transfer-Encoding: chunked.
-                # Treat this as non-chunk encoded from here on.
-                self._process_read(length_header + e.data)
-                self.not_chunked = True
-
-                # parse as block as non-chunked
-                return super(ChunkedDataReader, self)._fillbuff(block_size)
-
-    def _try_decode(self, length_header):
-        # decode length header
-        try:
-            chunk_size = int(length_header.strip().split(b';')[0], 16)
-        except ValueError:
-            raise ChunkedDataException(b"Couldn't decode length header " +
-                                       length_header)
-
-        if not chunk_size:
-            # chunk_size 0 indicates end of file
-            self.all_chunks_read = True
-            self._process_read(b'')
-            return
-
-        data_len = 0
-        data = b''
-
-        # read chunk
-        while data_len < chunk_size:
-            new_data = self.stream.read(chunk_size - data_len)
-
-            # if we unexpectedly run out of data,
-            # either raise an exception or just stop reading,
-            # assuming file was cut off
-            if not new_data:
-                if self.raise_chunked_data_exceptions:
-                    msg = 'Ran out of data before end of chunk'
-                    raise ChunkedDataException(msg, data)
-                else:
-                    chunk_size = data_len
-                    self.all_chunks_read = True
-
-            data += new_data
-            data_len = len(data)
-
-        # if we successfully read a block without running out,
-        # it should end in \r\n
-        if not self.all_chunks_read:
-            clrf = self.stream.read(2)
-            if clrf != b'\r\n':
-                raise ChunkedDataException(b"Chunk terminator not found.",
-                                           data)
-
-        # hand to base class for further processing
-        self._process_read(data)
-
-    def read(self, length=None):
-        """ read bytes from stream, if length specified,
-        may read across multiple chunks to get exact length
-        """
-        buf = super(ChunkedDataReader, self).read(length)
-        if not length:
-            return buf
-
-        # if length specified, attempt to read exact length
-        rem = length - len(buf)
-        while rem > 0:
-            new_buf = super(ChunkedDataReader, self).read(rem)
-            if not new_buf:
-                break
-
-            buf += new_buf
-            rem -= len(new_buf)
-
-        return buf
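The module above was deleted because the same readers now ship with warcio, as the import hunks elsewhere in this commit show. A short equivalence sketch:

    from io import BytesIO
    import zlib

    from warcio.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader

    def gzip_bytes(data):
        # helper for the sketch: gzip-compress a byte string
        co = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
        return co.compress(data) + co.flush()

    reader = DecompressingBufferedReader(BytesIO(gzip_bytes(b'ABC\n1234\n')))
    assert reader.readline() == b'ABC\n'

    chunked = ChunkedDataReader(BytesIO(b'4\r\n1234\r\n0\r\n\r\n'))
    assert chunked.read() == b'1234'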
@@ -1,69 +0,0 @@
-# ============================================================================
-class LimitReader(object):
-    """
-    A reader which will not read more than specified limit
-    """
-
-    def __init__(self, stream, limit):
-        self.stream = stream
-        self.limit = limit
-
-        if hasattr(stream, 'tell'):
-            self.tell = self._tell
-
-    def _update(self, buff):
-        length = len(buff)
-        self.limit -= length
-        return buff
-
-    def read(self, length=None):
-        if length is not None:
-            length = min(length, self.limit)
-        else:
-            length = self.limit
-
-        if length == 0:
-            return b''
-
-        buff = self.stream.read(length)
-        return self._update(buff)
-
-    def readline(self, length=None):
-        if length is not None:
-            length = min(length, self.limit)
-        else:
-            length = self.limit
-
-        if length == 0:
-            return b''
-
-        buff = self.stream.readline(length)
-        return self._update(buff)
-
-    def close(self):
-        self.stream.close()
-
-    def _tell(self):
-        return self.stream.tell()
-
-    @staticmethod
-    def wrap_stream(stream, content_length):
-        """
-        If given content_length is an int > 0, wrap the stream
-        in a LimitReader. Otherwise, return the stream unaltered
-        """
-        try:
-            content_length = int(content_length)
-            if content_length >= 0:
-                # optimize: if already a LimitStream, set limit to
-                # the smaller of the two limits
-                if isinstance(stream, LimitReader):
-                    stream.limit = min(stream.limit, content_length)
-                else:
-                    stream = LimitReader(stream, content_length)
-
-        except (ValueError, TypeError):
-            pass
-
-        return stream
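LimitReader likewise moved wholesale to warcio (see the loaders.py hunk just below). A one-liner sketch of the unchanged behavior:

    from io import BytesIO
    from warcio.limitreader import LimitReader

    reader = LimitReader(BytesIO(b'abcdefghij'), 4)
    assert reader.read(26) == b'abcd'   # reads are capped at the 4-byte limit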
@@ -17,7 +17,7 @@ import base64
 import cgi

 from io import open, BytesIO
-from pywb.utils.limitreader import LimitReader
+from warcio.limitreader import LimitReader

 try:
     from boto import connect_s3
@@ -1,285 +0,0 @@
-"""
-Representation and parsing of HTTP-style status + headers
-"""
-
-from pprint import pformat
-from copy import copy
-from six.moves import range
-from six import iteritems
-from pywb.utils.loaders import to_native_str
-import uuid
-
-
-WRAP_WIDTH = 80
-
-#=================================================================
-class StatusAndHeaders(object):
-    """
-    Representation of parsed http-style status line and headers
-    Status Line if first line of request/response
-    Headers is a list of (name, value) tuples
-    An optional protocol which appears on first line may be specified
-    """
-    def __init__(self, statusline, headers, protocol='', total_len=0):
-        self.statusline = statusline
-        self.headers = headers
-        self.protocol = protocol
-        self.total_len = total_len
-
-    def get_header(self, name, default_value=None):
-        """
-        return header (name, value)
-        if found
-        """
-        name_lower = name.lower()
-        for value in self.headers:
-            if value[0].lower() == name_lower:
-                return value[1]
-
-        return default_value
-
-    def add_header(self, name, value):
-        self.headers.append((name, value))
-
-    def replace_header(self, name, value):
-        """
-        replace header with new value or add new header
-        return old header value, if any
-        """
-        name_lower = name.lower()
-        for index in range(len(self.headers) - 1, -1, -1):
-            curr_name, curr_value = self.headers[index]
-            if curr_name.lower() == name_lower:
-                self.headers[index] = (curr_name, value)
-                return curr_value
-
-        self.headers.append((name, value))
-        return None
-
-    def replace_headers(self, header_dict):
-        """
-        replace all headers in header_dict that already exist
-        add any remaining headers
-        """
-        header_dict = copy(header_dict)
-
-        for index in range(len(self.headers) - 1, -1, -1):
-            curr_name, curr_value = self.headers[index]
-            name_lower = curr_name.lower()
-            if name_lower in header_dict:
-                self.headers[index] = (curr_name, header_dict[name_lower])
-                del header_dict[name_lower]
-
-        for name, value in iteritems(header_dict):
-            self.headers.append((name, value))
-
-    def remove_header(self, name):
-        """
-        Remove header (case-insensitive)
-        return True if header removed, False otherwise
-        """
-        name_lower = name.lower()
-        for index in range(len(self.headers) - 1, -1, -1):
-            if self.headers[index][0].lower() == name_lower:
-                del self.headers[index]
-                return True
-
-        return False
-
-    def get_statuscode(self):
-        """
-        Return the statuscode part of the status response line
-        (Assumes no protocol in the statusline)
-        """
-        code = self.statusline.split(' ', 1)[0]
-        return code
-
-    def validate_statusline(self, valid_statusline):
-        """
-        Check that the statusline is valid, eg. starts with a numeric
-        code. If not, replace with passed in valid_statusline
-        """
-        code = self.get_statuscode()
-        try:
-            code = int(code)
-            assert(code > 0)
-            return True
-        except(ValueError, AssertionError):
-            self.statusline = valid_statusline
-            return False
-
-    def add_range(self, start, part_len, total_len):
-        """
-        Add range headers indicating that this a partial response
-        """
-        content_range = 'bytes {0}-{1}/{2}'.format(start,
-                                                   start + part_len - 1,
-                                                   total_len)
-
-        self.statusline = '206 Partial Content'
-        self.replace_header('Content-Range', content_range)
-        self.replace_header('Accept-Ranges', 'bytes')
-        return self
-
-    def __repr__(self):
-        headers_str = pformat(self.headers, indent=2, width=WRAP_WIDTH)
-        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
-headers = {2})".format(self.protocol, self.statusline, headers_str)
-
-    def __eq__(self, other):
-        return (self.statusline == other.statusline and
-                self.headers == other.headers and
-                self.protocol == other.protocol)
-
-    def __str__(self, exclude_list=None):
-        return self.to_str(exclude_list)
-
-    def __bool__(self):
-        return bool(self.statusline or self.headers)
-
-    __nonzero__ = __bool__
-
-    def to_str(self, exclude_list):
-        string = self.protocol
-
-        if string and self.statusline:
-            string += ' '
-
-        if self.statusline:
-            string += self.statusline
-
-        if string:
-            string += '\r\n'
-
-        for h in self.headers:
-            if exclude_list and h[0].lower() in exclude_list:
-                continue
-
-            string += ': '.join(h) + '\r\n'
-
-        return string
-
-    def to_bytes(self, exclude_list=None):
-        return self.to_str(exclude_list).encode('iso-8859-1') + b'\r\n'
-
-
-#=================================================================
-def _strip_count(string, total_read):
-    length = len(string)
-    return string.rstrip(), total_read + length
-
-
-#=================================================================
-class StatusAndHeadersParser(object):
-    """
-    Parser which consumes a stream support readline() to read
-    status and headers and return a StatusAndHeaders object
-    """
-    def __init__(self, statuslist, verify=True):
-        self.statuslist = statuslist
-        self.verify = verify
-
-    def parse(self, stream, full_statusline=None):
-        """
-        parse stream for status line and headers
-        return a StatusAndHeaders object
-
-        support continuation headers starting with space or tab
-        """
-
-        def readline():
-            return to_native_str(stream.readline())
-
-        # status line w newlines intact
-        if full_statusline is None:
-            full_statusline = readline()
-        else:
-            full_statusline = to_native_str(full_statusline)
-
-        statusline, total_read = _strip_count(full_statusline, 0)
-
-        headers = []
-
-        # at end of stream
-        if total_read == 0:
-            raise EOFError()
-        elif not statusline:
-            return StatusAndHeaders(statusline=statusline,
-                                    headers=headers,
-                                    protocol='',
-                                    total_len=total_read)
-
-        # validate only if verify is set
-        if self.verify:
-            protocol_status = self.split_prefix(statusline, self.statuslist)
-
-            if not protocol_status:
-                msg = 'Expected Status Line starting with {0} - Found: {1}'
-                msg = msg.format(self.statuslist, statusline)
-                raise StatusAndHeadersParserException(msg, full_statusline)
-        else:
-            protocol_status = statusline.split(' ', 1)
-
-        line, total_read = _strip_count(readline(), total_read)
-        while line:
-            result = line.split(':', 1)
-            if len(result) == 2:
-                name = result[0].rstrip(' \t')
-                value = result[1].lstrip()
-            else:
-                name = result[0]
-                value = None
-
-            next_line, total_read = _strip_count(readline(),
-                                                 total_read)
-
-            # append continuation lines, if any
-            while next_line and next_line.startswith((' ', '\t')):
-                if value is not None:
-                    value += next_line
-                next_line, total_read = _strip_count(readline(),
-                                                     total_read)
-
-            if value is not None:
-                header = (name, value)
-                headers.append(header)
-
-            line = next_line
-
-        if len(protocol_status) > 1:
-            statusline = protocol_status[1].strip()
-        else:
-            statusline = ''
-
-        return StatusAndHeaders(statusline=statusline,
-                                headers=headers,
-                                protocol=protocol_status[0],
-                                total_len=total_read)
-
-    @staticmethod
-    def split_prefix(key, prefixs):
-        """
-        split key string into prefix and remainder
-        for first matching prefix from a list
-        """
-        key_upper = key.upper()
-        for prefix in prefixs:
-            if key_upper.startswith(prefix):
-                plen = len(prefix)
-                return (key_upper[:plen], key[plen:])
-
-    @staticmethod
-    def make_warc_id(id_=None):
-        if not id_:
-            id_ = uuid.uuid1()
-        return '<urn:uuid:{0}>'.format(id_)
-
-
-#=================================================================
-class StatusAndHeadersParserException(Exception):
-    """
-    status + headers parsing exception
-    """
-    def __init__(self, msg, statusline):
-        super(StatusAndHeadersParserException, self).__init__(msg)
-        self.statusline = statusline
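The parser and the StatusAndHeaders representation now come from warcio with the same API. A small sketch of parsing a status + headers block via the new import:

    from six import StringIO
    from warcio.statusandheaders import StatusAndHeadersParser

    raw = 'HTTP/1.0 200 OK\r\nContent-Type: text/plain\r\n\r\n'
    parsed = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(raw))
    assert parsed.get_statuscode() == '200'
    assert parsed.get_header('Content-Type') == 'text/plain'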
@@ -1,174 +0,0 @@
-r"""
-# DecompressingBufferedReader Tests
-#=================================================================
-
-# DecompressingBufferedReader readline()
->>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline())
-' CDX N b a m s k r M S V g\n'
-
-# detect not compressed
->>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline())
-' CDX N b a m s k r M S V g\n'
-
-# decompress with on the fly compression, default gzip compression
->>> print_str(DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n'))).read())
-'ABC\n1234\n'
-
-# decompress with on the fly compression, default 'inflate' compression
->>> print_str(DecompressingBufferedReader(BytesIO(compress_alt('ABC\n1234\n')), decomp_type='deflate').read())
-'ABC\n1234\n'
-
-# error: invalid compress type
->>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = 'bzip2').read()
-Traceback (most recent call last):
-Exception: Decompression type not supported: bzip2
-
-# invalid output when reading compressed data as not compressed
->>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = None).read() != b'ABC'
-True
-
-
-# DecompressingBufferedReader readline() with decompression (zipnum file, no header)
->>> print_str(DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline())
-'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\n'
-
-# test very small block size
->>> dbr = DecompressingBufferedReader(BytesIO(b'ABCDEFG\nHIJKLMN\nOPQR\nXYZ'), block_size = 3)
->>> print_str(dbr.readline()); print_str(dbr.readline(4)); print_str(dbr.readline()); print_str(dbr.readline()); print_str(dbr.readline(2)); print_str(dbr.readline()); print_str(dbr.readline())
-'ABCDEFG\n'
-'HIJK'
-'LMN\n'
-'OPQR\n'
-'XY'
-'Z'
-''
-
-# test zero length reads
->>> x = DecompressingBufferedReader(LimitReader(BytesIO(b'\r\n'), 1))
->>> print_str(x.readline(0)); print_str(x.read(0))
-''
-''
-
-# Chunk-Decoding Buffered Reader Tests
-#=================================================================
-
-Properly formatted chunked data:
->>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n0\r\n\r\n"));
->>> print_str(c.read() + c.read() + c.read())
-'1234'
-
-Non-chunked data:
->>> print_str(ChunkedDataReader(BytesIO(b"xyz123!@#")).read())
-'xyz123!@#'
-
-Non-chunked, compressed data, specify decomp_type
->>> print_str(ChunkedDataReader(BytesIO(compress('ABCDEF')), decomp_type='gzip').read())
-'ABCDEF'
-
-Non-chunked, compressed data, specifiy compression seperately
->>> c = ChunkedDataReader(BytesIO(compress('ABCDEF'))); c.set_decomp('gzip'); print_str(c.read())
-'ABCDEF'
-
-Non-chunked, compressed data, wrap in DecompressingBufferedReader
->>> print_str(DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read())
-'\nABCDEF\nGHIJ'
-
-Chunked compressed data
-Split compressed stream into 10-byte chunk and a remainder chunk
->>> b = compress('ABCDEFGHIJKLMNOP')
->>> l = len(b)
->>> in_ = format(10, 'x').encode('utf-8') + b"\r\n" + b[:10] + b"\r\n" + format(l - 10, 'x').encode('utf-8') + b"\r\n" + b[10:] + b"\r\n0\r\n\r\n"
->>> c = ChunkedDataReader(BytesIO(in_), decomp_type='gzip')
->>> print_str(c.read())
-'ABCDEFGHIJKLMNOP'
-
-Starts like chunked data, but isn't:
->>> c = ChunkedDataReader(BytesIO(b"1\r\nxyz123!@#"));
->>> print_str(c.read() + c.read())
-'1\r\nx123!@#'
-
-Chunked data cut off part way through:
->>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"));
->>> print_str(c.read() + c.read())
-'123412'
-
-Zero-Length chunk:
->>> print_str(ChunkedDataReader(BytesIO(b"0\r\n\r\n")).read())
-''
-
-"""
-
-from io import BytesIO
-from pywb.utils.bufferedreaders import ChunkedDataReader, ChunkedDataException
-from pywb.utils.bufferedreaders import DecompressingBufferedReader
-from pywb.utils.limitreader import LimitReader
-
-from pywb import get_test_dir
-
-import six
-
-import zlib
-import pytest
-
-test_cdx_dir = get_test_dir() + 'cdx/'
-test_zip_dir = get_test_dir() + 'zipcdx/'
-
-
-def compress(buff):
-    buff = buff.encode('utf-8')
-    compressobj = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
-    compressed = compressobj.compress(buff)
-    compressed += compressobj.flush()
-
-    return compressed
-
-
-# plain "inflate"
-def compress_alt(buff):
-    buff = buff.encode('utf-8')
-    compressobj = zlib.compressobj(6, zlib.DEFLATED)
-    compressed = compressobj.compress(buff)
-    compressed += compressobj.flush()
-    # drop gzip headers/tail
-    compressed = compressed[2:-4]
-
-    return compressed
-
-
-# Brotli
-def test_brotli():
-    with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
-        x = DecompressingBufferedReader(fh, decomp_type='br')
-        x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
-
-
-# Compression
-def test_compress_mix():
-    # error: compressed member, followed by not compressed -- now allowed!
-    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip')
-    b = x.read()
-    assert b == b'ABC'
-    x.read_next_member()
-    assert x.read() == b'123'
-    #with pytest.raises(zlib.error):
-    #    x.read()
-    #error: Error -3 while decompressing: incorrect header check
-
-
-# Errors
-def test_err_chunk_cut_off():
-    # Chunked data cut off with exceptions
-    c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"), raise_exceptions=True)
-    with pytest.raises(ChunkedDataException):
-        c.read() + c.read()
-    #ChunkedDataException: Ran out of data before end of chunk
-
-
-def print_str(string):
-    return string.decode('utf-8') if six.PY3 else string
-
-
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
@@ -1,27 +1,5 @@
 #=================================================================
 r"""
-# LimitReader Tests
->>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
-'abcdefghji'
-
->>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
-'abcdefgh'
-
->>> LimitReader.wrap_stream(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8), 4).readline(26)
-'abcd'
-
->>> read_multiple(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
-'efghji'
-
-# zero-length read
->>> print_str(LimitReader(StringIO('a'), 0).readline(0))
-''
-
-# don't wrap if invalid length
->>> b = StringIO('b')
->>> LimitReader.wrap_stream(b, 'abc') == b
-True
-
 #=================================================================
 # BlockLoader Tests (includes LimitReader)
 # Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
 >>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read(400))
@@ -143,27 +121,14 @@ import requests
 from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
 from pywb.utils.loaders import extract_client_cookie, extract_post_query
 from pywb.utils.loaders import append_post_query, read_last_line
-from pywb.utils.limitreader import LimitReader

-from pywb.utils.bufferedreaders import DecompressingBufferedReader
+from warcio.bufferedreaders import DecompressingBufferedReader

 from pywb import get_test_dir

 test_cdx_dir = get_test_dir() + 'cdx/'


-def read_multiple(reader, inc_reads):
-    result = None
-    for x in inc_reads:
-        result = reader.read(x)
-    return result
-
-
-def seek_read_full(seekable_reader, offset):
-    seekable_reader.seek(offset)
-    seekable_reader.readline()  #skip
-    return seekable_reader.readline()
-
 def test_s3_read_1():
     pytest.importorskip('boto')
@@ -178,15 +143,6 @@ def test_s3_read_1():
     assert reader.readline() == b'WARC/1.0\r\n'
     assert reader.readline() == b'WARC-Type: response\r\n'

-def test_limit_post():
-    reader = LimitReader(BytesIO(b'abcdefg'), 3)
-    r = requests.request(method='POST',
-                         url='http://httpbin.org/post',
-                         data=reader,
-                         headers={'Content-Length': '3'})
-
-    assert '"abc"' in r.text
-
 # Error
 def test_err_no_such_file():
     # no such file
@ -1,182 +0,0 @@
|
||||
"""
|
||||
>>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
|
||||
>>> st1
|
||||
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
|
||||
('Some', 'Value'),
|
||||
('Multi-Line', 'Value1 Also This')])
|
||||
|
||||
# add range
|
||||
>>> StatusAndHeaders(statusline = '200 OK', headers=[('Content-Type', 'text/plain')]).add_range(10, 4, 100)
|
||||
StatusAndHeaders(protocol = '', statusline = '206 Partial Content', headers = [ ('Content-Type', 'text/plain'),
|
||||
('Content-Range', 'bytes 10-13/100'),
|
||||
('Accept-Ranges', 'bytes')])
|
||||
|
||||
# other protocol expected
|
||||
>>> StatusAndHeadersParser(['Other']).parse(StringIO(status_headers_1))  # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK

>>> StatusAndHeadersParser(['Other'], verify=False).parse(StringIO(status_headers_1))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
  ('Some', 'Value'),
  ('Multi-Line', 'Value1 Also This')])


# verify protocol line
>>> StatusAndHeadersParser(['HTTP/1.0'], verify=True).parse(StringIO(unknown_protocol_headers))  # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0'] - Found: OtherBlah


# allow unexpected/invalid protocol line
>>> StatusAndHeadersParser(['HTTP/1.0'], verify=False).parse(StringIO(unknown_protocol_headers))
StatusAndHeaders(protocol = 'OtherBlah', statusline = '', headers = [('Foo', 'Bar')])



# test equality op
>>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
True

# replace header, print new headers
>>> st1.replace_header('some', 'Another-Value'); st1
'Value'
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
  ('Some', 'Another-Value'),
  ('Multi-Line', 'Value1 Also This')])


# remove header
>>> st1.remove_header('some')
True

# already removed
>>> st1.remove_header('Some')
False

# empty
>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])


>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_3))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])

# case-insensitive match
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_4))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])


"""


from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders

from six import StringIO
import pytest


status_headers_1 = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
HTTP/1.0 200 OK\r\n\
Some: Value\r\n\
Multi-Line: Value1\r\n\
    Also This\r\n\
\r\n\
Body"


status_headers_2 = """

"""

status_headers_3 = "\
HTTP/1.0 204 Empty\r\n\
Content-Type: Value\r\n\
%Invalid%\r\n\
\tMultiline\r\n\
Content-Length: 0\r\n\
\r\n"

status_headers_4 = "\
http/1.0 204 empty\r\n\
Content-Type: Value\r\n\
%Invalid%\r\n\
\tMultiline\r\n\
Content-Length: 0\r\n\
\r\n"

unknown_protocol_headers = "\
OtherBlah\r\n\
Foo: Bar\r\n\
\r\n"


req_headers = "\
GET / HTTP/1.0\r\n\
Foo: Bar\r\n\
Content-Length: 0\r\n"


if __name__ == "__main__":
    import doctest
    doctest.testmod()


def test_to_str_1():
    res = str(StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1)))

    exp = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
Multi-Line: Value1 Also This\r\n\
"
    assert(res == exp)


def test_to_str_exclude():
    sah = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
    res = sah.to_str(['multi-line'])

    exp = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
"
    assert(res == exp)

    assert(sah.to_bytes(['multi-line']) == (exp.encode('latin-1') + b'\r\n'))


def test_to_str_2():
    res = str(StatusAndHeadersParser(['GET']).parse(StringIO(req_headers)))

    assert(res == req_headers)

    res = str(StatusAndHeadersParser(['GET']).parse(StringIO(req_headers + '\r\n')))

    assert(res == req_headers)


def test_to_str_with_remove():
    res = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers))
    res.remove_header('Foo')

    exp = "\
GET / HTTP/1.0\r\n\
Content-Length: 0\r\n"

    assert(str(res) == exp)


def test_status_empty():
    with pytest.raises(EOFError):
        StatusAndHeadersParser([], verify=False).parse(StringIO(''))


def test_status_one_word():
    res = StatusAndHeadersParser(['GET'], verify=False).parse(StringIO('A'))
    assert(str(res) == 'A\r\n')
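# --- example (not part of the commit): a minimal sketch of the StatusAndHeaders
# API the doctests above exercise; the header values here are made up ---
from pywb.utils.statusandheaders import StatusAndHeaders

sah = StatusAndHeaders('200 OK', [('Content-Type', 'text/html')], protocol='HTTP/1.0')
old = sah.replace_header('content-type', 'text/plain')  # match is case-insensitive
assert old == 'text/html'                                # returns the replaced value
assert sah.get_header('Content-Type') == 'text/plain'
assert str(sah) == 'HTTP/1.0 200 OK\r\nContent-Type: text/plain\r\n'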
@ -1,330 +0,0 @@
"""
utility functions for converting between
datetime, iso date and 14-digit timestamp
"""

import re
import time
import datetime
import calendar

from email.utils import parsedate, formatdate

#=================================================================
# str <-> datetime conversion
#=================================================================

DATE_TIMESPLIT = re.compile(r'[^\d]')

TIMESTAMP_14 = '%Y%m%d%H%M%S'
ISO_DT = '%Y-%m-%dT%H:%M:%SZ'

PAD_14_DOWN = '10000101000000'
PAD_14_UP = '29991231235959'
PAD_6_UP = '299912'


def iso_date_to_datetime(string):
    """
    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)

    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)
    """

    nums = DATE_TIMESPLIT.split(string)
    if nums[-1] == '':
        nums = nums[:-1]

    the_datetime = datetime.datetime(*(int(num) for num in nums))
    return the_datetime


def http_date_to_datetime(string):
    """
    >>> http_date_to_datetime('Thu, 26 Dec 2013 09:50:10 GMT')
    datetime.datetime(2013, 12, 26, 9, 50, 10)
    """
    return datetime.datetime(*parsedate(string)[:6])


def datetime_to_http_date(the_datetime):
    """
    >>> datetime_to_http_date(datetime.datetime(2013, 12, 26, 9, 50, 10))
    'Thu, 26 Dec 2013 09:50:10 GMT'

    # Verify inverses
    >>> x = 'Thu, 26 Dec 2013 09:50:10 GMT'
    >>> datetime_to_http_date(http_date_to_datetime(x)) == x
    True
    """
    timeval = calendar.timegm(the_datetime.utctimetuple())
    return formatdate(timeval=timeval,
                      localtime=False,
                      usegmt=True)


def datetime_to_iso_date(the_datetime):
    """
    >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '2013-12-26T10:11:12Z'

    >>> datetime_to_iso_date( datetime.datetime(2013, 12, 26, 10, 11, 12))
    '2013-12-26T10:11:12Z'
    """

    return the_datetime.strftime(ISO_DT)


def datetime_to_timestamp(the_datetime):
    """
    >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '20131226101112'
    """

    return the_datetime.strftime(TIMESTAMP_14)


def timestamp_now():
    """
    >>> len(timestamp_now())
    14
    """
    return datetime_to_timestamp(datetime.datetime.utcnow())


def timestamp20_now():
    """
    Create 20-digit timestamp, useful for timestamping temp files

    >>> n = timestamp20_now()
    >>> timestamp20_now() >= n
    True

    >>> len(n)
    20

    """
    now = datetime.datetime.utcnow()
    return now.strftime('%Y%m%d%H%M%S%f')


def iso_date_to_timestamp(string):
    """
    >>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
    '20131226101112'

    >>> iso_date_to_timestamp('2013-12-26T10:11:12')
    '20131226101112'
    """

    return datetime_to_timestamp(iso_date_to_datetime(string))


def timestamp_to_iso_date(string):
    """
    >>> timestamp_to_iso_date('20131226101112')
    '2013-12-26T10:11:12Z'

    >>> timestamp_to_iso_date('20131226101112')
    '2013-12-26T10:11:12Z'
    """

    return datetime_to_iso_date(timestamp_to_datetime(string))


def http_date_to_timestamp(string):
    """
    >>> http_date_to_timestamp('Thu, 26 Dec 2013 09:50:00 GMT')
    '20131226095000'

    >>> http_date_to_timestamp('Sun, 26 Jan 2014 20:08:04 GMT')
    '20140126200804'
    """
    return datetime_to_timestamp(http_date_to_datetime(string))


# pad to certain length (default 6)
def pad_timestamp(string, pad_str=PAD_6_UP):
    """
    >>> pad_timestamp('20')
    '209912'

    >>> pad_timestamp('2014')
    '201412'

    >>> pad_timestamp('20141011')
    '20141011'

    >>> pad_timestamp('201410110010')
    '201410110010'
    """

    str_len = len(string)
    pad_len = len(pad_str)

    if str_len < pad_len:
        string = string + pad_str[str_len:]

    return string
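# --- example (not part of the commit): padding a timestamp prefix with the
# PAD_14_* constants yields inclusive range bounds, which is how callers such
# as pywb.cdx.cdxops turn a partial timestamp into a query range ---
assert pad_timestamp('2014', PAD_14_DOWN) == '20140101000000'  # earliest in 2014
assert pad_timestamp('2014', PAD_14_UP) == '20141231235959'    # latest in 2014
assert pad_timestamp('20140601', PAD_14_UP) == '20140601235959'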
def timestamp_to_datetime(string):
    """
    # >14-digit -- rest ignored
    >>> timestamp_to_datetime('2014122609501011')
    datetime.datetime(2014, 12, 26, 9, 50, 10)

    # 14-digit
    >>> timestamp_to_datetime('20141226095010')
    datetime.datetime(2014, 12, 26, 9, 50, 10)

    # 13-digit padding
    >>> timestamp_to_datetime('2014122609501')
    datetime.datetime(2014, 12, 26, 9, 50, 59)

    # 12-digit padding
    >>> timestamp_to_datetime('201412260950')
    datetime.datetime(2014, 12, 26, 9, 50, 59)

    # 11-digit padding
    >>> timestamp_to_datetime('20141226095')
    datetime.datetime(2014, 12, 26, 9, 59, 59)

    # 10-digit padding
    >>> timestamp_to_datetime('2014122609')
    datetime.datetime(2014, 12, 26, 9, 59, 59)

    # 9-digit padding
    >>> timestamp_to_datetime('201412260')
    datetime.datetime(2014, 12, 26, 23, 59, 59)

    # 8-digit padding
    >>> timestamp_to_datetime('20141226')
    datetime.datetime(2014, 12, 26, 23, 59, 59)

    # 7-digit padding
    >>> timestamp_to_datetime('2014122')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 6-digit padding
    >>> timestamp_to_datetime('201410')
    datetime.datetime(2014, 10, 31, 23, 59, 59)

    # 5-digit padding
    >>> timestamp_to_datetime('20141')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 4-digit padding
    >>> timestamp_to_datetime('2014')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 3-digit padding
    >>> timestamp_to_datetime('201')
    datetime.datetime(2019, 12, 31, 23, 59, 59)

    # 2-digit padding
    >>> timestamp_to_datetime('20')
    datetime.datetime(2099, 12, 31, 23, 59, 59)

    # 1-digit padding
    >>> timestamp_to_datetime('2')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # 1-digit out-of-range padding
    >>> timestamp_to_datetime('3')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # 0-digit padding
    >>> timestamp_to_datetime('')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # bad month
    >>> timestamp_to_datetime('20131709005601')
    datetime.datetime(2013, 12, 9, 0, 56, 1)

    # all out of range except minutes
    >>> timestamp_to_datetime('40001965252477')
    datetime.datetime(2999, 12, 31, 23, 24, 59)

    # not a number!
    >>> timestamp_to_datetime('2010abc')
    datetime.datetime(2010, 12, 31, 23, 59, 59)

    """

    # pad to 6 digits
    string = pad_timestamp(string, PAD_6_UP)

    def clamp(val, min_, max_):
        try:
            val = int(val)
            val = max(min_, min(val, max_))
            return val
        except:
            return max_

    def extract(string, start, end, min_, max_):
        if len(string) >= end:
            return clamp(string[start:end], min_, max_)
        else:
            return max_

    # now parse, clamp to boundary
    year = extract(string, 0, 4, 1900, 2999)
    month = extract(string, 4, 6, 1, 12)
    day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
    hour = extract(string, 8, 10, 0, 23)
    minute = extract(string, 10, 12, 0, 59)
    second = extract(string, 12, 14, 0, 59)

    return datetime.datetime(year=year,
                             month=month,
                             day=day,
                             hour=hour,
                             minute=minute,
                             second=second)

    #return time.strptime(pad_timestamp(string), TIMESTAMP_14)
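# --- example (not part of the commit): the clamp/extract helpers above never
# raise -- each field is clamped into its valid range, so even malformed input
# yields a usable datetime (values mirror the doctests) ---
assert timestamp_to_datetime('20131709005601') == \
    datetime.datetime(2013, 12, 9, 0, 56, 1)     # month 17 clamped to 12
assert timestamp_to_datetime('') == \
    datetime.datetime(2999, 12, 31, 23, 59, 59)  # empty string pads all the way up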
def timestamp_to_sec(string):
    """
    >>> timestamp_to_sec('20131226095010')
    1388051410

    # rounds to end of 2014
    >>> timestamp_to_sec('2014')
    1420070399
    """

    return calendar.timegm(timestamp_to_datetime(string).utctimetuple())


def sec_to_timestamp(secs):
    """
    >>> sec_to_timestamp(1388051410)
    '20131226095010'

    >>> sec_to_timestamp(1420070399)
    '20141231235959'
    """

    return datetime_to_timestamp(datetime.datetime.utcfromtimestamp(secs))
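# --- example (not part of the commit): timestamp_to_sec and sec_to_timestamp
# round-trip through UTC epoch seconds; partial timestamps round up to the end
# of their period, per the padding rules above ---
assert sec_to_timestamp(timestamp_to_sec('20131226095010')) == '20131226095010'
assert timestamp_to_sec('2014') == timestamp_to_sec('20141231235959')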
def timestamp_to_http_date(string):
    """
    >>> timestamp_to_http_date('20131226095000')
    'Thu, 26 Dec 2013 09:50:00 GMT'

    >>> timestamp_to_http_date('20140126200804')
    'Sun, 26 Jan 2014 20:08:04 GMT'
    """
    return datetime_to_http_date(timestamp_to_datetime(string))


if __name__ == "__main__":
    import doctest
    doctest.testmod()
@ -1,8 +1,8 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from pywb.utils.canonicalize import canonicalize
from pywb.utils.loaders import extract_post_query, append_post_query

from pywb.warc.archiveiterator import ArchiveIterator
from warcio.timeutils import iso_date_to_timestamp
from warcio.archiveiterator import ArchiveIterator

import hashlib
import base64
@ -71,7 +71,7 @@ class ArchiveIndexEntryMixin(object):
        self['urlkey'] = canonicalize(url, surt_ordered)
        other['urlkey'] = self['urlkey']

        referer = other.record.status_headers.get_header('referer')
        referer = other.record.http_headers.get_header('referer')
        if referer:
            self['_referer'] = referer
@ -141,7 +141,7 @@ class DefaultRecordParser(object):
        for record in raw_iter:
            entry = None

            if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):
            if not include_all and not minimal and (record.http_headers.get_statuscode() == '-'):
                continue

            if record.rec_type == 'arc_header':
@ -175,13 +175,13 @@ class DefaultRecordParser(object):
                compute_digest = True

            elif not minimal and record.rec_type == 'request' and append_post:
                method = record.status_headers.protocol
                len_ = record.status_headers.get_header('Content-Length')
                method = record.http_headers.protocol
                len_ = record.http_headers.get_header('Content-Length')

                post_query = extract_post_query(method,
                                                entry.get('_content_type'),
                                                len_,
                                                record.stream)
                                                record.raw_stream)

                entry['_post_query'] = post_query
@ -236,7 +236,7 @@ class DefaultRecordParser(object):
        if record.rec_type == 'warcinfo':
            entry['url'] = record.rec_headers.get_header('WARC-Filename')
            entry['urlkey'] = entry['url']
            entry['_warcinfo'] = record.stream.read(record.length)
            entry['_warcinfo'] = record.raw_stream.read(record.length)
            return entry

        entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')
@ -252,13 +252,13 @@ class DefaultRecordParser(object):
            entry['mime'] = '-'
        else:
            def_mime = '-' if record.rec_type == 'request' else 'unk'
            entry.extract_mime(record.status_headers.
            entry.extract_mime(record.http_headers.
                               get_header('Content-Type'),
                               def_mime)

        # status -- only for response records (by convention):
        if record.rec_type == 'response' and not self.options.get('minimal'):
            entry.extract_status(record.status_headers)
            entry.extract_status(record.http_headers)
        else:
            entry['status'] = '-'
@ -304,7 +304,7 @@ class DefaultRecordParser(object):
            entry.extract_mime(record.rec_headers.get_header('content-type'))

            # status
            entry.extract_status(record.status_headers)
            entry.extract_status(record.http_headers)

            # digest
            entry['digest'] = '-'
@ -1,233 +0,0 @@
from pywb.utils.bufferedreaders import DecompressingBufferedReader

from pywb.warc.recordloader import ArcWarcRecordLoader

import six
import sys


# ============================================================================
BUFF_SIZE = 16384


class ArchiveIterator(six.Iterator):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.

    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
"""

    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
"""

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False, arc2warc=False, block_size=BUFF_SIZE):

        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.mixed_arc_warc = arc2warc

        self.member_info = None
        self.no_record_parse = no_record_parse

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        self._raise_invalid_gzip = False
        self._is_empty = False
        self._is_first = True
        self.last_record = None

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            if not self._is_first:
                self._finish_record()

            self._is_first = False

            try:
                self.last_record = self._next_record(self.next_line)
                if self._raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                return self.last_record

            except EOFError:
                self._is_empty = True

    def _finish_record(self):
        if self.last_record:
            self.read_to_end(self.last_record)

        if self.reader.decompressor:
            # if another gzip member, continue
            if self.reader.read_next_member():
                return

            # if empty record, then we're done
            elif self._is_empty:
                raise StopIteration()

            # otherwise, probably a gzip
            # containing multiple non-chunked records
            # raise this as an error
            else:
                self._raise_invalid_gzip = True

        # non-gzip, so we're done
        elif self._is_empty:
            raise StopIteration()

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be subtracted from
        the record length for uncompressed

        if first line read is not blank, likely error in WARC/ARC,
        display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            stripped = line.rstrip()

            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))

                first_line = False
                continue

            return line, empty_size

    def read_to_end(self, record, payload_callback=None):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(BUFF_SIZE)
            if not b:
                break
            num += len(b)
            if payload_callback:
                payload_callback(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        if not self.mixed_arc_warc:
            self.known_format = record.format

        return record
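# --- example (not part of the commit): a minimal sketch of driving the
# iterator above; 'example.warc.gz' is a hypothetical local file ---
with open('example.warc.gz', 'rb') as fh:
    for record in ArchiveIterator(fh):
        # rec_type is 'response', 'request', 'warcinfo', etc.
        print(record.rec_type,
              record.rec_headers.get_header('WARC-Target-URI'))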
@ -1,6 +1,6 @@
from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.recordloader import ArcWarcRecordLoader

from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.warc.recordloader import ArcWarcRecordLoader


#=================================================================
@ -1,298 +0,0 @@
import collections

from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException

from pywb.utils.limitreader import LimitReader
from pywb.utils.loaders import to_native_str

#from pywb.utils.wbexception import WbException
from pywb.utils.timeutils import timestamp_to_iso_date

from six.moves import zip


#=================================================================
#ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
#                                       'format, rec_type, rec_headers, ' +
#                                       'stream, status_headers, ' +
#                                       'content_type, length')

#=================================================================
class ArcWarcRecord(object):
    def __init__(self, *args):
        (self.format, self.rec_type, self.rec_headers, self.stream,
         self.status_headers, self.content_type, self.length) = args


#=================================================================
class ArchiveLoadFailed(Exception):
    def __init__(self, reason, filename=''):
        if filename:
            msg = filename + ': ' + str(reason)
        else:
            msg = str(reason)

        super(ArchiveLoadFailed, self).__init__(msg)
        self.msg = msg


#=================================================================
class ArcWarcRecordLoader(object):
    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
                  'OPTIONS', 'CONNECT', 'PATCH']

    NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)

        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)

    def parse_record_stream(self, stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to detect_type_loader_headers()
        to facilitate parsing.
        """
        (the_format, rec_headers) = (self.
                                     _detect_type_load_headers(stream,
                                                               statusline,
                                                               known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition
        if is_err:
            length = 0
        # or status and headers are completely empty (blank lines found)
        elif not rec_headers:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            status_headers = None  #StatusAndHeaders('', [])

        # if empty record (error or otherwise) set status to 204
        elif length == 0:
            if is_err:
                msg = '204 Possible Error'
            else:
                msg = '204 No Content'

            status_headers = StatusAndHeaders(msg, [])

        # response record or non-empty revisit: parse HTTP status and headers!
        elif (rec_type in ('response', 'revisit')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif ((rec_type == 'request')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_req_parser.parse(stream)

        # everything else: create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            status_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type,
                             rec_headers, stream, status_headers,
                             content_type, length)

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.
        """

        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                statusline = se.statusline
                pass

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))


#=================================================================
class ARCHeadersParser(object):
    # ARC 1.0 headers
    ARC_HEADERS = ["uri", "ip-address", "archive-date",
                   "content-type", "length"]

    def __init__(self):
        self.headernames = self.get_header_names()

    def get_rec_type(self):
        return 'arc'

    def parse(self, stream, headerline=None):
        total_read = 0

        def readline():
            return to_native_str(stream.readline())

        # if headerline passed in, use that
        if headerline is None:
            headerline = readline()
        else:
            headerline = to_native_str(headerline)

        header_len = len(headerline)

        if header_len == 0:
            raise EOFError()

        headerline = headerline.rstrip()

        headernames = self.headernames

        # if arc header, consume next two lines
        if headerline.startswith('filedesc://'):
            version = readline()  # skip version
            spec = readline()  # skip header spec, use preset one
            total_read += len(version)
            total_read += len(spec)

        parts = headerline.split(' ')

        if len(parts) != len(headernames):
            msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
            msg = msg.format(headernames, parts)
            raise StatusAndHeadersParserException(msg, parts)

        protocol, headers = self._get_protocol_and_headers(headerline, parts)

        return StatusAndHeaders(statusline='',
                                headers=headers,
                                protocol='WARC/1.0',
                                total_len=total_read)

    @classmethod
    def get_header_names(cls):
        return cls.ARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        headers = []

        for name, value in zip(self.headernames, parts):
            headers.append((name, value))

        return ('ARC/1.0', headers)


#=================================================================
class ARC2WARCHeadersParser(ARCHeadersParser):
    # Headers for converting ARC -> WARC Header
    ARC_TO_WARC_HEADERS = ["WARC-Target-URI",
                           "WARC-IP-Address",
                           "WARC-Date",
                           "Content-Type",
                           "Content-Length"]

    def get_rec_type(self):
        return 'arc2warc'

    @classmethod
    def get_header_names(cls):
        return cls.ARC_TO_WARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        headers = []

        for name, value in zip(self.headernames, parts):
            if name == 'WARC-Date':
                value = timestamp_to_iso_date(value)

            headers.append((name, value))

        if headerline.startswith('filedesc://'):
            rec_type = 'arc_header'
        else:
            rec_type = 'response'

        headers.append(('WARC-Type', rec_type))
        headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id()))

        return ('WARC/1.0', headers)
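# --- example (not part of the commit): a minimal sketch of parsing a single
# record with the loader above, from an in-memory buffer; the record content
# here is made up ---
from io import BytesIO

raw = BytesIO(b'WARC/1.0\r\n'
              b'WARC-Type: resource\r\n'
              b'WARC-Target-URI: http://example.com/\r\n'
              b'Content-Type: text/plain\r\n'
              b'Content-Length: 5\r\n'
              b'\r\n'
              b'hello\r\n\r\n')

record = ArcWarcRecordLoader().parse_record_stream(raw)
assert record.rec_type == 'resource'
assert record.stream.read() == b'hello'  # stream is limited to Content-Length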
@ -1,6 +1,7 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from warcio.recordloader import ArchiveLoadFailed
from warcio.timeutils import iso_date_to_timestamp

from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.wbexception import NotFoundException

import six
@ -27,20 +28,20 @@ class ResolvingLoader(object):
        elif headers_record != payload_record:
            # close remainder of stream as this record only used for
            # (already parsed) headers
            headers_record.stream.close()
            headers_record.raw_stream.close()

            # special case: check if headers record is actually empty
            # (eg empty revisit), then use headers from revisit
            if not headers_record.status_headers.headers:
            if not headers_record.http_headers.headers:
                headers_record = payload_record

        if not headers_record or not payload_record:
            raise ArchiveLoadFailed('Could not load ' + str(cdx))

        # ensure status line is valid from here
        headers_record.status_headers.validate_statusline('204 No Content')
        headers_record.http_headers.validate_statusline('204 No Content')

        return (headers_record.status_headers, payload_record.stream)
        return (headers_record.http_headers, payload_record.raw_stream)

    def load_headers_and_payload(self, cdx, failed_files, cdx_loader):
        """
@ -107,7 +108,7 @@ class ResolvingLoader(object):
        # optimization: if same file already failed this request,
        # don't try again
        if failed_files is not None and filename in failed_files:
            raise ArchiveLoadFailed('Skipping Already Failed', filename)
            raise ArchiveLoadFailed('Skipping Already Failed: ' + filename)

        any_found = False
        last_exc = None
@ -144,7 +145,7 @@ class ResolvingLoader(object):
            msg = 'Archive File Not Found'

            #raise ArchiveLoadFailed(msg, filename), None, last_traceback
            six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(msg, filename), last_traceback)
            six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(filename + ': ' + msg), last_traceback)

    def _load_different_url_payload(self, cdx, headers_record,
                                    failed_files, cdx_loader):
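# --- note (not part of the commit): warcio's ArchiveLoadFailed takes a single
# message argument, so the old (reason, filename) call sites above are
# collapsed into one pre-joined string, e.g.: ---
#
#   raise ArchiveLoadFailed('Skipping Already Failed: ' + filename)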
@ -292,13 +292,13 @@ import sys
import pprint
import six

from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from warcio.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper
from pywb.cdx.cdxobject import CDXObject

import pywb.utils.statusandheaders
import warcio.statusandheaders

from pywb import get_test_dir
@ -331,10 +331,12 @@ def load_test_archive(test_file, offset, length):
    archive = testloader.load(path, offset, length)

    pywb.utils.statusandheaders.WRAP_WIDTH = 160
    warcio.statusandheaders.WRAP_WIDTH = 160

    pprint.pprint(((archive.format, archive.rec_type),
                   archive.rec_headers, archive.status_headers), indent=1, width=160)
                   archive.rec_headers, archive.http_headers), indent=1, width=160)

    warcio.statusandheaders.WRAP_WIDTH = 80


#==============================================================================
@ -1,121 +0,0 @@
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.warc.warcwriter import BufferWARCWriter
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.archiveiterator import ArchiveIterator

from io import BytesIO
from collections import OrderedDict
import json


# ============================================================================
class FixedTestWARCWriter(BufferWARCWriter):
    @classmethod
    def _make_warc_id(cls, id_=None):
        return '<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>'

    @classmethod
    def _make_warc_date(cls):
        return '2000-01-01T00:00:00Z'


# ============================================================================
class TestWarcWriter(object):
    def _validate_record_content_len(self, stream):
        for record in ArchiveIterator(stream, no_record_parse=True):
            assert record.status_headers == None
            assert int(record.rec_headers.get_header('Content-Length')) == record.length
            assert record.length == len(record.stream.read())

    def test_warcinfo_record(self):
        simplewriter = FixedTestWARCWriter(gzip=False)
        params = OrderedDict([('software', 'recorder test'),
                              ('format', 'WARC File Format 1.0'),
                              ('json-metadata', json.dumps({'foo': 'bar'}))])

        record = simplewriter.create_warcinfo_record('testfile.warc.gz', params)
        simplewriter.write_record(record)
        buff = simplewriter.get_contents()
        assert isinstance(buff, bytes)

        buff = BytesIO(buff)
        parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)

        assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
        assert parsed_record.rec_headers.get_header('Content-Type') == 'application/warc-fields'
        assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'

        buff = parsed_record.stream.read().decode('utf-8')

        length = parsed_record.rec_headers.get_header('Content-Length')

        assert len(buff) == int(length)

        assert 'json-metadata: {"foo": "bar"}\r\n' in buff
        assert 'format: WARC File Format 1.0\r\n' in buff

        warcinfo_record = '\
WARC/1.0\r\n\
WARC-Type: warcinfo\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Filename: testfile.warc.gz\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
Content-Type: application/warc-fields\r\n\
Content-Length: 86\r\n\
\r\n\
software: recorder test\r\n\
format: WARC File Format 1.0\r\n\
json-metadata: {"foo": "bar"}\r\n\
\r\n\
\r\n\
'

        assert simplewriter.get_contents().decode('utf-8') == warcinfo_record

    def test_generate_response(self):
        headers_list = [('Content-Type', 'text/plain; charset="UTF-8"'),
                        ('Custom-Header', 'somevalue')
                       ]

        payload = b'some\ntext'

        status_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')

        writer = FixedTestWARCWriter(gzip=False)

        record = writer.create_warc_record('http://example.com/', 'response',
                                           payload=BytesIO(payload),
                                           length=len(payload),
                                           status_headers=status_headers)

        writer.write_record(record)

        buff = writer.get_contents()

        self._validate_record_content_len(BytesIO(buff))

        warc_record = '\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Block-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 97\r\n\
\r\n\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Custom-Header: somevalue\r\n\
\r\n\
some\n\
text\
\r\n\
\r\n\
'

        assert buff.decode('utf-8') == warc_record
@ -1,304 +0,0 @@
import tempfile
import uuid
import base64
import hashlib
import datetime
import zlib
import six

from socket import gethostname
from io import BytesIO

from pywb.utils.loaders import to_native_str
from pywb.utils.timeutils import datetime_to_iso_date

from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders

from pywb.warc.recordloader import ArcWarcRecord
from pywb.warc.recordloader import ArcWarcRecordLoader


# ============================================================================
class BaseWARCWriter(object):
    BUFF_SIZE = 16384

    WARC_RECORDS = {'warcinfo': 'application/warc-fields',
                    'response': 'application/http; msgtype=response',
                    'revisit': 'application/http; msgtype=response',
                    'request': 'application/http; msgtype=request',
                    'metadata': 'application/warc-fields',
                   }

    REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'

    WARC_VERSION = 'WARC/1.0'

    def __init__(self, gzip=True, header_filter=None, *args, **kwargs):
        self.gzip = gzip
        self.header_filter = header_filter
        self.hostname = gethostname()

        self.parser = StatusAndHeadersParser([], verify=False)
        self.warc_version = kwargs.get('warc_version', self.WARC_VERSION)

    @classmethod
    def _iter_stream(cls, stream):
        while True:
            buf = stream.read(cls.BUFF_SIZE)
            if not buf:
                return

            yield buf

    def ensure_digest(self, record):
        block_digest = record.rec_headers.get_header('WARC-Block-Digest')
        payload_digest = record.rec_headers.get_header('WARC-Payload-Digest')
        if block_digest and payload_digest:
            return

        block_digester = self._create_digester()
        payload_digester = self._create_digester()

        pos = record.stream.tell()

        if record.status_headers and hasattr(record.status_headers, 'headers_buff'):
            block_digester.update(record.status_headers.headers_buff)

        for buf in self._iter_stream(record.stream):
            block_digester.update(buf)
            payload_digester.update(buf)

        record.stream.seek(pos)
        record.rec_headers.add_header('WARC-Block-Digest', str(block_digester))
        record.rec_headers.add_header('WARC-Payload-Digest', str(payload_digester))

    def _create_digester(self):
        return Digester('sha1')

    def _set_header_buff(self, record):
        exclude_list = None
        if self.header_filter:
            exclude_list = self.header_filter(record)
        buff = record.status_headers.to_bytes(exclude_list)
        record.status_headers.headers_buff = buff

    def create_warcinfo_record(self, filename, info):
        warc_headers = StatusAndHeaders(self.warc_version, [])
        warc_headers.add_header('WARC-Type', 'warcinfo')
        warc_headers.add_header('WARC-Record-ID', self._make_warc_id())
        if filename:
            warc_headers.add_header('WARC-Filename', filename)
        warc_headers.add_header('WARC-Date', self._make_warc_date())

        warcinfo = BytesIO()
        for n, v in six.iteritems(info):
            self._header(warcinfo, n, v)

        warcinfo.seek(0)

        record = ArcWarcRecord('warc', 'warcinfo', warc_headers, warcinfo,
                               None, '', len(warcinfo.getvalue()))

        return record

    def copy_warc_record(self, payload):
        len_ = payload.tell()
        payload.seek(0)

        warc_headers = self.parser.parse(payload)

        record_type = warc_headers.get_header('WARC-Type', 'response')

        return self._fill_record(record_type, warc_headers, None, payload, '', len_)

    def create_warc_record(self, uri, record_type, payload,
                           length=None,
                           warc_content_type='',
                           warc_headers_dict={},
                           status_headers=None):

        if length is None:
            length = payload.tell()
            payload.seek(0)

        warc_headers = StatusAndHeaders(self.warc_version, list(warc_headers_dict.items()))
        warc_headers.replace_header('WARC-Type', record_type)
        if not warc_headers.get_header('WARC-Record-ID'):
            warc_headers.add_header('WARC-Record-ID', self._make_warc_id())

        if uri:
            warc_headers.replace_header('WARC-Target-URI', uri)

        if not warc_headers.get_header('WARC-Date'):
            warc_headers.add_header('WARC-Date', self._make_warc_date())

        return self._fill_record(record_type, warc_headers, status_headers,
                                 payload, warc_content_type, length)

    def _fill_record(self, record_type, warc_headers, status_headers, payload, warc_content_type, len_):
        has_http_headers = (record_type in ('request', 'response', 'revisit'))

        if not status_headers and has_http_headers:
            status_headers = self.parser.parse(payload)

        record = ArcWarcRecord('warc', record_type, warc_headers, payload,
                               status_headers, warc_content_type, len_)

        self.ensure_digest(record)

        if has_http_headers:
            self._set_header_buff(record)

        return record

    def _write_warc_record(self, out, record, adjust_cl=True):
        if self.gzip:
            out = GzippingWrapper(out)

        # compute Content-Type
        content_type = record.rec_headers.get_header('Content-Type')

        if not content_type:
            content_type = record.content_type

            if not content_type:
                content_type = self.WARC_RECORDS.get(record.rec_headers.get_header('WARC-Type'))

            if content_type:
                record.rec_headers.replace_header('Content-Type', content_type)
                #self._header(out, 'Content-Type', content_type)

        if record.rec_headers.get_header('WARC-Type') == 'revisit':
            http_headers_only = True
        else:
            http_headers_only = False

        # compute Content-Length
        if record.length or record.status_headers:
            actual_len = 0
            if record.status_headers:
                actual_len = len(record.status_headers.headers_buff)

            if not http_headers_only:
                if adjust_cl:
                    diff = record.stream.tell() - actual_len
                else:
                    diff = 0

                actual_len = record.length - diff

            record.rec_headers.replace_header('Content-Length', str(actual_len))
            #self._header(out, 'Content-Length', str(actual_len))

            # add empty line
            #self._line(out, b'')

            # write record headers
            out.write(record.rec_headers.to_bytes())

            # write headers buffer, if any
            if record.status_headers:
                out.write(record.status_headers.headers_buff)

            if not http_headers_only:
                for buf in self._iter_stream(record.stream):
                    out.write(buf)
                #out.write(record.stream.read())

            # add two lines
            self._line(out, b'\r\n')
        else:
            # add three lines (1 for end of header, 2 for end of record)
            self._line(out, b'Content-Length: 0\r\n\r\n')

        out.flush()

    def _header(self, out, name, value):
        if not value:
            return

        self._line(out, (name + ': ' + str(value)).encode('latin-1'))

    def _line(self, out, line):
        out.write(line + b'\r\n')

    @classmethod
    def _make_warc_id(cls, id_=None):
        if not id_:
            id_ = uuid.uuid1()
        return '<urn:uuid:{0}>'.format(id_)

    @classmethod
    def _make_warc_date(cls):
        return datetime_to_iso_date(datetime.datetime.utcnow())


# ============================================================================
class GzippingWrapper(object):
    def __init__(self, out):
        self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
        self.out = out

    def write(self, buff):
        #if isinstance(buff, str):
        #    buff = buff.encode('utf-8')
        buff = self.compressor.compress(buff)
        self.out.write(buff)

    def flush(self):
        buff = self.compressor.flush()
        self.out.write(buff)
        self.out.flush()
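# --- example (not part of the commit): each record is compressed as its own
# gzip member (zlib wbits = MAX_WBITS + 16 emits a gzip header), which is what
# lets readers seek to record boundaries; a minimal sketch ---
import zlib
from io import BytesIO

out = BytesIO()
comp = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
out.write(comp.compress(b'WARC/1.0\r\n...record bytes...'))
out.write(comp.flush())  # flush() closes this gzip member; next record starts a new one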
# ============================================================================
class Digester(object):
    def __init__(self, type_='sha1'):
        self.type_ = type_
        self.digester = hashlib.new(type_)

    def update(self, buff):
        self.digester.update(buff)

    def __str__(self):
        return self.type_ + ':' + to_native_str(base64.b32encode(self.digester.digest()))
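# --- example (not part of the commit): Digester serializes as
# '<type>:<base32 digest>', matching the WARC-Payload-Digest value asserted in
# the test above for the same payload ---
d = Digester('sha1')
d.update(b'some\ntext')
assert str(d) == 'sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O'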
# ============================================================================
class BufferWARCWriter(BaseWARCWriter):
    def __init__(self, *args, **kwargs):
        super(BufferWARCWriter, self).__init__(*args, **kwargs)
        self.out = self._create_buffer()

    def _create_buffer(self):
        return tempfile.SpooledTemporaryFile(max_size=512*1024)

    def write_record(self, record):
        self._write_warc_record(self.out, record)

    def get_contents(self):
        pos = self.out.tell()
        self.out.seek(0)
        buff = self.out.read()
        self.out.seek(pos)
        return buff


# ============================================================================
class FileWARCWriter(BufferWARCWriter):
    def __init__(self, *args, **kwargs):
        file_or_buff = None
        if len(args) > 0:
            file_or_buff = args[0]
        else:
            file_or_buff = kwargs.get('file')

        if isinstance(file_or_buff, str):
            self.out = open(file_or_buff, 'rb')
        elif hasattr(file_or_buff, 'read'):
            self.out = file_or_buff
        else:
            raise Exception('file must be a readable or valid filename')
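# --- example (not part of the commit): a minimal sketch of writing one
# response record with BufferWARCWriter; the payload and headers mirror the
# test_generate_response test above ---
from io import BytesIO

writer = BufferWARCWriter(gzip=False)
payload = b'some\ntext'
status_headers = StatusAndHeaders('200 OK',
                                  [('Content-Type', 'text/plain')],
                                  protocol='HTTP/1.0')
record = writer.create_warc_record('http://example.com/', 'response',
                                   payload=BytesIO(payload),
                                   length=len(payload),
                                   status_headers=status_headers)
writer.write_record(record)
warc_bytes = writer.get_contents()  # the full serialized WARC record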
@ -5,7 +5,8 @@ import json
import time
import os

from pywb.utils.timeutils import timestamp_now
from warcio.timeutils import timestamp_now

from pywb.cdx.cdxops import process_cdx
from pywb.cdx.query import CDXQuery
@ -2,7 +2,7 @@ from pywb.webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoad
from pywb.webagg.utils import MementoUtils
from pywb.utils.wbexception import BadRequestException, WbException
from pywb.utils.wbexception import NotFoundException
from pywb.warc.recordloader import ArchiveLoadFailed
from warcio.recordloader import ArchiveLoadFailed

from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
@ -1,6 +1,6 @@
from pywb.utils.binsearch import iter_range
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
from pywb.utils.timeutils import timestamp_now
from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
from warcio.timeutils import timestamp_now
from pywb.utils.canonicalize import canonicalize
from pywb.utils.wbexception import NotFoundException
@ -1,6 +1,7 @@
from warcio.limitreader import LimitReader
from warcio.statusandheaders import StatusAndHeadersParser

from pywb.utils.loaders import extract_post_query, append_post_query
from pywb.utils.limitreader import LimitReader
from pywb.utils.statusandheaders import StatusAndHeadersParser

from six.moves.urllib.parse import urlsplit, quote
from six import iteritems, StringIO
@ -3,7 +3,7 @@ from pywb.utils.wbexception import NotFoundException
from pywb.webagg.indexsource import BaseIndexSource, RemoteIndexSource
from pywb.webagg.responseloader import LiveWebLoader
from pywb.webagg.utils import ParamFormatter, res_template
from pywb.utils.timeutils import timestamp_now
from warcio.timeutils import timestamp_now


#=============================================================================
@ -2,12 +2,13 @@ from pywb.webagg.utils import MementoUtils, StreamIter, compress_gzip_iter
from pywb.webagg.utils import ParamFormatter
from pywb.webagg.indexsource import RedisIndexSource

from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date
from warcio.timeutils import timestamp_to_datetime, datetime_to_timestamp
from warcio.timeutils import iso_date_to_datetime, datetime_to_iso_date
from warcio.timeutils import http_date_to_datetime, datetime_to_http_date

from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser

from pywb.utils.wbexception import LiveResourceException, WbException
from pywb.utils.statusandheaders import StatusAndHeaders, StatusAndHeadersParser

from pywb.warc.resolvingloader import ResolvingLoader
@ -243,11 +244,11 @@ class WARCPathLoader(BaseLoader):
        status = cdx.get('status')
        if not status or status.startswith('3'):
            status_headers = self.headers_parser.parse(payload.stream)
            http_headers = self.headers_parser.parse(payload.raw_stream)
            self.raise_on_self_redirect(params, cdx,
                                        status_headers.get_statuscode(),
                                        status_headers.get_header('Location'))
            http_headers_buff = status_headers.to_bytes()
                                        http_headers.get_statuscode(),
                                        http_headers.get_header('Location'))
            http_headers_buff = http_headers.to_bytes()
        else:
            http_headers_buff = None
@ -266,9 +267,9 @@ class WARCPathLoader(BaseLoader):
            warc_headers.replace_header('WARC-Date',
                                        headers.rec_headers.get_header('WARC-Date'))

            headers.stream.close()
            headers.raw_stream.close()

        return (warc_headers, http_headers_buff, payload.stream)
        return (warc_headers, http_headers_buff, payload.raw_stream)

    def __str__(self):
        return 'WARCPathLoader'
@ -13,10 +13,10 @@ from pywb.webagg.aggregator import DirectoryIndexSource
from pywb.webagg.app import ResAggApp
from pywb.webagg.utils import MementoUtils

from pywb.warc.recordloader import ArcWarcRecordLoader
from warcio.recordloader import ArcWarcRecordLoader
from warcio.statusandheaders import StatusAndHeadersParser
from warcio.bufferedreaders import ChunkedDataReader

from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader
from io import BytesIO
from six.moves.urllib.parse import urlencode
@ -215,9 +215,9 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
        buff = BytesIO(resp.body)
        record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False)
        print(record.status_headers)
        assert record.status_headers.get_statuscode() == '302'
        assert record.status_headers.get_header('Location') == 'https://www.iana.org/'
        print(record.http_headers)
        assert record.http_headers.get_statuscode() == '302'
        assert record.http_headers.get_header('Location') == 'https://www.iana.org/'

    @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
    def test_agg_select_live(self):
@ -3,7 +3,7 @@ from pywb.webagg.indexsource import LiveIndexSource
from pywb.webagg.aggregator import SimpleAggregator

from pywb.utils.timeutils import timestamp_now
from warcio.timeutils import timestamp_now

from .testutils import key_ts_res, TEST_CDX_PATH
@ -8,7 +8,7 @@ from pywb.webagg.handlers import DefaultResourceHandler
from pywb.webagg.aggregator import SimpleAggregator
from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource

from pywb.warc.recordloader import ArcWarcRecordLoader
from warcio.recordloader import ArcWarcRecordLoader

from .testutils import LiveServerTests, BaseTestClass
@ -48,7 +48,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
        record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False)
        assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
        assert record.status_headers.get_header('Date') != ''
        assert record.http_headers.get_header('Date') != ''

    def test_upstream_1(self):
        resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get')
@ -58,7 +58,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
        record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
        assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
        assert record.status_headers.get_header('Date') != ''
        assert record.http_headers.get_header('Date') != ''

    def test_upstream_2(self):
        resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get')
@ -68,7 +68,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
|
||||
|
||||
record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
|
||||
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
|
||||
assert record.status_headers.get_header('Date') != ''
|
||||
assert record.http_headers.get_header('Date') != ''
|
||||
|
||||
|
||||
|
||||
|
@ -7,7 +7,9 @@ import zlib

from contextlib import closing

from pywb.utils.timeutils import timestamp_to_http_date
from warcio.timeutils import timestamp_to_http_date
from warcio.utils import BUFF_SIZE

from pywb.utils.wbexception import BadRequestException
from pywb.utils.loaders import load_yaml_config

@ -20,9 +22,6 @@ LINK_SEG_SPLIT = re.compile(';\s*')
LINK_URL = re.compile('<(.*)>')
LINK_PROP = re.compile('([\w]+)="([^"]+)')

BUFF_SIZE = 16384


#=============================================================================
class MementoException(BadRequestException):
pass

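These two memento hunks pair up: the module-level BUFF_SIZE = 16384 constant is dropped because warcio.utils now provides it, and timestamp_to_http_date moves with the rest of timeutils. A quick sketch of both replacements:

from warcio.timeutils import timestamp_to_http_date
from warcio.utils import BUFF_SIZE

# 14-digit timestamp -> RFC 1123 HTTP date string
print(timestamp_to_http_date('20170321120000'))  # e.g. 'Tue, 21 Mar 2017 12:00:00 GMT'
print(BUFF_SIZE)  # shared buffer size replacing the deleted local constant
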
@ -15,7 +15,7 @@ from pywb.cdx.cdxobject import IDXObject, CDXException, CDXObject
from pywb.cdx.query import CDXQuery

from pywb.utils.loaders import BlockLoader, read_last_line
from pywb.utils.bufferedreaders import gzip_decompressor
from warcio.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch, search

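gzip_decompressor also moves to warcio.bufferedreaders; it returns a zlib decompress object preconfigured for gzip-wrapped streams. A minimal sketch:

import gzip
from warcio.bufferedreaders import gzip_decompressor

compressed = gzip.compress(b'hello warcio')
decomp = gzip_decompressor()
assert decomp.decompress(compressed) == b'hello warcio'
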
@ -5,9 +5,11 @@ import logging

from datetime import datetime

from warcio.statusandheaders import StatusAndHeaders
from warcio.timeutils import datetime_to_timestamp

from pywb.utils.wbexception import NotFoundException
from pywb.utils.loaders import LocalFileLoader
from pywb.utils.statusandheaders import StatusAndHeaders

from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse

@ -19,7 +21,6 @@ from pywb.warc.pathresolvers import PathResolverMapper
from pywb.webapp.views import J2TemplateView, init_view
from pywb.webapp.replay_views import ReplayView
from pywb.framework.memento import MementoResponse
from pywb.utils.timeutils import datetime_to_timestamp


#=================================================================

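Here datetime_to_timestamp is both repointed to warcio.timeutils and hoisted up with the other warcio imports. A quick sketch of the helper itself:

from datetime import datetime
from warcio.timeutils import datetime_to_timestamp

# datetime -> 14-digit timestamp string
print(datetime_to_timestamp(datetime(2017, 3, 21, 12, 0, 0)))  # '20170321120000'
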
@ -1,5 +1,6 @@
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.limitreader import LimitReader
from warcio.statusandheaders import StatusAndHeaders
from warcio.limitreader import LimitReader

from pywb.framework.cache import create_cache

from tempfile import NamedTemporaryFile, mkdtemp

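StatusAndHeaders keeps the same constructor under warcio; a minimal sketch of building a header block by hand:

from warcio.statusandheaders import StatusAndHeaders

status_headers = StatusAndHeaders('200 OK',
                                  [('Content-Type', 'text/html')],
                                  protocol='HTTP/1.1')
assert status_headers.get_statuscode() == '200'
assert status_headers.get_header('Content-Type') == 'text/html'
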
@ -5,16 +5,17 @@ from io import BytesIO
from six.moves.urllib.parse import urlsplit
from itertools import chain

from pywb.utils.statusandheaders import StatusAndHeaders
from warcio.statusandheaders import StatusAndHeaders
from warcio.limitreader import LimitReader
from warcio.timeutils import timestamp_now
from warcio.recordloader import ArchiveLoadFailed

from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.limitreader import LimitReader
from pywb.utils.timeutils import timestamp_now

from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse

from pywb.rewrite.rewrite_content import RewriteContent
from pywb.warc.recordloader import ArchiveLoadFailed

from pywb.webapp.views import HeadInsertView

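ArchiveLoadFailed now comes from warcio.recordloader and, per the commit message, is constructed with a single message argument. A minimal sketch:

from warcio.recordloader import ArchiveLoadFailed

# single-param init, per this commit's message
try:
    raise ArchiveLoadFailed('unknown archive format')
except ArchiveLoadFailed as e:
    print(e)
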
@ -1,4 +1,4 @@
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec
from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import make_timemap, LINK_FORMAT