1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

spin-off warcio!

update imports to point to warcio
warcio rename fixes:
- ArcWarcRecord.stream -> raw_stream
- ArcWarcRecord.status_headers -> http_headers
- ArchiveLoadFailed single param init
This commit is contained in:
Ilya Kreymer 2017-03-01 18:37:38 -08:00
parent 4a94699a65
commit 0784e4e5aa
59 changed files with 186 additions and 2538 deletions

View File

@ -3,8 +3,9 @@ from pywb.cdx.cdxobject import TIMESTAMP, STATUSCODE, MIMETYPE, DIGEST
from pywb.cdx.cdxobject import OFFSET, LENGTH, FILENAME from pywb.cdx.cdxobject import OFFSET, LENGTH, FILENAME
from pywb.cdx.query import CDXQuery from pywb.cdx.query import CDXQuery
from pywb.utils.timeutils import timestamp_to_sec, pad_timestamp
from pywb.utils.timeutils import PAD_14_DOWN, PAD_14_UP from warcio.timeutils import timestamp_to_sec, pad_timestamp
from warcio.timeutils import PAD_14_DOWN, PAD_14_UP
import bisect import bisect

View File

@ -14,7 +14,7 @@ com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ
from fakeredis import FakeStrictRedis from fakeredis import FakeStrictRedis
from mock import patch from mock import patch
from pywb.utils.timeutils import timestamp_to_sec from warcio.timeutils import timestamp_to_sec
from pywb.cdx.cdxsource import RedisCDXSource from pywb.cdx.cdxsource import RedisCDXSource
from pywb.cdx.cdxserver import CDXServer from pywb.cdx.cdxserver import CDXServer

View File

@ -13,7 +13,7 @@ from pywb.cdx.cdxsource import CDXSource
from pywb.cdx.cdxobject import IDXObject, CDXException from pywb.cdx.cdxobject import IDXObject, CDXException
from pywb.utils.loaders import BlockLoader, read_last_line from pywb.utils.loaders import BlockLoader, read_last_line
from pywb.utils.bufferedreaders import gzip_decompressor from warcio.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch, search from pywb.utils.binsearch import iter_range, linearsearch, search

View File

@ -1,6 +1,6 @@
from pywb.utils.wbexception import BadRequestException from pywb.utils.wbexception import BadRequestException
from pywb.utils.timeutils import http_date_to_timestamp from warcio.timeutils import http_date_to_timestamp
from pywb.utils.timeutils import timestamp_to_http_date from warcio.timeutils import timestamp_to_http_date
from pywb.framework.wbrequestresponse import WbRequest, WbResponse from pywb.framework.wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl

View File

@ -4,6 +4,7 @@ from pywb.framework.wbrequestresponse import WbResponse, WbRequest
from pywb.framework.archivalrouter import ArchivalRouter from pywb.framework.archivalrouter import ArchivalRouter
from six.moves.urllib.parse import urlsplit from six.moves.urllib.parse import urlsplit
from six import iteritems
import base64 import base64
import socket import socket
@ -15,8 +16,8 @@ from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter, UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent
from pywb.utils.wbexception import BadRequestException from pywb.utils.wbexception import BadRequestException
from pywb.utils.bufferedreaders import BufferedReader from warcio.bufferedreaders import BufferedReader
from pywb.utils.loaders import to_native_str from warcio.utils import to_native_str
from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver
@ -250,7 +251,8 @@ class ProxyRouter(object):
# add extra headers for replay responses # add extra headers for replay responses
if wbrequest.wb_url and wbrequest.wb_url.is_replay(): if wbrequest.wb_url and wbrequest.wb_url.is_replay():
response.status_headers.replace_headers(self.extra_headers) for name, value in iteritems(self.extra_headers):
response.status_headers.replace_header(name, value)
# check for content-length # check for content-length
res = response.status_headers.get_header('content-length') res = response.status_headers.get_header('content-length')

View File

@ -1,7 +1,6 @@
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.utils.loaders import extract_client_cookie from pywb.utils.loaders import extract_client_cookie
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.framework.cache import create_cache from pywb.framework.cache import create_cache
@ -10,7 +9,8 @@ from pywb.framework.basehandlers import WbUrlHandler
from six.moves.urllib.parse import parse_qs, urlsplit from six.moves.urllib.parse import parse_qs, urlsplit
import six import six
from pywb.utils.loaders import to_native_str from warcio.statusandheaders import StatusAndHeaders
from warcio.utils import to_native_str
import base64 import base64
import os import os

View File

@ -61,7 +61,7 @@
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
from pywb.framework.wbrequestresponse import WbRequest, WbResponse from pywb.framework.wbrequestresponse import WbRequest, WbResponse

View File

@ -1,4 +1,4 @@
from pywb.utils.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.loaders import extract_post_query, append_post_query from pywb.utils.loaders import extract_post_query, append_post_query
from io import BytesIO from io import BytesIO

View File

@ -1,7 +1,10 @@
from pywb.utils.wbexception import WbException, NotFoundException from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import load_yaml_config, to_native_str from pywb.utils.loaders import load_yaml_config
from pywb.utils.loaders import load_yaml_config
from warcio.utils import to_native_str
from pywb.framework.wbrequestresponse import WbResponse, StatusAndHeaders from pywb.framework.wbrequestresponse import WbResponse
from warcio.statusandheaders import StatusAndHeaders
import os import os

View File

@ -13,7 +13,7 @@ from pkg_resources import resource_string
from argparse import ArgumentParser, RawTextHelpFormatter from argparse import ArgumentParser, RawTextHelpFormatter
from pywb.utils.loaders import load_yaml_config from pywb.utils.loaders import load_yaml_config
from pywb.utils.timeutils import timestamp20_now from warcio.timeutils import timestamp20_now
from pywb import DEFAULT_CONFIG from pywb import DEFAULT_CONFIG

View File

@ -1,4 +1,4 @@
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_iso_date from warcio.timeutils import timestamp_to_datetime, datetime_to_iso_date
import re import re

View File

@ -7,12 +7,11 @@ import traceback
import portalocker import portalocker
from pywb.utils.timeutils import timestamp20_now from warcio.timeutils import timestamp20_now
from warcio.warcwriter import BaseWARCWriter
from pywb.webagg.utils import res_template from pywb.webagg.utils import res_template
from pywb.warc.warcwriter import BaseWARCWriter
# ============================================================================ # ============================================================================
class MultiFileWARCWriter(BaseWARCWriter): class MultiFileWARCWriter(BaseWARCWriter):
@ -22,6 +21,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
max_idle_secs=1800, *args, **kwargs): max_idle_secs=1800, *args, **kwargs):
super(MultiFileWARCWriter, self).__init__(*args, **kwargs) super(MultiFileWARCWriter, self).__init__(*args, **kwargs)
self.header_filter = kwargs.get('header_filter')
if not filename_template: if not filename_template:
dir_template, filename_template = os.path.split(dir_template) dir_template, filename_template = os.path.split(dir_template)
dir_template += os.path.sep dir_template += os.path.sep
@ -41,27 +42,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
self.fh_cache = {} self.fh_cache = {}
def write_req_resp(self, req, resp, params):
url = resp.rec_headers.get_header('WARC-Target-URI')
dt = resp.rec_headers.get_header('WARC-Date')
#req.rec_headers['Content-Type'] = req.content_type
req.rec_headers.replace_header('WARC-Target-URI', url)
req.rec_headers.replace_header('WARC-Date', dt)
resp_id = resp.rec_headers.get_header('WARC-Record-ID')
if resp_id:
req.rec_headers.add_header('WARC-Concurrent-To', resp_id)
resp = self._check_revisit(resp, params)
if not resp:
print('Skipping due to dedup')
return
self._do_write_req_resp(req, resp, params)
def _check_revisit(self, record, params): def _check_revisit(self, record, params):
if not self.dedup_index: if not self.dedup_index or record.rec_type != 'response':
return record return record
try: try:
@ -77,14 +59,18 @@ class MultiFileWARCWriter(BaseWARCWriter):
return None return None
if isinstance(result, tuple) and result[0] == 'revisit': if isinstance(result, tuple) and result[0] == 'revisit':
record.rec_headers.replace_header('WARC-Type', 'revisit') record = self.create_revisit_record(url, digest, result[1], result[2],
record.rec_headers.add_header('WARC-Profile', self.REVISIT_PROFILE) http_headers=record.http_headers)
record.rec_headers.add_header('WARC-Refers-To-Target-URI', result[1])
record.rec_headers.add_header('WARC-Refers-To-Date', result[2])
return record return record
def _set_header_buff(self, record):
exclude_list = None
if self.header_filter:
exclude_list = self.header_filter(record)
buff = record.http_headers.to_bytes(exclude_list)
record.http_headers.headers_buff = buff
def get_new_filename(self, dir_, params): def get_new_filename(self, dir_, params):
timestamp = timestamp20_now() timestamp = timestamp20_now()
@ -153,6 +139,11 @@ class MultiFileWARCWriter(BaseWARCWriter):
self._do_write_req_resp(None, record, params) self._do_write_req_resp(None, record, params)
def _do_write_req_resp(self, req, resp, params): def _do_write_req_resp(self, req, resp, params):
resp = self._check_revisit(resp, params)
if not resp:
print('Skipping due to dedup')
return
def write_callback(out, filename): def write_callback(out, filename):
#url = resp.rec_headers.get_header('WARC-Target-URI') #url = resp.rec_headers.get_header('WARC-Target-URI')
#print('Writing req/resp {0} to {1} '.format(url, filename)) #print('Writing req/resp {0} to {1} '.format(url, filename))
@ -180,6 +171,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
close_file = False close_file = False
new_size = start = 0
if result: if result:
out, filename = result out, filename = result
is_new = False is_new = False

View File

@ -69,16 +69,21 @@ class RecorderApp(object):
req_head, req_pay, resp_head, resp_pay, params = result req_head, req_pay, resp_head, resp_pay, params = result
#resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay) #resp_type, resp = self.writer.read_resp_record(resp_head, resp_pay)
resp = self.writer.copy_warc_record(resp_pay) resp_length = resp_pay.tell()
resp_pay.seek(0)
resp = self.writer.create_record_from_stream(resp_pay, resp_length)
if resp.rec_type == 'response': if resp.rec_type == 'response':
uri = resp.rec_headers.get_header('WARC-Target-Uri') uri = resp.rec_headers.get_header('WARC-Target-Uri')
req_length = req_pay.tell()
req_pay.seek(0)
req = self.writer.create_warc_record(uri=uri, req = self.writer.create_warc_record(uri=uri,
record_type='request', record_type='request',
payload=req_pay, payload=req_pay,
length=req_length,
warc_headers_dict=req_head) warc_headers_dict=req_head)
self.writer.write_req_resp(req, resp, params) self.writer.write_request_response_pair(req, resp, params)
else: else:
self.writer.write_record(resp, params) self.writer.write_record(resp, params)
@ -133,9 +138,13 @@ class RecorderApp(object):
content_type = headers.get('Content-Type') content_type = headers.get('Content-Type')
payload_length = req_stream.out.tell()
req_stream.out.seek(0)
record = self.writer.create_warc_record(uri=params['url'], record = self.writer.create_warc_record(uri=params['url'],
record_type=record_type, record_type=record_type,
payload=req_stream.out, payload=req_stream.out,
length=payload_length,
warc_content_type=content_type, warc_content_type=content_type,
warc_headers_dict=req_stream.headers) warc_headers_dict=req_stream.headers)

View File

@ -1,7 +1,8 @@
from pywb.utils.canonicalize import calc_search_range from pywb.utils.canonicalize import calc_search_range
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from pywb.warc.cdxindexer import write_cdx_index from pywb.warc.cdxindexer import write_cdx_index
from pywb.utils.timeutils import iso_date_to_timestamp
from warcio.timeutils import iso_date_to_timestamp
from io import BytesIO from io import BytesIO
import os import os

View File

@ -20,11 +20,13 @@ from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitD
from pywb.webagg.utils import MementoUtils from pywb.webagg.utils import MementoUtils
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import DecompressingBufferedReader from warcio.statusandheaders import StatusAndHeadersParser
from pywb.warc.recordloader import ArcWarcRecordLoader from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from warcio.archiveiterator import ArchiveIterator
from pywb.warc.cdxindexer import write_cdx_index from pywb.warc.cdxindexer import write_cdx_index
from pywb.warc.archiveiterator import ArchiveIterator
from six.moves.urllib.parse import quote, unquote, urlencode from six.moves.urllib.parse import quote, unquote, urlencode
from io import BytesIO from io import BytesIO
@ -122,9 +124,9 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
filename = os.path.join(base_dir, filename) filename = os.path.join(base_dir, filename)
with open(filename, 'rb') as fh: with open(filename, 'rb') as fh:
for record in ArchiveIterator(fh, no_record_parse=True): for record in ArchiveIterator(fh, no_record_parse=True):
assert record.status_headers == None assert record.http_headers == None
assert int(record.rec_headers.get_header('Content-Length')) == record.length assert int(record.rec_headers.get_header('Content-Length')) == record.length
assert record.length == len(record.stream.read()) assert record.length == len(record.raw_stream.read())
def test_record_warc_1(self): def test_record_warc_1(self):
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
@ -168,16 +170,16 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
buff = BytesIO(resp.body) buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff) record = ArcWarcRecordLoader().parse_record_stream(buff)
assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers
stored_req, stored_resp = self._load_resp_req(base_path) stored_req, stored_resp = self._load_resp_req(base_path)
assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.status_headers.headers assert ('Set-Cookie', 'name=value; Path=/') in stored_resp.http_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.status_headers.headers assert ('Set-Cookie', 'foo=bar; Path=/') in stored_resp.http_headers.headers
assert ('X-Other', 'foo') in stored_req.status_headers.headers assert ('X-Other', 'foo') in stored_req.http_headers.headers
assert ('Cookie', 'boo=far') in stored_req.status_headers.headers assert ('Cookie', 'boo=far') in stored_req.http_headers.headers
self._test_all_warcs('/warcs/cookiecheck/', 1) self._test_all_warcs('/warcs/cookiecheck/', 1)
@ -193,16 +195,16 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
buff = BytesIO(resp.body) buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff) record = ArcWarcRecordLoader().parse_record_stream(buff)
assert ('Set-Cookie', 'name=value; Path=/') in record.status_headers.headers assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') in record.status_headers.headers assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers
stored_req, stored_resp = self._load_resp_req(warc_path) stored_req, stored_resp = self._load_resp_req(warc_path)
assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.status_headers.headers assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.http_headers.headers
assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.status_headers.headers assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.http_headers.headers
assert ('X-Other', 'foo') in stored_req.status_headers.headers assert ('X-Other', 'foo') in stored_req.http_headers.headers
assert ('Cookie', 'boo=far') not in stored_req.status_headers.headers assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers
self._test_all_warcs('/warcs/cookieskip/', 1) self._test_all_warcs('/warcs/cookieskip/', 1)
@ -530,10 +532,10 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert status_headers.get_header('Content-Length') == str(len(buff)) assert status_headers.get_header('Content-Length') == str(len(buff))
assert status_headers.get_header('WARC-Custom') == 'foo' assert status_headers.get_header('WARC-Custom') == 'foo'
assert record.stream.read() == buff assert record.raw_stream.read() == buff
status_headers = record.status_headers status_headers = record.http_headers
assert len(record.status_headers.headers) == 2 assert len(record.http_headers.headers) == 2
assert status_headers.get_header('Content-Type') == 'text/plain' assert status_headers.get_header('Content-Type') == 'text/plain'
assert status_headers.get_header('Content-Length') == str(len(buff)) assert status_headers.get_header('Content-Length') == str(len(buff))

View File

@ -1,5 +1,5 @@
from pywb.utils.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.timeutils import datetime_to_http_date from warcio.timeutils import datetime_to_http_date
from datetime import datetime, timedelta from datetime import datetime, timedelta
import six import six

View File

@ -12,10 +12,11 @@ from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
from pywb.rewrite.rewriterules import RewriteRules from pywb.rewrite.rewriterules import RewriteRules
from pywb.utils.dsrules import RuleSet from pywb.utils.dsrules import RuleSet
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import DecompressingBufferedReader from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader from warcio.bufferedreaders import DecompressingBufferedReader
from pywb.utils.loaders import to_native_str from warcio.bufferedreaders import ChunkedDataReader, BufferedReader
from warcio.utils import to_native_str
from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter from pywb.rewrite.regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter

View File

@ -11,10 +11,11 @@ import os
from six.moves.urllib.parse import urlsplit from six.moves.urllib.parse import urlsplit
import six import six
from warcio.timeutils import timestamp_now
from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url from pywb.utils.loaders import is_http, LimitReader, LocalFileLoader, to_file_url
from pywb.utils.loaders import extract_client_cookie from pywb.utils.loaders import extract_client_cookie
from pywb.utils.timeutils import timestamp_now
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent

View File

@ -50,9 +50,9 @@ HTTP Headers Rewriting
from pywb.rewrite.header_rewriter import HeaderRewriter from pywb.rewrite.header_rewriter import HeaderRewriter
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.timeutils import datetime_to_http_date from warcio.timeutils import datetime_to_http_date
from datetime import datetime from datetime import datetime
import pprint import pprint

View File

@ -1,5 +1,5 @@
from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter, HostScopeCookieRewriter from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter, HostScopeCookieRewriter
from pywb.utils.timeutils import datetime_to_http_date from warcio.timeutils import datetime_to_http_date
from six.moves import zip from six.moves import zip
import redis import redis

View File

@ -6,10 +6,10 @@ from pywb.framework.archivalrouter import Route
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.warc.recordloader import ArcWarcRecordLoader from warcio.recordloader import ArcWarcRecordLoader
from pywb.webapp.live_rewrite_handler import RewriteHandler from pywb.webapp.live_rewrite_handler import RewriteHandler
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp from warcio.timeutils import http_date_to_timestamp
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from io import BytesIO from io import BytesIO
@ -81,7 +81,7 @@ class PlatformHandler(RewriteHandler):
head_insert_func = self.head_insert_view.create_insert_func(wbrequest) head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter, result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
record.status_headers, record.http_headers,
record.stream, record.stream,
head_insert_func, head_insert_func,
urlkey, urlkey,

View File

@ -6,14 +6,15 @@ from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.loaders import extract_client_cookie from pywb.utils.loaders import extract_client_cookie
from pywb.utils.bufferedreaders import BufferedReader
from warcio.timeutils import http_date_to_timestamp
from warcio.bufferedreaders import BufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from pywb.webagg.utils import BUFF_SIZE from pywb.webagg.utils import BUFF_SIZE
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.webagg.utils import MementoUtils, buffer_iter from pywb.webagg.utils import MementoUtils, buffer_iter
@ -200,11 +201,11 @@ class RewriterApp(object):
self._add_custom_params(cdx, r.headers, kwargs) self._add_custom_params(cdx, r.headers, kwargs)
if readd_range: if readd_range:
content_length = (record.status_headers. content_length = (record.http_headers.
get_header('Content-Length')) get_header('Content-Length'))
try: try:
content_length = int(content_length) content_length = int(content_length)
record.status_headers.add_range(0, content_length, record.http_headers.add_range(0, content_length,
content_length) content_length)
except (ValueError, TypeError): except (ValueError, TypeError):
pass pass
@ -228,8 +229,8 @@ class RewriterApp(object):
cookie_key) cookie_key)
result = self.content_rewriter.rewrite_content(urlrewriter, result = self.content_rewriter.rewrite_content(urlrewriter,
record.status_headers, record.http_headers,
record.stream, record.raw_stream,
head_insert_func, head_insert_func,
urlkey, urlkey,
cdx, cdx,

View File

@ -1,5 +1,6 @@
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.utils.timeutils import timestamp_now from warcio.timeutils import timestamp_now
from pywb.utils.loaders import load from pywb.utils.loaders import load
from six.moves.urllib.parse import urlsplit from six.moves.urllib.parse import urlsplit

View File

@ -1,336 +0,0 @@
from io import BytesIO
import zlib
import brotli
#=================================================================
def gzip_decompressor():
    """Return a zlib decompressor that accepts a gzip-wrapped stream.

    Passing ``zlib.MAX_WBITS | 16`` as wbits tells zlib to expect a gzip
    header and trailer rather than a raw zlib stream.
    """
    return zlib.decompressobj(zlib.MAX_WBITS | 16)
def deflate_decompressor():
    """Return a decompressor for a standard (zlib-wrapped) deflate stream."""
    return zlib.decompressobj()
def deflate_decompressor_alt():
    """Return a decompressor for a raw deflate stream (no zlib header/trailer)."""
    return zlib.decompressobj(-zlib.MAX_WBITS)
def brotli_decompressor():
    """Return a brotli decompressor shimmed to the zlib interface.

    A ``unused_data`` attribute (always None here) is attached so the
    object exposes the same attribute BufferedReader reads on zlib
    decompressors; brotli's Decompressor does not provide it natively.
    """
    # NOTE: 'brotli' is a third-party package imported at the top of this file.
    decomp = brotli.Decompressor()
    decomp.unused_data = None
    return decomp
#=================================================================
class BufferedReader(object):
    """
    A wrapping line reader which wraps an existing reader.
    Read operations operate on underlying buffer, which is filled to
    block_size (1024 default)

    If an optional decompress type is specified,
    data is fed through the decompressor when read from the buffer.
    Currently supported decompression: gzip
    If unspecified, default decompression is None

    If decompression is specified, and decompress fails on first try,
    data is assumed to not be compressed and no exception is thrown.
    If a failure occurs after data has been
    partially decompressed, the exception is propagated.
    """

    # decomp_type name (lower-cased) -> factory returning a fresh
    # decompressor with a zlib-style decompress()/unused_data interface
    DECOMPRESSORS = {'gzip': gzip_decompressor,
                     'deflate': deflate_decompressor,
                     'deflate_alt': deflate_decompressor_alt,
                     'br': brotli_decompressor
                    }

    def __init__(self, stream, block_size=1024,
                 decomp_type=None,
                 starting_data=None):
        # underlying raw stream, read in block_size chunks
        self.stream = stream
        self.block_size = block_size
        self._init_decomp(decomp_type)
        # current (possibly decompressed) buffer and bookkeeping counters
        self.buff = None
        # optional bytes consumed before any read from the stream
        self.starting_data = starting_data
        self.num_read = 0
        self.buff_size = 0

    def set_decomp(self, decomp_type):
        # Switch decompression type; resets the per-member read counter.
        self._init_decomp(decomp_type)

    def _init_decomp(self, decomp_type):
        # num_block_read counts bytes produced for the current member; it is
        # used by _decompress to detect a failure on the very first read.
        self.num_block_read = 0
        if decomp_type:
            try:
                self.decomp_type = decomp_type
                self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]()
            except KeyError:
                raise Exception('Decompression type not supported: ' +
                                decomp_type)
        else:
            self.decomp_type = None
            self.decompressor = None

    def _fillbuff(self, block_size=None):
        """Refill the internal buffer from starting_data or the stream,
        but only once the current buffer has been fully consumed."""
        if not self.empty():
            return

        # can't read past next member
        if self.rem_length() > 0:
            return

        if self.starting_data:
            data = self.starting_data
            self.starting_data = None
        else:
            if not block_size:
                block_size = self.block_size
            data = self.stream.read(block_size)

        self._process_read(data)

    def _process_read(self, data):
        # Decompress (if configured) and expose the result as the new buffer.
        data = self._decompress(data)
        self.buff_size = len(data)
        self.num_read += self.buff_size
        self.num_block_read += self.buff_size
        self.buff = BytesIO(data)

    def _decompress(self, data):
        """Run data through the decompressor, falling back to the raw stream
        (or the alt deflate variant) if the very first read fails."""
        if self.decompressor and data:
            try:
                data = self.decompressor.decompress(data)
            except Exception as e:
                # if first read attempt, assume non-gzipped stream
                if self.num_block_read == 0:
                    if self.decomp_type == 'deflate':
                        # retry as raw deflate (no zlib header/trailer)
                        self._init_decomp('deflate_alt')
                        data = self._decompress(data)
                    else:
                        self.decompressor = None
                # otherwise (partly decompressed), something is wrong
                else:
                    print(str(e))
                    return b''
        return data

    def read(self, length=None):
        """
        Fill bytes and read some number of bytes
        (up to length if specified)
        <= length bytes may be read if reached the end of input
        if at buffer boundary, will attempt to read again until
        specified length is read
        """
        all_buffs = []
        while length is None or length > 0:
            self._fillbuff()
            buff = self.buff.read(length)
            if not buff:
                break

            all_buffs.append(buff)
            if length:
                length -= len(buff)

        return b''.join(all_buffs)

    def readline(self, length=None):
        """
        Fill buffer and read a full line from the buffer
        (up to specified length, if provided)
        If no newline found at end, try filling buffer again in case
        at buffer boundary.
        """
        if length == 0:
            return b''

        self._fillbuff()
        linebuff = self.buff.readline(length)

        # we may be at a boundary
        while not linebuff.endswith(b'\n'):
            if length:
                length -= len(linebuff)
                if length <= 0:
                    break

            self._fillbuff()

            if self.empty():
                break

            linebuff += self.buff.readline(length)

        return linebuff

    def empty(self):
        # True when the buffer was never filled or is fully consumed.
        return not self.buff or self.buff.tell() >= self.buff_size

    def read_next_member(self):
        """Advance to the next compressed member (e.g. the next gzip member
        in a multi-member stream). Returns False if there is none."""
        if not self.decompressor or not self.decompressor.unused_data:
            return False

        self.starting_data = self.decompressor.unused_data
        self._init_decomp(self.decomp_type)
        return True

    def rem_length(self):
        """Bytes still available without another stream read: unread buffered
        bytes plus any unused (next-member) compressed data."""
        rem = 0
        if self.buff:
            rem = self.buff_size - self.buff.tell()

        if self.decompressor and self.decompressor.unused_data:
            rem += len(self.decompressor.unused_data)
        return rem

    def close(self):
        # Close and drop the underlying stream; safe to call more than once.
        if self.stream:
            self.stream.close()
            self.stream = None

    @classmethod
    def get_supported_decompressors(cls):
        # Names accepted as decomp_type (keys view of DECOMPRESSORS).
        return cls.DECOMPRESSORS.keys()
#=================================================================
class DecompressingBufferedReader(BufferedReader):
    """
    A BufferedReader whose decompression type defaults to gzip
    when the caller does not specify one.
    """
    def __init__(self, *args, **kwargs):
        # Default to gzip unless an explicit decomp_type was passed.
        kwargs.setdefault('decomp_type', 'gzip')
        super(DecompressingBufferedReader, self).__init__(*args, **kwargs)
#=================================================================
class ChunkedDataException(Exception):
    """Raised when an http chunk-encoded stream cannot be parsed.

    ``data`` carries whatever bytes were already consumed so callers can
    fall back to treating the stream as non-chunked.
    """
    def __init__(self, msg, data=b''):
        super(ChunkedDataException, self).__init__(msg)
        self.data = data
#=================================================================
class ChunkedDataReader(BufferedReader):
    r"""
    A ChunkedDataReader is a DecompressingBufferedReader
    which also supports de-chunking of the data if it happens
    to be http 'chunk-encoded'.

    If at any point the chunked header is not available, the stream is
    assumed to not be chunked and no more dechunking occurs.
    """
    def __init__(self, stream, raise_exceptions=False, **kwargs):
        super(ChunkedDataReader, self).__init__(stream, **kwargs)
        # set once the terminating zero-length chunk has been read
        self.all_chunks_read = False
        # set when the stream turns out not to be chunk-encoded after all
        self.not_chunked = False

        # if False, we'll use best-guess fallback for parse errors
        self.raise_chunked_data_exceptions = raise_exceptions

    def _fillbuff(self, block_size=None):
        if self.not_chunked:
            return super(ChunkedDataReader, self)._fillbuff(block_size)

        # Loop over chunks until there is some data (not empty())
        # In particular, gzipped data may require multiple chunks to
        # return any decompressed result
        while (self.empty() and
               not self.all_chunks_read and
               not self.not_chunked):

            try:
                # chunk-size line is hex digits plus optional extensions;
                # 64 bytes is ample for any valid header
                length_header = self.stream.readline(64)
                self._try_decode(length_header)
            except ChunkedDataException as e:
                if self.raise_chunked_data_exceptions:
                    raise

                # Can't parse the data as chunked.
                # It's possible that non-chunked data is served
                # with a Transfer-Encoding: chunked.
                # Treat this as non-chunk encoded from here on.
                self._process_read(length_header + e.data)
                self.not_chunked = True

                # parse as block as non-chunked
                return super(ChunkedDataReader, self)._fillbuff(block_size)

    def _try_decode(self, length_header):
        """Parse one chunk (size line + payload + CRLF) and hand the payload
        to _process_read; raises ChunkedDataException on malformed input."""
        # decode length header
        try:
            chunk_size = int(length_header.strip().split(b';')[0], 16)
        except ValueError:
            raise ChunkedDataException(b"Couldn't decode length header " +
                                       length_header)

        if not chunk_size:
            # chunk_size 0 indicates end of file
            self.all_chunks_read = True
            self._process_read(b'')
            return

        data_len = 0
        data = b''

        # read chunk
        while data_len < chunk_size:
            new_data = self.stream.read(chunk_size - data_len)

            # if we unexpectedly run out of data,
            # either raise an exception or just stop reading,
            # assuming file was cut off
            if not new_data:
                if self.raise_chunked_data_exceptions:
                    msg = 'Ran out of data before end of chunk'
                    raise ChunkedDataException(msg, data)
                else:
                    chunk_size = data_len
                    self.all_chunks_read = True

            data += new_data
            data_len = len(data)

        # if we successfully read a block without running out,
        # it should end in \r\n
        if not self.all_chunks_read:
            clrf = self.stream.read(2)
            if clrf != b'\r\n':
                raise ChunkedDataException(b"Chunk terminator not found.",
                                           data)

        # hand to base class for further processing
        self._process_read(data)

    def read(self, length=None):
        """ read bytes from stream, if length specified,
        may read across multiple chunks to get exact length
        """
        buf = super(ChunkedDataReader, self).read(length)
        if not length:
            return buf

        # if length specified, attempt to read exact length
        rem = length - len(buf)
        while rem > 0:
            new_buf = super(ChunkedDataReader, self).read(rem)
            if not new_buf:
                break

            buf += new_buf
            rem -= len(new_buf)

        return buf

View File

@ -1,69 +0,0 @@
# ============================================================================
class LimitReader(object):
    """Wrap a stream so that at most ``limit`` bytes can be read from it."""

    def __init__(self, stream, limit):
        self.stream = stream
        self.limit = limit

        # only expose tell() when the wrapped stream supports it
        if hasattr(stream, 'tell'):
            self.tell = self._tell

    def _update(self, buff):
        # account for what was actually consumed
        self.limit -= len(buff)
        return buff

    def read(self, length=None):
        """Read up to ``length`` bytes, never exceeding the remaining limit."""
        length = self.limit if length is None else min(length, self.limit)
        if length == 0:
            return b''
        return self._update(self.stream.read(length))

    def readline(self, length=None):
        """Read one line, capped at ``length`` and the remaining limit."""
        length = self.limit if length is None else min(length, self.limit)
        if length == 0:
            return b''
        return self._update(self.stream.readline(length))

    def close(self):
        self.stream.close()

    def _tell(self):
        return self.stream.tell()

    @staticmethod
    def wrap_stream(stream, content_length):
        """
        If given content_length is an int >= 0, wrap the stream
        in a LimitReader. Otherwise, return the stream unaltered
        """
        try:
            content_length = int(content_length)
        except (ValueError, TypeError):
            return stream

        if content_length >= 0:
            # optimize: if already a LimitStream, set limit to
            # the smaller of the two limits
            if isinstance(stream, LimitReader):
                stream.limit = min(stream.limit, content_length)
            else:
                stream = LimitReader(stream, content_length)

        return stream

View File

@ -17,7 +17,7 @@ import base64
import cgi import cgi
from io import open, BytesIO from io import open, BytesIO
from pywb.utils.limitreader import LimitReader from warcio.limitreader import LimitReader
try: try:
from boto import connect_s3 from boto import connect_s3

View File

@ -1,285 +0,0 @@
"""
Representation and parsing of HTTP-style status + headers
"""
from pprint import pformat
from copy import copy
from six.moves import range
from six import iteritems
from pywb.utils.loaders import to_native_str
import uuid
WRAP_WIDTH = 80
#=================================================================
class StatusAndHeaders(object):
    """
    Parsed http-style status line and headers.

    The status line is the first line of a request/response; headers are
    kept as a list of (name, value) tuples so duplicates and ordering
    survive a round-trip. An optional protocol (e.g. 'HTTP/1.0') from the
    first line may also be stored.
    """
    def __init__(self, statusline, headers, protocol='', total_len=0):
        self.statusline = statusline
        self.headers = headers
        self.protocol = protocol
        self.total_len = total_len

    def get_header(self, name, default_value=None):
        """Return the value of the first header matching ``name``
        (case-insensitive), or ``default_value`` if absent."""
        target = name.lower()
        for hdr_name, hdr_value in self.headers:
            if hdr_name.lower() == target:
                return hdr_value

        return default_value

    def add_header(self, name, value):
        self.headers.append((name, value))

    def replace_header(self, name, value):
        """Replace the last matching header with a new value, or append
        a new header. Returns the previous value, if any."""
        target = name.lower()
        for idx in reversed(range(len(self.headers))):
            curr_name, curr_value = self.headers[idx]
            if curr_name.lower() == target:
                self.headers[idx] = (curr_name, value)
                return curr_value

        self.headers.append((name, value))
        return None

    def replace_headers(self, header_dict):
        """Replace every header present in ``header_dict`` (keyed by
        lowercase name); append any keys not already present."""
        remaining = copy(header_dict)
        for idx in reversed(range(len(self.headers))):
            curr_name, curr_value = self.headers[idx]
            key = curr_name.lower()
            if key in remaining:
                self.headers[idx] = (curr_name, remaining.pop(key))

        for name, value in iteritems(remaining):
            self.headers.append((name, value))

    def remove_header(self, name):
        """Remove the last matching header (case-insensitive).
        Returns True if one was removed, False otherwise."""
        target = name.lower()
        for idx in reversed(range(len(self.headers))):
            if self.headers[idx][0].lower() == target:
                del self.headers[idx]
                return True

        return False

    def get_statuscode(self):
        """Return the status-code portion of the status line
        (assumes no protocol prefix remains on it)."""
        return self.statusline.split(' ', 1)[0]

    def validate_statusline(self, valid_statusline):
        """Ensure the status line starts with a positive numeric code;
        if not, substitute ``valid_statusline``. Returns True if valid."""
        try:
            code = int(self.get_statuscode())
            assert(code > 0)
            return True
        except(ValueError, AssertionError):
            self.statusline = valid_statusline
            return False

    def add_range(self, start, part_len, total_len):
        """Mark this response as a 206 partial response covering
        ``part_len`` bytes starting at ``start`` of ``total_len``."""
        content_range = 'bytes {0}-{1}/{2}'.format(start,
                                                   start + part_len - 1,
                                                   total_len)

        self.statusline = '206 Partial Content'
        self.replace_header('Content-Range', content_range)
        self.replace_header('Accept-Ranges', 'bytes')
        return self

    def __repr__(self):
        headers_str = pformat(self.headers, indent=2, width=WRAP_WIDTH)
        return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', headers = {2})".format(self.protocol, self.statusline, headers_str)

    def __eq__(self, other):
        return (self.statusline == other.statusline and
                self.headers == other.headers and
                self.protocol == other.protocol)

    def __str__(self, exclude_list=None):
        return self.to_str(exclude_list)

    def __bool__(self):
        return bool(self.statusline or self.headers)

    __nonzero__ = __bool__

    def to_str(self, exclude_list):
        """Serialize as 'protocol statusline' plus CRLF-terminated header
        lines, skipping any header whose lowercase name is in
        ``exclude_list``."""
        first = self.protocol
        if first and self.statusline:
            first += ' '
        if self.statusline:
            first += self.statusline

        lines = [first] if first else []

        for hdr in self.headers:
            if exclude_list and hdr[0].lower() in exclude_list:
                continue
            lines.append(': '.join(hdr))

        result = '\r\n'.join(lines)
        if lines:
            result += '\r\n'
        return result

    def to_bytes(self, exclude_list=None):
        return self.to_str(exclude_list).encode('iso-8859-1') + b'\r\n'
#=================================================================
def _strip_count(string, total_read):
length = len(string)
return string.rstrip(), total_read + length
#=================================================================
class StatusAndHeadersParser(object):
    """
    Parser which consumes a stream supporting readline() to read
    a status line plus headers and return a StatusAndHeaders object.
    """
    def __init__(self, statuslist, verify=True):
        # statuslist: accepted status-line prefixes, e.g. ['HTTP/1.0', 'HTTP/1.1']
        self.statuslist = statuslist
        # verify: when True, reject status lines not starting with a known prefix
        self.verify = verify

    def parse(self, stream, full_statusline=None):
        """
        parse stream for status line and headers
        return a StatusAndHeaders object

        support continuation headers starting with space or tab

        Raises EOFError if the stream is already at end-of-file, and
        StatusAndHeadersParserException if ``verify`` is set and the
        status line matches no prefix in ``self.statuslist``.
        """
        def readline():
            return to_native_str(stream.readline())

        # status line w newlines intact
        if full_statusline is None:
            full_statusline = readline()
        else:
            full_statusline = to_native_str(full_statusline)

        statusline, total_read = _strip_count(full_statusline, 0)

        headers = []

        # at end of stream
        if total_read == 0:
            raise EOFError()
        elif not statusline:
            # blank first line: empty result, but record bytes consumed
            return StatusAndHeaders(statusline=statusline,
                                    headers=headers,
                                    protocol='',
                                    total_len=total_read)

        # validate only if verify is set
        if self.verify:
            protocol_status = self.split_prefix(statusline, self.statuslist)

            if not protocol_status:
                msg = 'Expected Status Line starting with {0} - Found: {1}'
                msg = msg.format(self.statuslist, statusline)
                raise StatusAndHeadersParserException(msg, full_statusline)
        else:
            protocol_status = statusline.split(' ', 1)

        line, total_read = _strip_count(readline(), total_read)
        while line:
            # split into name/value at the first ':'
            result = line.split(':', 1)
            if len(result) == 2:
                name = result[0].rstrip(' \t')
                value = result[1].lstrip()
            else:
                # malformed line with no ':' -- discarded unless continued
                name = result[0]
                value = None

            next_line, total_read = _strip_count(readline(),
                                                 total_read)

            # append continuation lines, if any
            while next_line and next_line.startswith((' ', '\t')):
                if value is not None:
                    value += next_line
                next_line, total_read = _strip_count(readline(),
                                                     total_read)

            if value is not None:
                header = (name, value)
                headers.append(header)

            line = next_line

        # protocol_status is (protocol, rest-of-statusline) when both present
        if len(protocol_status) > 1:
            statusline = protocol_status[1].strip()
        else:
            statusline = ''

        return StatusAndHeaders(statusline=statusline,
                                headers=headers,
                                protocol=protocol_status[0],
                                total_len=total_read)

    @staticmethod
    def split_prefix(key, prefixs):
        """
        split key string into prefix and remainder
        for first matching prefix from a list
        (returns None implicitly when nothing matches)
        """
        key_upper = key.upper()
        for prefix in prefixs:
            if key_upper.startswith(prefix):
                plen = len(prefix)
                return (key_upper[:plen], key[plen:])

    @staticmethod
    def make_warc_id(id_=None):
        # build a WARC-Record-ID urn:uuid URI; uuid1 is time/host based
        if not id_:
            id_ = uuid.uuid1()
        return '<urn:uuid:{0}>'.format(id_)
#=================================================================
class StatusAndHeadersParserException(Exception):
    """Raised when a status line fails prefix verification.

    ``statusline`` keeps the offending line for diagnostics.
    """
    def __init__(self, msg, statusline):
        super(StatusAndHeadersParserException, self).__init__(msg)
        self.statusline = statusline

View File

@ -1,174 +0,0 @@
r"""
# DecompressingBufferedReader Tests
#=================================================================
# DecompressingBufferedReader readline()
>>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline())
' CDX N b a m s k r M S V g\n'
# detect not compressed
>>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline())
' CDX N b a m s k r M S V g\n'
# decompress with on the fly compression, default gzip compression
>>> print_str(DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n'))).read())
'ABC\n1234\n'
# decompress with on the fly compression, default 'inflate' compression
>>> print_str(DecompressingBufferedReader(BytesIO(compress_alt('ABC\n1234\n')), decomp_type='deflate').read())
'ABC\n1234\n'
# error: invalid compress type
>>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = 'bzip2').read()
Traceback (most recent call last):
Exception: Decompression type not supported: bzip2
# invalid output when reading compressed data as not compressed
>>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = None).read() != b'ABC'
True
# DecompressingBufferedReader readline() with decompression (zipnum file, no header)
>>> print_str(DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline())
'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\n'
# test very small block size
>>> dbr = DecompressingBufferedReader(BytesIO(b'ABCDEFG\nHIJKLMN\nOPQR\nXYZ'), block_size = 3)
>>> print_str(dbr.readline()); print_str(dbr.readline(4)); print_str(dbr.readline()); print_str(dbr.readline()); print_str(dbr.readline(2)); print_str(dbr.readline()); print_str(dbr.readline())
'ABCDEFG\n'
'HIJK'
'LMN\n'
'OPQR\n'
'XY'
'Z'
''
# test zero length reads
>>> x = DecompressingBufferedReader(LimitReader(BytesIO(b'\r\n'), 1))
>>> print_str(x.readline(0)); print_str(x.read(0))
''
''
# Chunk-Decoding Buffered Reader Tests
#=================================================================
Properly formatted chunked data:
>>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n0\r\n\r\n"));
>>> print_str(c.read() + c.read() + c.read())
'1234'
Non-chunked data:
>>> print_str(ChunkedDataReader(BytesIO(b"xyz123!@#")).read())
'xyz123!@#'
Non-chunked, compressed data, specify decomp_type
>>> print_str(ChunkedDataReader(BytesIO(compress('ABCDEF')), decomp_type='gzip').read())
'ABCDEF'
Non-chunked, compressed data, specify compression separately
>>> c = ChunkedDataReader(BytesIO(compress('ABCDEF'))); c.set_decomp('gzip'); print_str(c.read())
'ABCDEF'
Non-chunked, compressed data, wrap in DecompressingBufferedReader
>>> print_str(DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read())
'\nABCDEF\nGHIJ'
Chunked compressed data
Split compressed stream into 10-byte chunk and a remainder chunk
>>> b = compress('ABCDEFGHIJKLMNOP')
>>> l = len(b)
>>> in_ = format(10, 'x').encode('utf-8') + b"\r\n" + b[:10] + b"\r\n" + format(l - 10, 'x').encode('utf-8') + b"\r\n" + b[10:] + b"\r\n0\r\n\r\n"
>>> c = ChunkedDataReader(BytesIO(in_), decomp_type='gzip')
>>> print_str(c.read())
'ABCDEFGHIJKLMNOP'
Starts like chunked data, but isn't:
>>> c = ChunkedDataReader(BytesIO(b"1\r\nxyz123!@#"));
>>> print_str(c.read() + c.read())
'1\r\nx123!@#'
Chunked data cut off part way through:
>>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"));
>>> print_str(c.read() + c.read())
'123412'
Zero-Length chunk:
>>> print_str(ChunkedDataReader(BytesIO(b"0\r\n\r\n")).read())
''
"""
from io import BytesIO
from pywb.utils.bufferedreaders import ChunkedDataReader, ChunkedDataException
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.limitreader import LimitReader
from pywb import get_test_dir
import six
import zlib
import pytest
test_cdx_dir = get_test_dir() + 'cdx/'
test_zip_dir = get_test_dir() + 'zipcdx/'
def compress(buff):
    """Gzip-compress a text string and return the compressed bytes."""
    # wbits = MAX_WBITS + 16 selects gzip framing
    comp = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
    return comp.compress(buff.encode('utf-8')) + comp.flush()
# plain "inflate"
def compress_alt(buff):
    """Deflate-compress a text string as a raw stream (plain "inflate").

    The 2-byte zlib header and 4-byte adler32 trailer are stripped so the
    result is bare deflate data.
    """
    comp = zlib.compressobj(6, zlib.DEFLATED)
    framed = comp.compress(buff.encode('utf-8')) + comp.flush()
    # drop gzip headers/tail
    return framed[2:-4]
# Brotli
def test_brotli():
    """Decompress a known brotli fixture and verify the content."""
    with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
        x = DecompressingBufferedReader(fh, decomp_type='br')
        # bug fix: the comparison result was previously discarded, so this
        # test could never fail -- assert the expected decompressed content
        assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
# Compression
def test_compress_mix():
    # error: compressed member, followed by not compressed -- now allowed!
    # read_next_member() resets decompression so trailing plain bytes
    # can be read after a complete gzip member
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip')
    b = x.read()
    assert b == b'ABC'
    x.read_next_member()
    assert x.read() == b'123'
    #with pytest.raises(zlib.error):
    #    x.read()
    #error: Error -3 while decompressing: incorrect header check
# Errors
def test_err_chunk_cut_off():
    # Chunked data cut off with exceptions
    # (raise_exceptions=True turns truncation into ChunkedDataException)
    c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"), raise_exceptions=True)
    with pytest.raises(ChunkedDataException):
        c.read() + c.read()
    #ChunkedDataException: Ran out of data before end of chunk
def print_str(string):
    """Decode bytes to text for doctest output on py3; pass through on py2."""
    if six.PY3:
        return string.decode('utf-8')
    return string
# run the module doctests when executed directly
if __name__ == "__main__":
    import doctest
    doctest.testmod()

View File

@ -1,27 +1,5 @@
#=================================================================
r""" r"""
# LimitReader Tests #=================================================================
>>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
'abcdefghji'
>>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
'abcdefgh'
>>> LimitReader.wrap_stream(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8), 4).readline(26)
'abcd'
>>> read_multiple(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'
# zero-length read
>>> print_str(LimitReader(StringIO('a'), 0).readline(0))
''
# don't wrap if invalid length
>>> b = StringIO('b')
>>> LimitReader.wrap_stream(b, 'abc') == b
True
# BlockLoader Tests (includes LimitReader) # BlockLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes # Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read(400)) >>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read(400))
@ -143,27 +121,14 @@ import requests
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
from pywb.utils.loaders import extract_client_cookie, extract_post_query from pywb.utils.loaders import extract_client_cookie, extract_post_query
from pywb.utils.loaders import append_post_query, read_last_line from pywb.utils.loaders import append_post_query, read_last_line
from pywb.utils.limitreader import LimitReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader from warcio.bufferedreaders import DecompressingBufferedReader
from pywb import get_test_dir from pywb import get_test_dir
test_cdx_dir = get_test_dir() + 'cdx/' test_cdx_dir = get_test_dir() + 'cdx/'
def read_multiple(reader, inc_reads):
result = None
for x in inc_reads:
result = reader.read(x)
return result
def seek_read_full(seekable_reader, offset):
seekable_reader.seek(offset)
seekable_reader.readline() #skip
return seekable_reader.readline()
def test_s3_read_1(): def test_s3_read_1():
pytest.importorskip('boto') pytest.importorskip('boto')
@ -178,15 +143,6 @@ def test_s3_read_1():
assert reader.readline() == b'WARC/1.0\r\n' assert reader.readline() == b'WARC/1.0\r\n'
assert reader.readline() == b'WARC-Type: response\r\n' assert reader.readline() == b'WARC-Type: response\r\n'
def test_limit_post():
reader = LimitReader(BytesIO(b'abcdefg'), 3)
r = requests.request(method='POST',
url='http://httpbin.org/post',
data=reader,
headers={'Content-Length': '3'})
assert '"abc"' in r.text
# Error # Error
def test_err_no_such_file(): def test_err_no_such_file():
# no such file # no such file

View File

@ -1,182 +0,0 @@
"""
>>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
>>> st1
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])
# add range
>>> StatusAndHeaders(statusline = '200 OK', headers=[('Content-Type', 'text/plain')]).add_range(10, 4, 100)
StatusAndHeaders(protocol = '', statusline = '206 Partial Content', headers = [ ('Content-Type', 'text/plain'),
('Content-Range', 'bytes 10-13/100'),
('Accept-Ranges', 'bytes')])
# other protocol expected
>>> StatusAndHeadersParser(['Other']).parse(StringIO(status_headers_1)) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
>>> StatusAndHeadersParser(['Other'], verify=False).parse(StringIO(status_headers_1))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Value'),
('Multi-Line', 'Value1 Also This')])
# verify protocol line
>>> StatusAndHeadersParser(['HTTP/1.0'], verify=True).parse(StringIO(unknown_protocol_headers)) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0'] - Found: OtherBlah
# allow unexpected/invalid protocol line
>>> StatusAndHeadersParser(['HTTP/1.0'], verify=False).parse(StringIO(unknown_protocol_headers))
StatusAndHeaders(protocol = 'OtherBlah', statusline = '', headers = [('Foo', 'Bar')])
# test equality op
>>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
True
# replace header, print new headers
>>> st1.replace_header('some', 'Another-Value'); st1
'Value'
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
('Some', 'Another-Value'),
('Multi-Line', 'Value1 Also This')])
# remove header
>>> st1.remove_header('some')
True
# already removed
>>> st1.remove_header('Some')
False
# empty
>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_3))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
# case-insensitive match
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_4))
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
"""
from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders
from six import StringIO
import pytest
status_headers_1 = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
HTTP/1.0 200 OK\r\n\
Some: Value\r\n\
Multi-Line: Value1\r\n\
Also This\r\n\
\r\n\
Body"
status_headers_2 = """
"""
status_headers_3 = "\
HTTP/1.0 204 Empty\r\n\
Content-Type: Value\r\n\
%Invalid%\r\n\
\tMultiline\r\n\
Content-Length: 0\r\n\
\r\n"
status_headers_4 = "\
http/1.0 204 empty\r\n\
Content-Type: Value\r\n\
%Invalid%\r\n\
\tMultiline\r\n\
Content-Length: 0\r\n\
\r\n"
unknown_protocol_headers = "\
OtherBlah\r\n\
Foo: Bar\r\n\
\r\n"
req_headers = "\
GET / HTTP/1.0\r\n\
Foo: Bar\r\n\
Content-Length: 0\r\n"
# run the module doctests when executed directly
if __name__ == "__main__":
    import doctest
    doctest.testmod()
def test_to_str_1():
    # round-trip: parsed headers serialize back with continuation
    # lines folded into a single header value
    res = str(StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1)))

    exp = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
Multi-Line: Value1 Also This\r\n\
"
    assert(res == exp)
def test_to_str_exclude():
    # headers named in the exclude list are omitted from serialization
    sah = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
    res = sah.to_str(['multi-line'])

    exp = "\
HTTP/1.0 200 OK\r\n\
Content-Type: ABC\r\n\
Some: Value\r\n\
"
    assert(res == exp)

    # to_bytes appends the blank line terminating the header block
    assert(sah.to_bytes(['multi-line']) == (exp.encode('latin-1') + b'\r\n'))
def test_to_str_2():
    # request-style headers round-trip; a trailing blank line is not preserved
    res = str(StatusAndHeadersParser(['GET']).parse(StringIO(req_headers)))
    assert(res == req_headers)

    res = str(StatusAndHeadersParser(['GET']).parse(StringIO(req_headers + '\r\n')))
    assert(res == req_headers)
def test_to_str_with_remove():
    # removed headers no longer appear in the serialized form
    res = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers))
    res.remove_header('Foo')

    exp = "\
GET / HTTP/1.0\r\n\
Content-Length: 0\r\n"
    assert(str(res) == exp)
def test_status_empty():
    # an empty stream raises EOFError rather than returning an empty result
    with pytest.raises(EOFError):
        StatusAndHeadersParser([], verify=False).parse(StringIO(''))
def test_status_one_word():
    # a bare one-word status line parses when verification is off
    res = StatusAndHeadersParser(['GET'], verify=False).parse(StringIO('A'))
    assert(str(res) == 'A\r\n')

View File

@ -1,330 +0,0 @@
"""
utility functions for converting between
datetime, iso date and 14-digit timestamp
"""
import re
import time
import datetime
import calendar
from email.utils import parsedate, formatdate
#=================================================================
# str <-> datetime conversion
#=================================================================
DATE_TIMESPLIT = re.compile(r'[^\d]')
TIMESTAMP_14 = '%Y%m%d%H%M%S'
ISO_DT = '%Y-%m-%dT%H:%M:%SZ'
PAD_14_DOWN = '10000101000000'
PAD_14_UP = '29991231235959'
PAD_6_UP = '299912'
def iso_date_to_datetime(string):
    """
    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)

    >>> iso_date_to_datetime('2013-12-26T10:11:12Z')
    datetime.datetime(2013, 12, 26, 10, 11, 12)
    """
    parts = DATE_TIMESPLIT.split(string)
    # a trailing 'Z' (or other non-digit) leaves an empty final field -- drop it
    if parts[-1] == '':
        parts = parts[:-1]

    return datetime.datetime(*(int(part) for part in parts))
def http_date_to_datetime(string):
    """
    >>> http_date_to_datetime('Thu, 26 Dec 2013 09:50:10 GMT')
    datetime.datetime(2013, 12, 26, 9, 50, 10)
    """
    year, month, day, hour, minute, second = parsedate(string)[:6]
    return datetime.datetime(year, month, day, hour, minute, second)
def datetime_to_http_date(the_datetime):
    """
    >>> datetime_to_http_date(datetime.datetime(2013, 12, 26, 9, 50, 10))
    'Thu, 26 Dec 2013 09:50:10 GMT'

    # Verify inverses
    >>> x = 'Thu, 26 Dec 2013 09:50:10 GMT'
    >>> datetime_to_http_date(http_date_to_datetime(x)) == x
    True
    """
    # treat the datetime as UTC and emit an RFC 1123 date
    epoch_secs = calendar.timegm(the_datetime.utctimetuple())
    return formatdate(timeval=epoch_secs,
                      localtime=False,
                      usegmt=True)
def datetime_to_iso_date(the_datetime):
    """
    >>> datetime_to_iso_date(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '2013-12-26T10:11:12Z'

    >>> datetime_to_iso_date( datetime.datetime(2013, 12, 26, 10, 11, 12))
    '2013-12-26T10:11:12Z'
    """
    # format using the module-level ISO_DT pattern ('%Y-%m-%dT%H:%M:%SZ')
    return the_datetime.strftime(ISO_DT)
def datetime_to_timestamp(the_datetime):
    """
    >>> datetime_to_timestamp(datetime.datetime(2013, 12, 26, 10, 11, 12))
    '20131226101112'
    """
    # format as a 14-digit YYYYmmddHHMMSS string (TIMESTAMP_14)
    return the_datetime.strftime(TIMESTAMP_14)
def timestamp_now():
    """
    >>> len(timestamp_now())
    14
    """
    # current UTC time as a 14-digit timestamp
    return datetime_to_timestamp(datetime.datetime.utcnow())
def timestamp20_now():
    """
    Create 20-digit timestamp, useful to timestamping temp files

    >>> n = timestamp20_now()
    >>> timestamp20_now() >= n
    True

    >>> len(n)
    20
    """
    # 14-digit timestamp plus 6 microsecond digits
    return datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S%f')
def iso_date_to_timestamp(string):
    """
    >>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
    '20131226101112'

    >>> iso_date_to_timestamp('2013-12-26T10:11:12')
    '20131226101112'
    """
    # ISO date -> datetime -> 14-digit timestamp
    return datetime_to_timestamp(iso_date_to_datetime(string))
def timestamp_to_iso_date(string):
    """
    >>> timestamp_to_iso_date('20131226101112')
    '2013-12-26T10:11:12Z'

    >>> timestamp_to_iso_date('20131226101112')
    '2013-12-26T10:11:12Z'
    """
    # 14-digit timestamp -> datetime -> ISO date
    return datetime_to_iso_date(timestamp_to_datetime(string))
def http_date_to_timestamp(string):
    """
    >>> http_date_to_timestamp('Thu, 26 Dec 2013 09:50:00 GMT')
    '20131226095000'

    >>> http_date_to_timestamp('Sun, 26 Jan 2014 20:08:04 GMT')
    '20140126200804'
    """
    # RFC 1123 HTTP date -> datetime -> 14-digit timestamp
    return datetime_to_timestamp(http_date_to_datetime(string))
# pad to certain length (default 6)
def pad_timestamp(string, pad_str=PAD_6_UP):
    """
    Right-pad a partial timestamp with the tail of ``pad_str``
    (default 6 digits). Strings already at least as long as ``pad_str``
    are returned unchanged.

    >>> pad_timestamp('20')
    '209912'

    >>> pad_timestamp('2014')
    '201412'

    >>> pad_timestamp('20141011')
    '20141011'

    >>> pad_timestamp('201410110010')
    '201410110010'
    """
    # slicing past the end yields '', so longer inputs pass through unchanged
    return string + pad_str[len(string):]
def timestamp_to_datetime(string):
    """
    Convert an up-to-14-digit timestamp to a datetime, padding missing
    digits *up* (end of period) and clamping each field to a valid range.

    # >14-digit -- rest ignored
    >>> timestamp_to_datetime('2014122609501011')
    datetime.datetime(2014, 12, 26, 9, 50, 10)

    # 14-digit
    >>> timestamp_to_datetime('20141226095010')
    datetime.datetime(2014, 12, 26, 9, 50, 10)

    # 13-digit padding
    >>> timestamp_to_datetime('2014122609501')
    datetime.datetime(2014, 12, 26, 9, 50, 59)

    # 12-digit padding
    >>> timestamp_to_datetime('201412260950')
    datetime.datetime(2014, 12, 26, 9, 50, 59)

    # 11-digit padding
    >>> timestamp_to_datetime('20141226095')
    datetime.datetime(2014, 12, 26, 9, 59, 59)

    # 10-digit padding
    >>> timestamp_to_datetime('2014122609')
    datetime.datetime(2014, 12, 26, 9, 59, 59)

    # 9-digit padding
    >>> timestamp_to_datetime('201412260')
    datetime.datetime(2014, 12, 26, 23, 59, 59)

    # 8-digit padding
    >>> timestamp_to_datetime('20141226')
    datetime.datetime(2014, 12, 26, 23, 59, 59)

    # 7-digit padding
    >>> timestamp_to_datetime('2014122')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 6-digit padding
    >>> timestamp_to_datetime('201410')
    datetime.datetime(2014, 10, 31, 23, 59, 59)

    # 5-digit padding
    >>> timestamp_to_datetime('20141')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 4-digit padding
    >>> timestamp_to_datetime('2014')
    datetime.datetime(2014, 12, 31, 23, 59, 59)

    # 3-digit padding
    >>> timestamp_to_datetime('201')
    datetime.datetime(2019, 12, 31, 23, 59, 59)

    # 2-digit padding
    >>> timestamp_to_datetime('20')
    datetime.datetime(2099, 12, 31, 23, 59, 59)

    # 1-digit padding
    >>> timestamp_to_datetime('2')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # 1-digit out-of-range padding
    >>> timestamp_to_datetime('3')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # 0-digit padding
    >>> timestamp_to_datetime('')
    datetime.datetime(2999, 12, 31, 23, 59, 59)

    # bad month
    >>> timestamp_to_datetime('20131709005601')
    datetime.datetime(2013, 12, 9, 0, 56, 1)

    # all out of range except minutes
    >>> timestamp_to_datetime('40001965252477')
    datetime.datetime(2999, 12, 31, 23, 24, 59)

    # not a number!
    >>> timestamp_to_datetime('2010abc')
    datetime.datetime(2010, 12, 31, 23, 59, 59)
    """
    # pad to 6 digits
    string = pad_timestamp(string, PAD_6_UP)

    def clamp(val, min_, max_):
        # parse one field and clamp into [min_, max_]
        try:
            val = int(val)
            val = max(min_, min(val, max_))
            return val
        except ValueError:
            # bug fix: was a bare 'except:', which also swallowed
            # KeyboardInterrupt/SystemExit; only int() failure is expected
            return max_

    def extract(string, start, end, min_, max_):
        # use the field only if fully present; otherwise take the max pad
        if len(string) >= end:
            return clamp(string[start:end], min_, max_)
        else:
            return max_

    # now parse, clamp to boundary
    year = extract(string, 0, 4, 1900, 2999)
    month = extract(string, 4, 6, 1, 12)
    day = extract(string, 6, 8, 1, calendar.monthrange(year, month)[1])
    hour = extract(string, 8, 10, 0, 23)
    minute = extract(string, 10, 12, 0, 59)
    second = extract(string, 12, 14, 0, 59)

    return datetime.datetime(year=year,
                             month=month,
                             day=day,
                             hour=hour,
                             minute=minute,
                             second=second)
def timestamp_to_sec(string):
    """
    >>> timestamp_to_sec('20131226095010')
    1388051410

    # rounds to end of 2014
    >>> timestamp_to_sec('2014')
    1420070399
    """
    # partial timestamps are padded *up*, so '2014' maps to end of 2014
    return calendar.timegm(timestamp_to_datetime(string).utctimetuple())
def sec_to_timestamp(secs):
    """
    >>> sec_to_timestamp(1388051410)
    '20131226095010'

    >>> sec_to_timestamp(1420070399)
    '20141231235959'
    """
    # POSIX seconds -> UTC datetime -> 14-digit timestamp
    return datetime_to_timestamp(datetime.datetime.utcfromtimestamp(secs))
def timestamp_to_http_date(string):
    """
    >>> timestamp_to_http_date('20131226095000')
    'Thu, 26 Dec 2013 09:50:00 GMT'

    >>> timestamp_to_http_date('20140126200804')
    'Sun, 26 Jan 2014 20:08:04 GMT'
    """
    # 14-digit timestamp -> datetime -> RFC 1123 HTTP date
    return datetime_to_http_date(timestamp_to_datetime(string))
# run the module doctests when executed directly
if __name__ == "__main__":
    import doctest
    doctest.testmod()

View File

@ -1,8 +1,8 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.loaders import extract_post_query, append_post_query from pywb.utils.loaders import extract_post_query, append_post_query
from pywb.warc.archiveiterator import ArchiveIterator from warcio.timeutils import iso_date_to_timestamp
from warcio.archiveiterator import ArchiveIterator
import hashlib import hashlib
import base64 import base64
@ -71,7 +71,7 @@ class ArchiveIndexEntryMixin(object):
self['urlkey'] = canonicalize(url, surt_ordered) self['urlkey'] = canonicalize(url, surt_ordered)
other['urlkey'] = self['urlkey'] other['urlkey'] = self['urlkey']
referer = other.record.status_headers.get_header('referer') referer = other.record.http_headers.get_header('referer')
if referer: if referer:
self['_referer'] = referer self['_referer'] = referer
@ -141,7 +141,7 @@ class DefaultRecordParser(object):
for record in raw_iter: for record in raw_iter:
entry = None entry = None
if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'): if not include_all and not minimal and (record.http_headers.get_statuscode() == '-'):
continue continue
if record.rec_type == 'arc_header': if record.rec_type == 'arc_header':
@ -175,13 +175,13 @@ class DefaultRecordParser(object):
compute_digest = True compute_digest = True
elif not minimal and record.rec_type == 'request' and append_post: elif not minimal and record.rec_type == 'request' and append_post:
method = record.status_headers.protocol method = record.http_headers.protocol
len_ = record.status_headers.get_header('Content-Length') len_ = record.http_headers.get_header('Content-Length')
post_query = extract_post_query(method, post_query = extract_post_query(method,
entry.get('_content_type'), entry.get('_content_type'),
len_, len_,
record.stream) record.raw_stream)
entry['_post_query'] = post_query entry['_post_query'] = post_query
@ -236,7 +236,7 @@ class DefaultRecordParser(object):
if record.rec_type == 'warcinfo': if record.rec_type == 'warcinfo':
entry['url'] = record.rec_headers.get_header('WARC-Filename') entry['url'] = record.rec_headers.get_header('WARC-Filename')
entry['urlkey'] = entry['url'] entry['urlkey'] = entry['url']
entry['_warcinfo'] = record.stream.read(record.length) entry['_warcinfo'] = record.raw_stream.read(record.length)
return entry return entry
entry['url'] = record.rec_headers.get_header('WARC-Target-Uri') entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')
@ -252,13 +252,13 @@ class DefaultRecordParser(object):
entry['mime'] = '-' entry['mime'] = '-'
else: else:
def_mime = '-' if record.rec_type == 'request' else 'unk' def_mime = '-' if record.rec_type == 'request' else 'unk'
entry.extract_mime(record.status_headers. entry.extract_mime(record.http_headers.
get_header('Content-Type'), get_header('Content-Type'),
def_mime) def_mime)
# status -- only for response records (by convention): # status -- only for response records (by convention):
if record.rec_type == 'response' and not self.options.get('minimal'): if record.rec_type == 'response' and not self.options.get('minimal'):
entry.extract_status(record.status_headers) entry.extract_status(record.http_headers)
else: else:
entry['status'] = '-' entry['status'] = '-'
@ -304,7 +304,7 @@ class DefaultRecordParser(object):
entry.extract_mime(record.rec_headers.get_header('content-type')) entry.extract_mime(record.rec_headers.get_header('content-type'))
# status # status
entry.extract_status(record.status_headers) entry.extract_status(record.http_headers)
# digest # digest
entry['digest'] = '-' entry['digest'] = '-'

View File

@ -1,233 +0,0 @@
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.warc.recordloader import ArcWarcRecordLoader
import six
import sys
# ============================================================================
BUFF_SIZE = 16384
class ArchiveIterator(six.Iterator):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed
    The indexer will automatically detect format, and decompress
    if necessary.
    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.
    This file is probably not a multi-chunk gzip but a single gzip file.
    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.
    This file is likely still valid and you can use it by decompressing it:
    gunzip myfile.{0}.gz
    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:
    warc2warc -Z myfile.{0} > myfile.{0}.gz
"""

    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
"""

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False, arc2warc=False, block_size=BUFF_SIZE):
        """Wrap *fileobj* for record-at-a-time iteration.

        :param fileobj: binary stream containing ARC/WARC data
        :param no_record_parse: if True, skip parsing of http headers
        :param verify_http: passed through to ArcWarcRecordLoader
        :param arc2warc: if True, present ARC records as WARC-style records
        :param block_size: read size for the decompressing reader
        """
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        self.reader = None

        # offset in the underlying (possibly compressed) file of the
        # current record
        self.offset = 0
        # 'warc' or 'arc' once detected; None until first record parsed
        self.known_format = None

        self.mixed_arc_warc = arc2warc

        # (offset, length) of the most recently consumed record, or None
        # while the current record has not yet been read to its end
        self.member_info = None
        self.no_record_parse = no_record_parse

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        # leftover non-blank line read while consuming trailing blanks;
        # fed to the next record parse as its first line
        self.next_line = None

        self._raise_invalid_gzip = False
        self._is_empty = False
        self._is_first = True
        self.last_record = None

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next record, first consuming the previous one."""
        while True:
            if not self._is_first:
                self._finish_record()

            self._is_first = False

            try:
                self.last_record = self._next_record(self.next_line)
                if self._raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                return self.last_record

            except EOFError:
                # EOF may only mean end of one gzip member; loop again
                # and let _finish_record() decide whether to stop
                self._is_empty = True

    def _finish_record(self):
        """Consume the remainder of the last record and position the
        stream at the start of the next, or raise StopIteration."""
        if self.last_record:
            self.read_to_end(self.last_record)

        if self.reader.decompressor:
            # if another gzip member, continue
            if self.reader.read_next_member():
                return

            # if empty record, then we're done
            elif self._is_empty:
                raise StopIteration()

            # otherwise, probably a gzip
            # containing multiple non-chunked records
            # raise this as an error
            else:
                self._raise_invalid_gzip = True

        # non-gzip, so we're done
        elif self._is_empty:
            raise StopIteration()

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length
        count empty_size so that it can be substracted from
        the record length for uncompressed
        if first line read is not blank, likely error in WARC/ARC,
        display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                # end of stream reached
                return None, empty_size

            stripped = line.rstrip()

            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))

                first_line = False
                continue

            # first non-blank line after the blanks: hand back to caller
            return line, empty_size

    def read_to_end(self, record, payload_callback=None):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(BUFF_SIZE)
            if not b:
                break
            num += len(b)
            if payload_callback:
                payload_callback(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        # compute absolute offset: bytes consumed from fh minus what is
        # still buffered but unread in the reader
        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            # the lookahead line belongs to the *next* record
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            # uncompressed: trailing blanks are not part of record length
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        if not self.mixed_arc_warc:
            self.known_format = record.format

        return record

View File

@ -1,6 +1,6 @@
from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from pywb.utils.loaders import BlockLoader from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.warc.recordloader import ArcWarcRecordLoader
#================================================================= #=================================================================

View File

@ -1,298 +0,0 @@
import collections
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.limitreader import LimitReader
from pywb.utils.loaders import to_native_str
#from pywb.utils.wbexception import WbException
from pywb.utils.timeutils import timestamp_to_iso_date
from six.moves import zip
#=================================================================
#ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
# 'format, rec_type, rec_headers, ' +
# 'stream, status_headers, ' +
# 'content_type, length')
#=================================================================
class ArcWarcRecord(object):
    """Value object bundling everything parsed for a single ARC/WARC
    record: archive format, record type, record headers, the (limited)
    payload stream, parsed http status+headers (if any), the declared
    content type and the record length.
    """

    def __init__(self, *args):
        # positional unpacking preserved: callers pass exactly seven values
        (self.format,
         self.rec_type,
         self.rec_headers,
         self.stream,
         self.status_headers,
         self.content_type,
         self.length) = args
#=================================================================
class ArchiveLoadFailed(Exception):
    """Raised when an ARC/WARC record cannot be loaded or parsed.

    The message is ``'<filename>: <reason>'`` when a filename is given,
    otherwise just the reason; the final message is also kept on
    ``self.msg`` for callers that inspect it directly.
    """

    def __init__(self, reason, filename=''):
        msg = str(reason) if not filename else filename + ': ' + str(reason)

        super(ArchiveLoadFailed, self).__init__(msg)
        self.msg = msg
#=================================================================
class ArcWarcRecordLoader(object):
    """Parse a single ARC or WARC record from a stream, auto-detecting
    the archive format and, where applicable, the embedded HTTP message.
    """

    # accepted WARC version statuslines
    WARC_TYPES = ['WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
                  'OPTIONS', 'CONNECT', 'PATCH']

    NON_HTTP_RECORDS = ('warcinfo', 'arc_header', 'metadata', 'resource')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        """:param verify_http: strict verification of http statuslines
        :param arc2warc: if True, ARC headers are converted to WARC-style
        """
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)

        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)

    def parse_record_stream(self, stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.
        Pass statusline and known_format to detect_type_loader_headers()
        to faciliate parsing.
        """
        (the_format, rec_headers) = (self.
                                     _detect_type_load_headers(stream,
                                                               statusline,
                                                               known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            # ARC header bytes count against the declared record length
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # converted ARC: subtract consumed header bytes and
                # present the record as 'warc' from here on
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition
        if is_err:
            length = 0

        # or status and headers are completely empty (blank lines found)
        elif not rec_headers:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            status_headers = None#StatusAndHeaders('', [])

        # if empty record (error or otherwise) set status to 204
        elif length == 0:
            if is_err:
                msg = '204 Possible Error'
            else:
                msg = '204 No Content'

            status_headers = StatusAndHeaders(msg, [])

        # response record or non-empty revisit: parse HTTP status and headers!
        elif (rec_type in ('response', 'revisit')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif ((rec_type == 'request')
              and uri.startswith(self.HTTP_SCHEMES)):
            status_headers = self.http_req_parser.parse(stream)

        # everything else: create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            status_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type,
                             rec_headers, stream, status_headers,
                             content_type, length)

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.
        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.
        """
        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # retry the failed statusline as an ARC header line
                statusline = se.statusline
                pass

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))
#=================================================================
class ARCHeadersParser(object):
    """Parse the space-separated ARC 1.0 record header line into a
    StatusAndHeaders object."""

    # ARC 1.0 headers
    ARC_HEADERS = ["uri", "ip-address", "archive-date",
                   "content-type", "length"]

    def __init__(self):
        self.headernames = self.get_header_names()

    def get_rec_type(self):
        # format tag reported back to the record loader
        return 'arc'

    def parse(self, stream, headerline=None):
        """Parse one ARC header line (read from *stream* unless supplied).

        Raises EOFError at end of stream and
        StatusAndHeadersParserException on a malformed header line.
        """
        total_read = 0

        def readline():
            return to_native_str(stream.readline())

        # if headerline passed in, use that
        if headerline is None:
            headerline = readline()
        else:
            headerline = to_native_str(headerline)

        header_len = len(headerline)

        if header_len == 0:
            raise EOFError()

        headerline = headerline.rstrip()

        headernames = self.headernames

        # if arc header, consume next two lines
        if headerline.startswith('filedesc://'):
            version = readline()  # skip version
            spec = readline()  # skip header spec, use preset one
            total_read += len(version)
            total_read += len(spec)

        parts = headerline.split(' ')

        if len(parts) != len(headernames):
            msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
            msg = msg.format(headernames, parts)
            raise StatusAndHeadersParserException(msg, parts)

        protocol, headers = self._get_protocol_and_headers(headerline, parts)

        # NOTE(review): the protocol returned by the helper ('ARC/1.0' or
        # 'WARC/1.0') is discarded and 'WARC/1.0' is always used here --
        # confirm this is intentional for downstream format handling
        return StatusAndHeaders(statusline='',
                                headers=headers,
                                protocol='WARC/1.0',
                                total_len=total_read)

    @classmethod
    def get_header_names(cls):
        return cls.ARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        """Pair each ARC header name with its value from the header line."""
        headers = []

        for name, value in zip(self.headernames, parts):
            headers.append((name, value))

        return ('ARC/1.0', headers)
#=================================================================
class ARC2WARCHeadersParser(ARCHeadersParser):
    """ARC header parser that maps ARC fields directly to their WARC
    header equivalents, synthesizing WARC-Type and WARC-Record-ID."""

    # Headers for converting ARC -> WARC Header
    ARC_TO_WARC_HEADERS = ["WARC-Target-URI",
                           "WARC-IP-Address",
                           "WARC-Date",
                           "Content-Type",
                           "Content-Length"]

    def get_rec_type(self):
        # reported to the loader so it knows header bytes were consumed
        return 'arc2warc'

    @classmethod
    def get_header_names(cls):
        return cls.ARC_TO_WARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        """Build WARC-style headers from the ARC header line values."""
        headers = []

        for name, value in zip(self.headernames, parts):
            if name == 'WARC-Date':
                # ARC uses a 14-digit timestamp; WARC wants ISO dates
                value = timestamp_to_iso_date(value)

            headers.append((name, value))

        if headerline.startswith('filedesc://'):
            rec_type = 'arc_header'
        else:
            rec_type = 'response'

        headers.append(('WARC-Type', rec_type))
        headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id()))
        return ('WARC/1.0', headers)

View File

@ -1,6 +1,7 @@
from pywb.utils.timeutils import iso_date_to_timestamp from warcio.recordloader import ArchiveLoadFailed
from warcio.timeutils import iso_date_to_timestamp
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
import six import six
@ -27,20 +28,20 @@ class ResolvingLoader(object):
elif headers_record != payload_record: elif headers_record != payload_record:
# close remainder of stream as this record only used for # close remainder of stream as this record only used for
# (already parsed) headers # (already parsed) headers
headers_record.stream.close() headers_record.raw_stream.close()
# special case: check if headers record is actually empty # special case: check if headers record is actually empty
# (eg empty revisit), then use headers from revisit # (eg empty revisit), then use headers from revisit
if not headers_record.status_headers.headers: if not headers_record.http_headers.headers:
headers_record = payload_record headers_record = payload_record
if not headers_record or not payload_record: if not headers_record or not payload_record:
raise ArchiveLoadFailed('Could not load ' + str(cdx)) raise ArchiveLoadFailed('Could not load ' + str(cdx))
# ensure status line is valid from here # ensure status line is valid from here
headers_record.status_headers.validate_statusline('204 No Content') headers_record.http_headers.validate_statusline('204 No Content')
return (headers_record.status_headers, payload_record.stream) return (headers_record.http_headers, payload_record.raw_stream)
def load_headers_and_payload(self, cdx, failed_files, cdx_loader): def load_headers_and_payload(self, cdx, failed_files, cdx_loader):
""" """
@ -107,7 +108,7 @@ class ResolvingLoader(object):
# optimization: if same file already failed this request, # optimization: if same file already failed this request,
# don't try again # don't try again
if failed_files is not None and filename in failed_files: if failed_files is not None and filename in failed_files:
raise ArchiveLoadFailed('Skipping Already Failed', filename) raise ArchiveLoadFailed('Skipping Already Failed: ' + filename)
any_found = False any_found = False
last_exc = None last_exc = None
@ -144,7 +145,7 @@ class ResolvingLoader(object):
msg = 'Archive File Not Found' msg = 'Archive File Not Found'
#raise ArchiveLoadFailed(msg, filename), None, last_traceback #raise ArchiveLoadFailed(msg, filename), None, last_traceback
six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(msg, filename), last_traceback) six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(filename + ': ' + msg), last_traceback)
def _load_different_url_payload(self, cdx, headers_record, def _load_different_url_payload(self, cdx, headers_record,
failed_files, cdx_loader): failed_files, cdx_loader):

View File

@ -292,13 +292,13 @@ import sys
import pprint import pprint
import six import six
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed from warcio.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper from pywb.warc.pathresolvers import PathResolverMapper
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
import pywb.utils.statusandheaders import warcio.statusandheaders
from pywb import get_test_dir from pywb import get_test_dir
@ -331,10 +331,12 @@ def load_test_archive(test_file, offset, length):
archive = testloader.load(path, offset, length) archive = testloader.load(path, offset, length)
pywb.utils.statusandheaders.WRAP_WIDTH = 160 warcio.statusandheaders.WRAP_WIDTH = 160
pprint.pprint(((archive.format, archive.rec_type), pprint.pprint(((archive.format, archive.rec_type),
archive.rec_headers, archive.status_headers), indent=1, width=160) archive.rec_headers, archive.http_headers), indent=1, width=160)
warcio.statusandheaders.WRAP_WIDTH = 80
#============================================================================== #==============================================================================

View File

@ -1,121 +0,0 @@
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.warc.warcwriter import BufferWARCWriter
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.archiveiterator import ArchiveIterator
from io import BytesIO
from collections import OrderedDict
import json
# ============================================================================
class FixedTestWARCWriter(BufferWARCWriter):
    """WARC writer with deterministic record ids and dates so serialized
    records can be compared byte-for-byte in tests."""

    FIXED_WARC_ID = '<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>'
    FIXED_WARC_DATE = '2000-01-01T00:00:00Z'

    @classmethod
    def _make_warc_id(cls, id_=None):
        # ignore any supplied id: every record gets the same fixed id
        return cls.FIXED_WARC_ID

    @classmethod
    def _make_warc_date(cls):
        return cls.FIXED_WARC_DATE
# ============================================================================
class TestWarcWriter(object):
    """Round-trip tests for the WARC writer: serialize records, then
    re-parse them and compare against fixed expected byte strings."""

    def _validate_record_content_len(self, stream):
        # re-iterate the written output and verify the declared
        # Content-Length matches the actual payload length
        for record in ArchiveIterator(stream, no_record_parse=True):
            assert record.status_headers == None
            assert int(record.rec_headers.get_header('Content-Length')) == record.length
            assert record.length == len(record.stream.read())

    def test_warcinfo_record(self):
        """Write a warcinfo record and verify headers, body and the
        exact serialized form."""
        simplewriter = FixedTestWARCWriter(gzip=False)
        params = OrderedDict([('software', 'recorder test'),
                              ('format', 'WARC File Format 1.0'),
                              ('json-metadata', json.dumps({'foo': 'bar'}))])

        record = simplewriter.create_warcinfo_record('testfile.warc.gz', params)
        simplewriter.write_record(record)
        buff = simplewriter.get_contents()
        assert isinstance(buff, bytes)

        buff = BytesIO(buff)
        parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)

        assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
        assert parsed_record.rec_headers.get_header('Content-Type') == 'application/warc-fields'
        assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'

        buff = parsed_record.stream.read().decode('utf-8')

        length = parsed_record.rec_headers.get_header('Content-Length')

        assert len(buff) == int(length)

        assert 'json-metadata: {"foo": "bar"}\r\n' in buff
        assert 'format: WARC File Format 1.0\r\n' in buff

        # exact expected serialization (deterministic id/date from
        # FixedTestWARCWriter)
        warcinfo_record = '\
WARC/1.0\r\n\
WARC-Type: warcinfo\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Filename: testfile.warc.gz\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
Content-Type: application/warc-fields\r\n\
Content-Length: 86\r\n\
\r\n\
software: recorder test\r\n\
format: WARC File Format 1.0\r\n\
json-metadata: {"foo": "bar"}\r\n\
\r\n\
\r\n\
'

        assert simplewriter.get_contents().decode('utf-8') == warcinfo_record

    def test_generate_response(self):
        """Write a response record with explicit http headers and verify
        the exact serialized form and content lengths."""
        headers_list = [('Content-Type', 'text/plain; charset="UTF-8"'),
                        ('Custom-Header', 'somevalue')
                       ]

        payload = b'some\ntext'

        status_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')

        writer = FixedTestWARCWriter(gzip=False)

        record = writer.create_warc_record('http://example.com/', 'response',
                                           payload=BytesIO(payload),
                                           length=len(payload),
                                           status_headers=status_headers)

        writer.write_record(record)

        buff = writer.get_contents()

        self._validate_record_content_len(BytesIO(buff))

        warc_record = '\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Block-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 97\r\n\
\r\n\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Custom-Header: somevalue\r\n\
\r\n\
some\n\
text\
\r\n\
\r\n\
'

        assert buff.decode('utf-8') == warc_record

View File

@ -1,304 +0,0 @@
import tempfile
import uuid
import base64
import hashlib
import datetime
import zlib
import six
from socket import gethostname
from io import BytesIO
from pywb.utils.loaders import to_native_str
from pywb.utils.timeutils import datetime_to_iso_date
from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders
from pywb.warc.recordloader import ArcWarcRecord
from pywb.warc.recordloader import ArcWarcRecordLoader
# ============================================================================
class BaseWARCWriter(object):
    """Build and serialize WARC records (warcinfo, response, request,
    revisit, metadata), computing digests and Content-Length, optionally
    gzip-compressing each record as its own gzip member."""

    BUFF_SIZE = 16384

    # default Content-Type per WARC record type
    WARC_RECORDS = {'warcinfo': 'application/warc-fields',
                    'response': 'application/http; msgtype=response',
                    'revisit': 'application/http; msgtype=response',
                    'request': 'application/http; msgtype=request',
                    'metadata': 'application/warc-fields',
                   }

    REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'

    WARC_VERSION = 'WARC/1.0'

    def __init__(self, gzip=True, header_filter=None, *args, **kwargs):
        """:param gzip: if True, each record is written as its own gzip member
        :param header_filter: callable(record) -> list of http header names
               to exclude from the serialized headers
        :param kwargs: may include 'warc_version' to override WARC/1.0
        """
        self.gzip = gzip
        self.header_filter = header_filter
        self.hostname = gethostname()

        # lenient parser used for copying records / parsing http headers
        self.parser = StatusAndHeadersParser([], verify=False)
        self.warc_version = kwargs.get('warc_version', self.WARC_VERSION)

    @classmethod
    def _iter_stream(cls, stream):
        # yield the stream in BUFF_SIZE chunks until exhausted
        while True:
            buf = stream.read(cls.BUFF_SIZE)
            if not buf:
                return

            yield buf

    def ensure_digest(self, record):
        """Compute and set WARC-Block-Digest / WARC-Payload-Digest on
        *record* unless both are already present. Restores the stream
        position afterwards."""
        block_digest = record.rec_headers.get_header('WARC-Block-Digest')
        payload_digest = record.rec_headers.get_header('WARC-Payload-Digest')
        if block_digest and payload_digest:
            return

        block_digester = self._create_digester()
        payload_digester = self._create_digester()

        pos = record.stream.tell()

        # block digest covers the serialized http headers + payload;
        # payload digest covers the payload only
        if record.status_headers and hasattr(record.status_headers, 'headers_buff'):
            block_digester.update(record.status_headers.headers_buff)

        for buf in self._iter_stream(record.stream):
            block_digester.update(buf)
            payload_digester.update(buf)

        record.stream.seek(pos)
        record.rec_headers.add_header('WARC-Block-Digest', str(block_digester))
        record.rec_headers.add_header('WARC-Payload-Digest', str(payload_digester))

    def _create_digester(self):
        return Digester('sha1')

    def _set_header_buff(self, record):
        # pre-serialize http headers (minus any filtered names) so their
        # byte length is known for Content-Length computation
        exclude_list = None
        if self.header_filter:
            exclude_list = self.header_filter(record)
        buff = record.status_headers.to_bytes(exclude_list)
        record.status_headers.headers_buff = buff

    def create_warcinfo_record(self, filename, info):
        """Build a warcinfo record whose body is *info* rendered as
        'name: value' warc-fields lines."""
        warc_headers = StatusAndHeaders(self.warc_version, [])
        warc_headers.add_header('WARC-Type', 'warcinfo')
        warc_headers.add_header('WARC-Record-ID', self._make_warc_id())
        if filename:
            warc_headers.add_header('WARC-Filename', filename)
        warc_headers.add_header('WARC-Date', self._make_warc_date())

        warcinfo = BytesIO()
        for n, v in six.iteritems(info):
            self._header(warcinfo, n, v)

        warcinfo.seek(0)

        record = ArcWarcRecord('warc', 'warcinfo', warc_headers, warcinfo,
                               None, '', len(warcinfo.getvalue()))

        return record

    def copy_warc_record(self, payload):
        """Re-create a record from an already-serialized record buffer
        (WARC headers followed by content)."""
        len_ = payload.tell()
        payload.seek(0)

        warc_headers = self.parser.parse(payload)

        record_type = warc_headers.get_header('WARC-Type', 'response')

        return self._fill_record(record_type, warc_headers, None, payload, '', len_)

    def create_warc_record(self, uri, record_type, payload,
                           length=None,
                           warc_content_type='',
                           warc_headers_dict={},
                           status_headers=None):
        """Build a new WARC record of *record_type* for *uri* from *payload*.

        NOTE(review): warc_headers_dict={} is a mutable default argument;
        it is only read from here, but confirm no caller mutates it.
        """
        if length is None:
            # infer length from current position, then rewind for reading
            length = payload.tell()
            payload.seek(0)

        warc_headers = StatusAndHeaders(self.warc_version, list(warc_headers_dict.items()))
        warc_headers.replace_header('WARC-Type', record_type)
        if not warc_headers.get_header('WARC-Record-ID'):
            warc_headers.add_header('WARC-Record-ID', self._make_warc_id())

        if uri:
            warc_headers.replace_header('WARC-Target-URI', uri)

        if not warc_headers.get_header('WARC-Date'):
            warc_headers.add_header('WARC-Date', self._make_warc_date())

        return self._fill_record(record_type, warc_headers, status_headers,
                                 payload, warc_content_type, length)

    def _fill_record(self, record_type, warc_headers, status_headers, payload, warc_content_type, len_):
        """Assemble the ArcWarcRecord, parsing http headers from the
        payload when needed, and ensure digests / header buffer are set."""
        has_http_headers = (record_type in ('request', 'response', 'revisit'))

        if not status_headers and has_http_headers:
            status_headers = self.parser.parse(payload)

        record = ArcWarcRecord('warc', record_type, warc_headers, payload,
                               status_headers, warc_content_type, len_)

        self.ensure_digest(record)

        if has_http_headers:
            self._set_header_buff(record)

        return record

    def _write_warc_record(self, out, record, adjust_cl=True):
        """Serialize *record* to *out*, setting Content-Type and
        Content-Length headers; gzip-wraps the output if enabled."""
        if self.gzip:
            out = GzippingWrapper(out)

        # compute Content-Type
        content_type = record.rec_headers.get_header('Content-Type')

        if not content_type:
            content_type = record.content_type

            if not content_type:
                content_type = self.WARC_RECORDS.get(record.rec_headers.get_header('WARC-Type'))

            if content_type:
                record.rec_headers.replace_header('Content-Type', content_type)
                #self._header(out, 'Content-Type', content_type)

        # revisit records serialize only the http headers, not the payload
        if record.rec_headers.get_header('WARC-Type') == 'revisit':
            http_headers_only = True
        else:
            http_headers_only = False

        # compute Content-Length
        if record.length or record.status_headers:
            actual_len = 0

            if record.status_headers:
                actual_len = len(record.status_headers.headers_buff)

            if not http_headers_only:
                if adjust_cl:
                    # subtract http header bytes already consumed from
                    # the stream when the headers were parsed
                    diff = record.stream.tell() - actual_len
                else:
                    diff = 0

                actual_len = record.length - diff

            record.rec_headers.replace_header('Content-Length', str(actual_len))
            #self._header(out, 'Content-Length', str(actual_len))

            # add empty line
            #self._line(out, b'')

            # write record headers
            out.write(record.rec_headers.to_bytes())

            # write headers buffer, if any
            if record.status_headers:
                out.write(record.status_headers.headers_buff)

            if not http_headers_only:
                for buf in self._iter_stream(record.stream):
                    out.write(buf)
                #out.write(record.stream.read())

            # add two lines
            self._line(out, b'\r\n')
        else:
            # add three lines (1 for end of header, 2 for end of record)
            self._line(out, b'Content-Length: 0\r\n\r\n')

        out.flush()

    def _header(self, out, name, value):
        # skip empty values entirely (no 'name:' line emitted)
        if not value:
            return

        self._line(out, (name + ': ' + str(value)).encode('latin-1'))

    def _line(self, out, line):
        out.write(line + b'\r\n')

    @classmethod
    def _make_warc_id(cls, id_=None):
        if not id_:
            id_ = uuid.uuid1()
        return '<urn:uuid:{0}>'.format(id_)

    @classmethod
    def _make_warc_date(cls):
        return datetime_to_iso_date(datetime.datetime.utcnow())
# ============================================================================
class GzippingWrapper(object):
    """File-like adapter that gzip-compresses everything written to it
    and forwards the compressed bytes to the wrapped output stream.

    flush() must be called once at the end to emit the final gzip
    trailer before the stream is complete.
    """

    def __init__(self, out):
        # wbits = MAX_WBITS + 16 selects the gzip container format
        # (rather than a raw deflate or zlib stream), level 9 compression
        self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
        self.out = out

    def write(self, buff):
        compressed = self.compressor.compress(buff)
        self.out.write(compressed)

    def flush(self):
        self.out.write(self.compressor.flush())
        self.out.flush()
# ============================================================================
class Digester(object):
    """Incremental digest accumulator that renders as
    '<algo>:<base32-encoded-digest>' (the WARC digest header format)."""

    def __init__(self, type_='sha1'):
        self.type_ = type_
        self.digester = hashlib.new(type_)

    def update(self, buff):
        self.digester.update(buff)

    def __str__(self):
        encoded = base64.b32encode(self.digester.digest())
        return '{0}:{1}'.format(self.type_, to_native_str(encoded))
# ============================================================================
class BufferWARCWriter(BaseWARCWriter):
def __init__(self, *args, **kwargs):
super(BufferWARCWriter, self).__init__(*args, **kwargs)
self.out = self._create_buffer()
def _create_buffer(self):
return tempfile.SpooledTemporaryFile(max_size=512*1024)
def write_record(self, record):
self._write_warc_record(self.out, record)
def get_contents(self):
pos = self.out.tell()
self.out.seek(0)
buff = self.out.read()
self.out.seek(pos)
return buff
# ============================================================================
class FileWARCWriter(BufferWARCWriter):
    """WARC writer that writes to a named file or an already-open
    file object.

    The target is given either as the first positional argument or as
    the 'file' keyword argument; remaining arguments are forwarded to
    BufferWARCWriter / BaseWARCWriter (e.g. gzip=..., header_filter=...).
    """

    def __init__(self, *args, **kwargs):
        file_or_buff = None
        if len(args) > 0:
            file_or_buff = args[0]
            args = args[1:]
        else:
            file_or_buff = kwargs.pop('file', None)

        # bug fix: the base __init__ was never called, leaving writer
        # state (gzip, header_filter, parser, warc_version) uninitialized;
        # initialize it first (self.out is then replaced below)
        super(FileWARCWriter, self).__init__(*args, **kwargs)

        if isinstance(file_or_buff, str):
            # bug fix: previously opened with 'rb' -- a read-only handle
            # on which every write fails; open for binary append instead
            self.out = open(file_or_buff, 'a+b')
        elif hasattr(file_or_buff, 'read'):
            # NOTE(review): checks for 'read' although the writer only
            # writes; kept for backward compatibility -- confirm callers
            # always pass read/write-capable objects
            self.out = file_or_buff
        else:
            raise Exception('file must be a readable or valid filename')

View File

@ -5,7 +5,8 @@ import json
import time import time
import os import os
from pywb.utils.timeutils import timestamp_now from warcio.timeutils import timestamp_now
from pywb.cdx.cdxops import process_cdx from pywb.cdx.cdxops import process_cdx
from pywb.cdx.query import CDXQuery from pywb.cdx.query import CDXQuery

View File

@ -2,7 +2,7 @@ from pywb.webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoad
from pywb.webagg.utils import MementoUtils from pywb.webagg.utils import MementoUtils
from pywb.utils.wbexception import BadRequestException, WbException from pywb.utils.wbexception import BadRequestException, WbException
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
from pywb.warc.recordloader import ArchiveLoadFailed from warcio.recordloader import ArchiveLoadFailed
from pywb.cdx.query import CDXQuery from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules

View File

@ -1,6 +1,6 @@
from pywb.utils.binsearch import iter_range from pywb.utils.binsearch import iter_range
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
from pywb.utils.timeutils import timestamp_now from warcio.timeutils import timestamp_now
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException

View File

@ -1,6 +1,7 @@
from warcio.limitreader import LimitReader
from warcio.statusandheaders import StatusAndHeadersParser
from pywb.utils.loaders import extract_post_query, append_post_query from pywb.utils.loaders import extract_post_query, append_post_query
from pywb.utils.limitreader import LimitReader
from pywb.utils.statusandheaders import StatusAndHeadersParser
from six.moves.urllib.parse import urlsplit, quote from six.moves.urllib.parse import urlsplit, quote
from six import iteritems, StringIO from six import iteritems, StringIO

View File

@ -3,7 +3,7 @@ from pywb.utils.wbexception import NotFoundException
from pywb.webagg.indexsource import BaseIndexSource, RemoteIndexSource from pywb.webagg.indexsource import BaseIndexSource, RemoteIndexSource
from pywb.webagg.responseloader import LiveWebLoader from pywb.webagg.responseloader import LiveWebLoader
from pywb.webagg.utils import ParamFormatter, res_template from pywb.webagg.utils import ParamFormatter, res_template
from pywb.utils.timeutils import timestamp_now from warcio.timeutils import timestamp_now
#============================================================================= #=============================================================================

View File

@ -2,12 +2,13 @@ from pywb.webagg.utils import MementoUtils, StreamIter, compress_gzip_iter
from pywb.webagg.utils import ParamFormatter from pywb.webagg.utils import ParamFormatter
from pywb.webagg.indexsource import RedisIndexSource from pywb.webagg.indexsource import RedisIndexSource
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp from warcio.timeutils import timestamp_to_datetime, datetime_to_timestamp
from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date from warcio.timeutils import iso_date_to_datetime, datetime_to_iso_date
from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date from warcio.timeutils import http_date_to_datetime, datetime_to_http_date
from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from pywb.utils.wbexception import LiveResourceException, WbException from pywb.utils.wbexception import LiveResourceException, WbException
from pywb.utils.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.resolvingloader import ResolvingLoader
@ -243,11 +244,11 @@ class WARCPathLoader(BaseLoader):
status = cdx.get('status') status = cdx.get('status')
if not status or status.startswith('3'): if not status or status.startswith('3'):
status_headers = self.headers_parser.parse(payload.stream) http_headers = self.headers_parser.parse(payload.raw_stream)
self.raise_on_self_redirect(params, cdx, self.raise_on_self_redirect(params, cdx,
status_headers.get_statuscode(), http_headers.get_statuscode(),
status_headers.get_header('Location')) http_headers.get_header('Location'))
http_headers_buff = status_headers.to_bytes() http_headers_buff = http_headers.to_bytes()
else: else:
http_headers_buff = None http_headers_buff = None
@ -266,9 +267,9 @@ class WARCPathLoader(BaseLoader):
warc_headers.replace_header('WARC-Date', warc_headers.replace_header('WARC-Date',
headers.rec_headers.get_header('WARC-Date')) headers.rec_headers.get_header('WARC-Date'))
headers.stream.close() headers.raw_stream.close()
return (warc_headers, http_headers_buff, payload.stream) return (warc_headers, http_headers_buff, payload.raw_stream)
def __str__(self): def __str__(self):
return 'WARCPathLoader' return 'WARCPathLoader'

View File

@ -13,10 +13,10 @@ from pywb.webagg.aggregator import DirectoryIndexSource
from pywb.webagg.app import ResAggApp from pywb.webagg.app import ResAggApp
from pywb.webagg.utils import MementoUtils from pywb.webagg.utils import MementoUtils
from pywb.warc.recordloader import ArcWarcRecordLoader from warcio.recordloader import ArcWarcRecordLoader
from warcio.statusandheaders import StatusAndHeadersParser
from warcio.bufferedreaders import ChunkedDataReader
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader
from io import BytesIO from io import BytesIO
from six.moves.urllib.parse import urlencode from six.moves.urllib.parse import urlencode
@ -215,9 +215,9 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
buff = BytesIO(resp.body) buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False) record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False)
print(record.status_headers) print(record.http_headers)
assert record.status_headers.get_statuscode() == '302' assert record.http_headers.get_statuscode() == '302'
assert record.status_headers.get_header('Location') == 'https://www.iana.org/' assert record.http_headers.get_header('Location') == 'https://www.iana.org/'
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live')) @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
def test_agg_select_live(self): def test_agg_select_live(self):

View File

@ -3,7 +3,7 @@ from pywb.webagg.indexsource import LiveIndexSource
from pywb.webagg.aggregator import SimpleAggregator from pywb.webagg.aggregator import SimpleAggregator
from pywb.utils.timeutils import timestamp_now from warcio.timeutils import timestamp_now
from .testutils import key_ts_res, TEST_CDX_PATH from .testutils import key_ts_res, TEST_CDX_PATH

View File

@ -8,7 +8,7 @@ from pywb.webagg.handlers import DefaultResourceHandler
from pywb.webagg.aggregator import SimpleAggregator from pywb.webagg.aggregator import SimpleAggregator
from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
from pywb.warc.recordloader import ArcWarcRecordLoader from warcio.recordloader import ArcWarcRecordLoader
from .testutils import LiveServerTests, BaseTestClass from .testutils import LiveServerTests, BaseTestClass
@ -48,7 +48,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False) record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get' assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
assert record.status_headers.get_header('Date') != '' assert record.http_headers.get_header('Date') != ''
def test_upstream_1(self): def test_upstream_1(self):
resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get') resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get')
@ -58,7 +58,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False) record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get' assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
assert record.status_headers.get_header('Date') != '' assert record.http_headers.get_header('Date') != ''
def test_upstream_2(self): def test_upstream_2(self):
resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get') resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get')
@ -68,7 +68,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False) record = ArcWarcRecordLoader().parse_record_stream(raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get' assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
assert record.status_headers.get_header('Date') != '' assert record.http_headers.get_header('Date') != ''

View File

@ -7,7 +7,9 @@ import zlib
from contextlib import closing from contextlib import closing
from pywb.utils.timeutils import timestamp_to_http_date from warcio.timeutils import timestamp_to_http_date
from warcio.utils import BUFF_SIZE
from pywb.utils.wbexception import BadRequestException from pywb.utils.wbexception import BadRequestException
from pywb.utils.loaders import load_yaml_config from pywb.utils.loaders import load_yaml_config
@ -20,9 +22,6 @@ LINK_SEG_SPLIT = re.compile(';\s*')
LINK_URL = re.compile('<(.*)>') LINK_URL = re.compile('<(.*)>')
LINK_PROP = re.compile('([\w]+)="([^"]+)') LINK_PROP = re.compile('([\w]+)="([^"]+)')
BUFF_SIZE = 16384
#============================================================================= #=============================================================================
class MementoException(BadRequestException): class MementoException(BadRequestException):
pass pass

View File

@ -15,7 +15,7 @@ from pywb.cdx.cdxobject import IDXObject, CDXException, CDXObject
from pywb.cdx.query import CDXQuery from pywb.cdx.query import CDXQuery
from pywb.utils.loaders import BlockLoader, read_last_line from pywb.utils.loaders import BlockLoader, read_last_line
from pywb.utils.bufferedreaders import gzip_decompressor from warcio.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch, search from pywb.utils.binsearch import iter_range, linearsearch, search

View File

@ -5,9 +5,11 @@ import logging
from datetime import datetime from datetime import datetime
from warcio.statusandheaders import StatusAndHeaders
from warcio.timeutils import datetime_to_timestamp
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
from pywb.utils.loaders import LocalFileLoader from pywb.utils.loaders import LocalFileLoader
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
@ -19,7 +21,6 @@ from pywb.warc.pathresolvers import PathResolverMapper
from pywb.webapp.views import J2TemplateView, init_view from pywb.webapp.views import J2TemplateView, init_view
from pywb.webapp.replay_views import ReplayView from pywb.webapp.replay_views import ReplayView
from pywb.framework.memento import MementoResponse from pywb.framework.memento import MementoResponse
from pywb.utils.timeutils import datetime_to_timestamp
#================================================================= #=================================================================

View File

@ -1,5 +1,6 @@
from pywb.utils.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
from pywb.utils.limitreader import LimitReader from warcio.limitreader import LimitReader
from pywb.framework.cache import create_cache from pywb.framework.cache import create_cache
from tempfile import NamedTemporaryFile, mkdtemp from tempfile import NamedTemporaryFile, mkdtemp

View File

@ -5,16 +5,17 @@ from io import BytesIO
from six.moves.urllib.parse import urlsplit from six.moves.urllib.parse import urlsplit
from itertools import chain from itertools import chain
from pywb.utils.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
from warcio.limitreader import LimitReader
from warcio.timeutils import timestamp_now
from warcio.recordloader import ArchiveLoadFailed
from pywb.utils.wbexception import WbException, NotFoundException from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.limitreader import LimitReader
from pywb.utils.timeutils import timestamp_now
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse from pywb.framework.memento import MementoResponse
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.webapp.views import HeadInsertView from pywb.webapp.views import HeadInsertView

View File

@ -1,4 +1,4 @@
from pywb.utils.timeutils import timestamp_to_datetime, timestamp_to_sec from warcio.timeutils import timestamp_to_datetime, timestamp_to_sec
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import make_timemap, LINK_FORMAT from pywb.framework.memento import make_timemap, LINK_FORMAT

View File

@ -71,6 +71,7 @@ setup(
], ],
install_requires=[ install_requires=[
'six', 'six',
'warcio',
'chardet', 'chardet',
'requests', 'requests',
'redis', 'redis',