1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactor: update dependencies, remove watchdog, add gevent and webassets

update tests; tests should pass on both Python 2 and 3!
This commit is contained in:
Ilya Kreymer 2016-11-11 10:32:19 -08:00
parent ab77c1b6d9
commit 8765de4fe7
5 changed files with 32 additions and 28 deletions

View File

@ -9,34 +9,39 @@ from pywb.recorder.filters import SkipDupePolicy
import atexit import atexit
import tempfile import tempfile
import redis import redis
import shutil
upstream_url = 'http://localhost:8080' def main():
upstream_url = 'http://localhost:8080'
target = tempfile.mkdtemp(prefix='tmprec') + '/' target = tempfile.mkdtemp(prefix='tmprec') + '/'
print('Recording to ' + target) print('Recording to ' + target)
def rm_target(): def rm_target():
print('Removing ' + target) print('Removing ' + target)
shutil.rmtree(target) shutil.rmtree(target)
atexit.register(rm_target) atexit.register(rm_target)
local_r = redis.StrictRedis.from_url('redis://localhost/2') local_r = redis.StrictRedis.from_url('redis://localhost/2')
local_r.delete('rec:cdxj') local_r.delete('rec:cdxj')
local_r.delete('rec:warc') local_r.delete('rec:warc')
#target = './_recordings/' #target = './_recordings/'
dedup_index = WritableRedisIndexer( dedup_index = WritableRedisIndexer(
redis_url='redis://localhost/2/rec:cdxj', redis_url='redis://localhost/2/rec:cdxj',
file_key_template='rec:warc', file_key_template='rec:warc',
rel_path_template=target, rel_path_template=target,
dupe_policy=SkipDupePolicy()) dupe_policy=SkipDupePolicy())
recorder_app = RecorderApp(upstream_url, recorder_app = RecorderApp(upstream_url,
MultiFileWARCWriter(target, dedup_index=dedup_index), MultiFileWARCWriter(target, dedup_index=dedup_index),
accept_colls='live') accept_colls='live')
application = recorder_app return recorder_app
if __name__ == '__main__':
application = main()

View File

@ -166,7 +166,7 @@ def seek_read_full(seekable_reader, offset):
def test_s3_read_1(): def test_s3_read_1():
pytest.importorskip('boto') pytest.importorskip('boto')
res = BlockLoader().load('s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz', res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
offset=53235662, offset=53235662,
length=2526) length=2526)

View File

@ -152,19 +152,19 @@ StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0',
>>> cli_lines(['--sort', '-', TEST_WARC_DIR]) >>> cli_lines(['--sort', '-', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
Total: 210 Total: 211
# test sort, multiple inputs, recursive, from base test dir # test sort, multiple inputs, recursive, from base test dir
>>> cli_lines(['--sort', '-r', '-', get_test_dir()]) >>> cli_lines(['--sort', '-r', '-', get_test_dir()])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
Total: 210 Total: 211
# test sort, 9-field, multiple inputs, all records + post query # test sort, 9-field, multiple inputs, all records + post query
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR]) >>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
Total: 404 Total: 406
# test writing to stdout # test writing to stdout
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz']) >>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
@ -188,7 +188,7 @@ Total: 4
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR]) >>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
Total: 210 Total: 211
# test writing to temp dir, also use unicode filename # test writing to temp dir, also use unicode filename
>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz') >>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')

View File

@ -200,8 +200,6 @@ class WARCPathLoader(BaseLoader):
cdx._formatter = formatter cdx._formatter = formatter
yield cdx yield cdx
return cdx_iter
failed_files = [] failed_files = []
headers, payload = (self.resolve_loader. headers, payload = (self.resolve_loader.
load_headers_and_payload(cdx, load_headers_and_payload(cdx,

View File

@ -84,8 +84,9 @@ setup(
'surt>=0.3.0', 'surt>=0.3.0',
'brotlipy', 'brotlipy',
'pyyaml', 'pyyaml',
'watchdog',
'webencodings', 'webencodings',
'gevent>=1.1.1',
'webassets',
], ],
tests_require=[ tests_require=[
'pytest', 'pytest',