mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactor: updated dependencies, remove watchdog, add gevent and webassets
update tests, tests should pass for python 2 and 3!
This commit is contained in:
parent
ab77c1b6d9
commit
8765de4fe7
@ -9,34 +9,39 @@ from pywb.recorder.filters import SkipDupePolicy
|
|||||||
import atexit
|
import atexit
|
||||||
import tempfile
|
import tempfile
|
||||||
import redis
|
import redis
|
||||||
|
import shutil
|
||||||
|
|
||||||
upstream_url = 'http://localhost:8080'
|
def main():
|
||||||
|
upstream_url = 'http://localhost:8080'
|
||||||
|
|
||||||
target = tempfile.mkdtemp(prefix='tmprec') + '/'
|
target = tempfile.mkdtemp(prefix='tmprec') + '/'
|
||||||
|
|
||||||
print('Recording to ' + target)
|
print('Recording to ' + target)
|
||||||
|
|
||||||
def rm_target():
|
def rm_target():
|
||||||
print('Removing ' + target)
|
print('Removing ' + target)
|
||||||
shutil.rmtree(target)
|
shutil.rmtree(target)
|
||||||
|
|
||||||
atexit.register(rm_target)
|
atexit.register(rm_target)
|
||||||
|
|
||||||
local_r = redis.StrictRedis.from_url('redis://localhost/2')
|
local_r = redis.StrictRedis.from_url('redis://localhost/2')
|
||||||
local_r.delete('rec:cdxj')
|
local_r.delete('rec:cdxj')
|
||||||
local_r.delete('rec:warc')
|
local_r.delete('rec:warc')
|
||||||
|
|
||||||
#target = './_recordings/'
|
#target = './_recordings/'
|
||||||
|
|
||||||
dedup_index = WritableRedisIndexer(
|
dedup_index = WritableRedisIndexer(
|
||||||
redis_url='redis://localhost/2/rec:cdxj',
|
redis_url='redis://localhost/2/rec:cdxj',
|
||||||
file_key_template='rec:warc',
|
file_key_template='rec:warc',
|
||||||
rel_path_template=target,
|
rel_path_template=target,
|
||||||
dupe_policy=SkipDupePolicy())
|
dupe_policy=SkipDupePolicy())
|
||||||
|
|
||||||
recorder_app = RecorderApp(upstream_url,
|
recorder_app = RecorderApp(upstream_url,
|
||||||
MultiFileWARCWriter(target, dedup_index=dedup_index),
|
MultiFileWARCWriter(target, dedup_index=dedup_index),
|
||||||
accept_colls='live')
|
accept_colls='live')
|
||||||
|
|
||||||
application = recorder_app
|
return recorder_app
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
application = main()
|
||||||
|
|
||||||
|
@ -166,7 +166,7 @@ def seek_read_full(seekable_reader, offset):
|
|||||||
def test_s3_read_1():
|
def test_s3_read_1():
|
||||||
pytest.importorskip('boto')
|
pytest.importorskip('boto')
|
||||||
|
|
||||||
res = BlockLoader().load('s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
|
res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
|
||||||
offset=53235662,
|
offset=53235662,
|
||||||
length=2526)
|
length=2526)
|
||||||
|
|
||||||
|
@ -152,19 +152,19 @@ StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0',
|
|||||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||||
Total: 210
|
Total: 211
|
||||||
|
|
||||||
# test sort, multiple inputs, recursive, from base test dir
|
# test sort, multiple inputs, recursive, from base test dir
|
||||||
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
|
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
|
||||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
|
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
|
||||||
Total: 210
|
Total: 211
|
||||||
|
|
||||||
# test sort, 9-field, multiple inputs, all records + post query
|
# test sort, 9-field, multiple inputs, all records + post query
|
||||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
||||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
|
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
|
||||||
Total: 404
|
Total: 406
|
||||||
|
|
||||||
# test writing to stdout
|
# test writing to stdout
|
||||||
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
>>> cli_lines(['-', TEST_WARC_DIR + 'example.warc.gz'])
|
||||||
@ -188,7 +188,7 @@ Total: 4
|
|||||||
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
|
||||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
|
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
|
||||||
Total: 210
|
Total: 211
|
||||||
|
|
||||||
# test writing to temp dir, also use unicode filename
|
# test writing to temp dir, also use unicode filename
|
||||||
>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')
|
>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')
|
||||||
|
@ -200,8 +200,6 @@ class WARCPathLoader(BaseLoader):
|
|||||||
cdx._formatter = formatter
|
cdx._formatter = formatter
|
||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
return cdx_iter
|
|
||||||
|
|
||||||
failed_files = []
|
failed_files = []
|
||||||
headers, payload = (self.resolve_loader.
|
headers, payload = (self.resolve_loader.
|
||||||
load_headers_and_payload(cdx,
|
load_headers_and_payload(cdx,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user