mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
cdxindexing: encode unicode filenames using system encoding,
add test for unicode filenames
This commit is contained in:
parent
4d31c17d4c
commit
e513b3755c
@ -109,7 +109,6 @@ def cdx_filename(filename):
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def write_multi_cdx_index(output, inputs, **options):
|
def write_multi_cdx_index(output, inputs, **options):
|
||||||
|
|
||||||
# write one cdx per dir
|
# write one cdx per dir
|
||||||
if output != '-' and os.path.isdir(output):
|
if output != '-' and os.path.isdir(output):
|
||||||
for fullpath, filename in iter_file_or_dir(inputs):
|
for fullpath, filename in iter_file_or_dir(inputs):
|
||||||
@ -145,6 +144,9 @@ def write_multi_cdx_index(output, inputs, **options):
|
|||||||
def write_cdx_index(outfile, infile, filename, **options):
|
def write_cdx_index(outfile, infile, filename, **options):
|
||||||
writer_cls = options.get('writer_cls')
|
writer_cls = options.get('writer_cls')
|
||||||
|
|
||||||
|
if type(filename) is unicode:
|
||||||
|
filename = filename.encode(sys.getfilesystemencoding())
|
||||||
|
|
||||||
if writer_cls:
|
if writer_cls:
|
||||||
pass
|
pass
|
||||||
elif options.get('sort'):
|
elif options.get('sort'):
|
||||||
|
@ -130,8 +130,8 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
|
|||||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||||
4
|
4
|
||||||
|
|
||||||
# test writing to temp dir
|
# test writing to temp dir, also use unicode filename
|
||||||
>>> cli_lines_with_dir(TEST_WARC_DIR + 'example.warc.gz')
|
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
|
||||||
example.cdx
|
example.cdx
|
||||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||||
|
Loading…
x
Reference in New Issue
Block a user