Skip to content

Commit b192bf0

Browse files
stevengj authored and tkelman committed
fix #10958: buggy handling of embedded NUL chars
(cherry picked from commit 1d90e97), ref PR #10991. Conflicts: base/string.jl, base/utf8.jl, base/utf8proc.jl, test/unicode.jl
1 parent 2cc7c9d commit b192bf0

File tree

4 files changed

+11
-6
lines changed

4 files changed

+11
-6
lines changed

base/string.jl

-2
Original file line number | Diff line number | Diff line change
@@ -538,8 +538,6 @@ beginswith(a::Array{Uint8,1}, b::Array{Uint8,1}) =
538538

539539
charwidth(c::Char) = max(0,int(ccall(:wcwidth, Int32, (Uint32,), c)))
540540
strwidth(s::String) = (w=0; for c in s; w += charwidth(c); end; w)
541-
strwidth(s::ByteString) = int(ccall(:u8_strwidth, Csize_t, (Ptr{Uint8},), s.data))
542-
# TODO: implement and use u8_strnwidth that takes a length argument
543541

544542
## libc character class predicates ##
545543

base/utf8.jl

+2 -1
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,8 @@ function endof(s::UTF8String)
3737
end
3838
i
3939
end
40-
length(s::UTF8String) = int(ccall(:u8_strlen, Csize_t, (Ptr{Uint8},), s.data))
40+
length(s::UTF8String) = int(ccall(:u8_charnum, Csize_t, (Ptr{Uint8}, Csize_t),
41+
s.data, length(s.data)))
4142

4243
function next(s::UTF8String, i::Int)
4344
# potentially faster version

base/utf8proc.jl

+4 -3
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,6 @@ const UTF8PROC_CATEGORY_CS = 28
4141
const UTF8PROC_CATEGORY_CO = 29
4242
const UTF8PROC_CATEGORY_CN = 30
4343

44-
const UTF8PROC_NULLTERM = (1<<0)
4544
const UTF8PROC_STABLE = (1<<1)
4645
const UTF8PROC_COMPAT = (1<<2)
4746
const UTF8PROC_COMPOSE = (1<<3)
@@ -60,10 +59,10 @@ const UTF8PROC_STRIPMARK = (1<<13)
6059
let
6160
const p = Array(Ptr{Uint8}, 1)
6261
global utf8proc_map
63-
function utf8proc_map(s::String, flags::Integer)
62+
function utf8proc_map(s::ByteString, flags::Integer)
6463
result = ccall(:utf8proc_map, Cssize_t,
6564
(Ptr{Uint8}, Cssize_t, Ptr{Ptr{Uint8}}, Cint),
66-
s, 0, p, flags | UTF8PROC_NULLTERM)
65+
s, sizeof(s), p, flags)
6766
result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{Uint8},
6867
(Cssize_t,), result)))
6968
a = ccall(:jl_ptr_to_array_1d, Vector{Uint8},
@@ -73,6 +72,8 @@ let
7372
end
7473
end
7574

75+
utf8proc_map(s::String, flags::Integer) = utf8proc_map(bytestring(s), flags)
76+
7677
function normalize_string(s::String; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false)
7778
flags = 0
7879
stable && (flags = flags | UTF8PROC_STABLE)

test/unicode.jl

+5
Original file line number | Diff line number | Diff line change
@@ -99,3 +99,8 @@ let c_ll = 'β', c_cn = '\u038B'
9999
# check codepoint with category code CN
100100
@test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN
101101
end
102+
103+
# handling of embedded NUL chars (#10958)
104+
@test length("\0w") == length("\0α") == 2
105+
@test strwidth("\0w") == strwidth("\0α") == 1
106+
@test normalize_string("\0W", casefold=true) == "\0w"

0 commit comments

Comments
 (0)