Skip to content

Commit 8bcdb3f

Browse files
committed
Merge pull request #10991 from stevengj/nullsafe
fix #10958: buggy handling of embedded NUL chars
2 parents cad0325 + 1d90e97 commit 8bcdb3f

File tree

4 files changed

+20
-18
lines changed

4 files changed

+20
-18
lines changed

base/string.jl

-2
Original file line number | Diff line number | Diff line change
@@ -541,8 +541,6 @@ startswith(a::Array{UInt8,1}, b::Array{UInt8,1}) =
541541
## character column width function ##
542542

543543
strwidth(s::AbstractString) = (w=0; for c in s; w += charwidth(c); end; w)
544-
strwidth(s::ByteString) = Int(ccall(:u8_strwidth, Csize_t, (Ptr{UInt8},), s.data))
545-
# TODO: implement and use u8_strnwidth that takes a length argument
546544

547545
isascii(c::Char) = c < Char(0x80)
548546
isascii(s::AbstractString) = all(isascii, s)

base/utf8.jl

+2 -1
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,8 @@ function endof(s::UTF8String)
3737
end
3838
i
3939
end
40-
length(s::UTF8String) = Int(ccall(:u8_strlen, Csize_t, (Ptr{UInt8},), s.data))
40+
length(s::UTF8String) = Int(ccall(:u8_charnum, Csize_t, (Ptr{UInt8}, Csize_t),
41+
s.data, length(s.data)))
4142

4243
function next(s::UTF8String, i::Int)
4344
# potentially faster version

base/utf8proc.jl

+13 -15
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,6 @@ const UTF8PROC_CATEGORY_CF = 27
4646
const UTF8PROC_CATEGORY_CS = 28
4747
const UTF8PROC_CATEGORY_CO = 29
4848

49-
const UTF8PROC_NULLTERM = (1<<0)
5049
const UTF8PROC_STABLE = (1<<1)
5150
const UTF8PROC_COMPAT = (1<<2)
5251
const UTF8PROC_COMPOSE = (1<<3)
@@ -64,22 +63,21 @@ const UTF8PROC_STRIPMARK = (1<<13)
6463

6564
############################################################################
6665

67-
let
68-
const p = Array(Ptr{UInt8}, 1)
69-
global utf8proc_map
70-
function utf8proc_map(s::AbstractString, flags::Integer)
71-
result = ccall(:utf8proc_map, Cssize_t,
72-
(Ptr{UInt8}, Cssize_t, Ptr{Ptr{UInt8}}, Cint),
73-
s, 0, p, flags | UTF8PROC_NULLTERM)
74-
result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{UInt8},
75-
(Cssize_t,), result)))
76-
a = ccall(:jl_ptr_to_array_1d, Vector{UInt8},
77-
(Any, Ptr{UInt8}, Csize_t, Cint),
78-
Vector{UInt8}, p[1], result, true)
79-
ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
80-
end
66+
function utf8proc_map(s::ByteString, flags::Integer)
67+
p = Ref{Ptr{UInt8}}()
68+
result = ccall(:utf8proc_map, Cssize_t,
69+
(Ptr{UInt8}, Cssize_t, Ref{Ptr{UInt8}}, Cint),
70+
s, sizeof(s), p, flags)
71+
result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{UInt8},
72+
(Cssize_t,), result)))
73+
a = ccall(:jl_ptr_to_array_1d, Vector{UInt8},
74+
(Any, Ptr{UInt8}, Csize_t, Cint),
75+
Vector{UInt8}, p[], result, true)
76+
ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
8177
end
8278

79+
utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(bytestring(s), flags)
80+
8381
function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false)
8482
flags = 0
8583
stable && (flags = flags | UTF8PROC_STABLE)

test/unicode.jl

+5
Original file line number | Diff line number | Diff line change
@@ -129,3 +129,8 @@ end
129129

130130
# up-to-date character widths (#3721, #6939)
131131
@test charwidth('\U1f355') == strwidth("\U1f355") == strwidth(utf16("\U1f355")) == strwidth("\U1f355\u0302") == strwidth(utf16("\U1f355\u0302")) == 2
132+
133+
# handling of embedded NUL chars (#10958)
134+
@test length("\0w") == length("\0α") == 2
135+
@test strwidth("\0w") == strwidth("\0α") == 1
136+
@test normalize_string("\0W", casefold=true) == "\0w"

0 commit comments

Comments
 (0)