Skip to content

Commit 81dba36

Browse files
committed
Add isvalid(Type, value) methods, to replace is_valid_*
1 parent ca2ca31 commit 81dba36

12 files changed

+139
-69
lines changed

base/ascii.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ ascii(x) = convert(ASCIIString, x)
100100
convert(::Type{ASCIIString}, s::ASCIIString) = s
101101
convert(::Type{ASCIIString}, s::UTF8String) = ascii(s.data)
102102
convert(::Type{ASCIIString}, a::Vector{UInt8}) = begin
103-
is_valid_ascii(a) || throw(ArgumentError("invalid ASCII sequence"))
103+
isvalid(ASCIIString,a) || throw(ArgumentError("invalid ASCII sequence"))
104104
return ASCIIString(a)
105105
end
106106

base/deprecated.jl

+13
Original file line numberDiff line numberDiff line change
@@ -443,3 +443,16 @@ export float32_isvalid, float64_isvalid
443443
@deprecate (&)(x::Char, y::Char) Char(UInt32(x) & UInt32(y))
444444
@deprecate (|)(x::Char, y::Char) Char(UInt32(x) | UInt32(y))
445445
@deprecate ($)(x::Char, y::Char) Char(UInt32(x) $ UInt32(y))
446+
447+
# 11241
448+
449+
@deprecate is_valid_char(ch::Char) isvalid(ch)
450+
@deprecate is_valid_char(ch::Union(Unsigned, Integer)) isvalid(Char, ch)
451+
@deprecate is_valid_ascii(str::ASCIIString) isvalid(str)
452+
@deprecate is_valid_ascii(str::Union(AbstractArray{UInt8}, UTF8String)) isvalid(ASCIIString, str)
453+
@deprecate is_valid_utf8(str::UTF8String) isvalid(str)
454+
@deprecate is_valid_utf8(str::Union(AbstractArray{UInt8}, ASCIIString)) isvalid(UTF8String, str)
455+
@deprecate is_valid_utf16(str::UTF16String) isvalid(str)
456+
@deprecate is_valid_utf16(str::AbstractArray{UInt16}) isvalid(UTF16String, str)
457+
@deprecate is_valid_utf32(str::UTF32String) isvalid(str)
458+
@deprecate is_valid_utf32(str::AbstractArray{UInt32}) isvalid(UTF32String, str)

base/exports.jl

-5
Original file line numberDiff line numberDiff line change
@@ -820,11 +820,6 @@ export
820820
ind2chr,
821821
info,
822822
is_assigned_char,
823-
is_valid_ascii,
824-
is_valid_char,
825-
is_valid_utf8,
826-
is_valid_utf16,
827-
is_valid_utf32,
828823
isalnum,
829824
isalpha,
830825
isascii,

base/io.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ end
246246

247247
function readall(s::IO)
248248
b = readbytes(s)
249-
return is_valid_ascii(b) ? ASCIIString(b) : UTF8String(b)
249+
return isvalid(ASCIIString, b) ? ASCIIString(b) : UTF8String(b)
250250
end
251251
readall(filename::AbstractString) = open(readall, filename)
252252

base/string.jl

+2-2
Original file line numberDiff line numberDiff line change
@@ -968,8 +968,8 @@ byte_string_classify(s::ByteString) = byte_string_classify(s.data)
968968
# 1: valid ASCII
969969
# 2: valid UTF-8
970970

971-
is_valid_ascii(s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) == 1
972-
is_valid_utf8(s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) != 0
971+
isvalid(::Type{ASCIIString}, s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) == 1
972+
isvalid(::Type{UTF8String}, s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) != 0
973973

974974
## multiline strings ##
975975

base/utf16.jl

+4-5
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
9595
unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
9696
convert(Ptr{T}, pointer(s))
9797

98-
function is_valid_utf16(data::AbstractArray{UInt16})
98+
isvalid(::Type{UTF16String}, str::UTF16String) = isvalid(UTF16String, str.data)
99+
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
99100
i = 1
100101
n = length(data) # this may include NULL termination; that's okay
101102
while i < n # check for unpaired surrogates
@@ -110,10 +111,8 @@ function is_valid_utf16(data::AbstractArray{UInt16})
110111
return i > n || !utf16_is_surrogate(data[i])
111112
end
112113

113-
is_valid_utf16(s::UTF16String) = is_valid_utf16(s.data)
114-
115114
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
116-
!is_valid_utf16(data) && throw(ArgumentError("invalid UTF16 data"))
115+
!isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
117116
len = length(data)
118117
d = Array(UInt16, len + 1)
119118
d[end] = 0 # NULL terminate
@@ -144,7 +143,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
144143
copy!(d,1, data,1, length(data)) # assume native byte order
145144
end
146145
d[end] = 0 # NULL terminate
147-
!is_valid_utf16(d) && throw(ArgumentError("invalid UTF16 data"))
146+
!isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
148147
UTF16String(d)
149148
end
150149

base/utf32.jl

+5-4
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,14 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
9292
UTF32String(d)
9393
end
9494

95-
function is_valid_utf32(s::Union(Vector{Char}, Vector{UInt32}))
96-
for i=1:length(s)
97-
@inbounds if !is_valid_char(reinterpret(UInt32, s[i])) ; return false ; end
95+
function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
96+
for i=1:length(str)
97+
@inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end
9898
end
9999
return true
100100
end
101-
is_valid_utf32(s::UTF32String) = is_valid_utf32(s.data)
101+
isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
102+
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
102103

103104
utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
104105
utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)

base/utf8.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ write(io::IO, s::UTF8String) = write(io, s.data)
212212
utf8(x) = convert(UTF8String, x)
213213
convert(::Type{UTF8String}, s::UTF8String) = s
214214
convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
215-
convert(::Type{UTF8String}, a::Array{UInt8,1}) = is_valid_utf8(a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence"))
215+
convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence"))
216216
function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
217217
l = length(a)
218218
idx = 1

base/utf8proc.jl

+7-5
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,21 @@
33
# Various Unicode functionality from the utf8proc library
44
module UTF8proc
55

6-
import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert
6+
import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert, isvalid
77

88
export isgraphemebreak
99

1010
# also exported by Base:
11-
export normalize_string, graphemes, is_valid_char, is_assigned_char, charwidth,
11+
export normalize_string, graphemes, is_assigned_char, charwidth, isvalid,
1212
islower, isupper, isalpha, isdigit, isnumber, isalnum,
1313
iscntrl, ispunct, isspace, isprint, isgraph, isblank
1414

1515
# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
16-
is_valid_char(ch::Unsigned) = !Bool((ch-0xd800<0x800)|(ch>0x10ffff))
17-
is_valid_char(ch::Integer) = is_valid_char(Unsigned(ch))
18-
is_valid_char(ch::Char) = is_valid_char(UInt32(ch))
16+
isvalid(::Type{Char}, ch::Unsigned) = !((ch - 0xd800 < 0x800) | (ch > 0x10ffff))
17+
isvalid(::Type{Char}, ch::Integer) = isvalid(Char, Unsigned(ch))
18+
isvalid(::Type{Char}, ch::Char) = isvalid(Char, UInt32(ch))
19+
20+
isvalid(ch::Char) = isvalid(Char, ch)
1921

2022
# utf8 category constants
2123
const UTF8PROC_CATEGORY_CN = 0

doc/manual/strings.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -99,14 +99,14 @@ convert an integer value back to a :obj:`Char` just as easily:
9999
Not all integer values are valid Unicode code points, but for
100100
performance, the :func:`Char` conversion does not check that every character
101101
value is valid. If you want to check that each converted value is a
102-
valid code point, use the :func:`is_valid_char` function:
102+
valid code point, use the :func:`isvalid` function:
103103

104104
.. doctest::
105105

106106
julia> Char(0x110000)
107107
'\U110000'
108108

109-
julia> is_valid_char(0x110000)
109+
julia> isvalid(Char, 0x110000)
110110
false
111111

112112
As of this writing, the valid Unicode code points are ``U+00`` through

doc/stdlib/strings.rst

+10-12
Original file line numberDiff line numberDiff line change
@@ -109,17 +109,19 @@
109109
even though they may contain more than one codepoint; for example
110110
a letter combined with an accent mark is a single grapheme.)
111111

112-
.. function:: is_valid_ascii(s) -> Bool
112+
.. function:: isvalid(value) -> Bool
113113

114-
Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid ASCII, false otherwise.
114+
Returns true if the given value is valid for its type,
115+
which currently can be one of ``Char``, ``ASCIIString``, ``UTF8String``, ``UTF16String``, or ``UTF32String``.
115116

116-
.. function:: is_valid_utf8(s) -> Bool
117+
.. function:: isvalid(T, value) -> Bool
117118

118-
Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid UTF-8, false otherwise.
119-
120-
.. function:: is_valid_char(c) -> Bool
121-
122-
Returns true if the given char or integer is a valid Unicode code point.
119+
Returns true if the given value is valid for that type.
120+
Types currently can be ``Char``, ``ASCIIString``, ``UTF8String``, ``UTF16String``, or ``UTF32String``.
121+
Values for ``Char`` can be of type ``Char`` or any integer type (for example ``UInt32``).
122+
Values for ``ASCIIString`` and ``UTF8String`` can be ``ASCIIString``, ``UTF8String``, or ``Vector{UInt8}``.
123+
Values for ``UTF16String`` can be ``UTF16String`` or ``Vector{UInt16}``.
124+
Values for ``UTF32String`` can be ``UTF32String``, ``Vector{Char}`` or ``Vector{UInt32}``.
123125

124126
.. function:: is_assigned_char(c) -> Bool
125127

@@ -379,10 +381,6 @@
379381

380382
Create a string from the address of a NUL-terminated UTF-16 string. A copy is made; the pointer can be safely freed. If ``length`` is specified, the string does not have to be NUL-terminated.
381383

382-
.. function:: is_valid_utf16(s) -> Bool
383-
384-
Returns true if the argument (``UTF16String`` or ``UInt16`` array) is valid UTF-16.
385-
386384
.. function:: utf32(s)
387385

388386
Create a UTF-32 string from a byte array, array of ``UInt32``, or

test/strings.jl

+93-31
Original file line numberDiff line numberDiff line change
@@ -1281,56 +1281,118 @@ end
12811281
@test isxdigit("a") == true
12821282
@test isxdigit("g") == false
12831283

1284-
@test is_valid_ascii("is_valid_ascii") == true
1285-
@test is_valid_ascii("Σ_not_valid_ascii") == false
1286-
@test is_valid_char('a') == true
1287-
@test is_valid_char('\x00') == true
1288-
@test is_valid_char(0xd800) == false
1289-
1290-
@test is_valid_utf16(utf16("a")) == true
1291-
@test is_valid_utf16(UInt16[0xd800,0]) == false
1292-
# TODO is_valid_utf8
1293-
12941284
# Issue #11140
1295-
@test is_valid_utf32(utf32("a")) == true
1296-
@test is_valid_utf32(utf32("\x00")) == true
1297-
@test is_valid_utf32(UInt32[0xd800,0]) == false
1285+
@test isvalid(utf32("a")) == true
1286+
@test isvalid(utf32("\x00")) == true
1287+
@test isvalid(UTF32String, UInt32[0xd800,0]) == false
1288+
1289+
# Issue #11241
1290+
1291+
@test isvalid(ASCIIString, "is_valid_ascii") == true
1292+
@test isvalid(ASCIIString, "Σ_not_valid_ascii") == false
1293+
1294+
# test all edge conditions
1295+
for (val, pass) in (
1296+
(0, true), (0xd7ff, true),
1297+
(0xd800, false), (0xdfff, false),
1298+
(0xe000, true), (0xffff, true),
1299+
(0x10000, true), (0x10ffff, true),
1300+
(0x110000, false)
1301+
)
1302+
@test isvalid(Char, val) == pass
1303+
end
1304+
for (val, pass) in (
1305+
(b"\x00", true),
1306+
(b"\x7f", true),
1307+
(b"\x80", false),
1308+
(b"\xbf", false),
1309+
(b"\xc0", false),
1310+
(b"\xff", false),
1311+
(b"\xc0\x80", false),
1312+
(b"\xc1\x80", false),
1313+
(b"\xc2\x80", true),
1314+
(b"\xc2\xc0", false),
1315+
(b"\xed\x9f\xbf", true),
1316+
(b"\xed\xa0\x80", false),
1317+
(b"\xed\xbf\xbf", false),
1318+
(b"\xee\x80\x80", true),
1319+
(b"\xef\xbf\xbf", true),
1320+
(b"\xf0\x90\x80\x80", true),
1321+
(b"\xf4\x8f\xbf\xbf", true),
1322+
(b"\xf4\x90\x80\x80", false),
1323+
(b"\xf5\x80\x80\x80", false),
1324+
(b"\ud800\udc00", false),
1325+
(b"\udbff\udfff", false),
1326+
(b"\ud800\u0100", false),
1327+
(b"\udc00\u0100", false),
1328+
(b"\udc00\ud800", false)
1329+
)
1330+
@test isvalid(UTF8String, val) == pass
1331+
end
1332+
for (val, pass) in (
1333+
(UInt16[0x0000], true),
1334+
(UInt16[0xd7ff,0], true),
1335+
(UInt16[0xd800,0], false),
1336+
(UInt16[0xdfff,0], false),
1337+
(UInt16[0xe000,0], true),
1338+
(UInt16[0xffff,0], true),
1339+
(UInt16[0xd800,0xdc00,0], true),
1340+
(UInt16[0xdbff,0xdfff,0], true),
1341+
(UInt16[0xd800,0x0100,0], false),
1342+
(UInt16[0xdc00,0x0100,0], false),
1343+
(UInt16[0xdc00,0xd800,0], false)
1344+
)
1345+
@test isvalid(UTF16String, val) == pass
1346+
end
1347+
for (val, pass) in (
1348+
(UInt32[0x0000], true),
1349+
(UInt32[0xd7ff,0], true),
1350+
(UInt32[0xd800,0], false),
1351+
(UInt32[0xdfff,0], false),
1352+
(UInt32[0xe000,0], true),
1353+
(UInt32[0xffff,0], true),
1354+
(UInt32[0x100000,0], true),
1355+
(UInt32[0x10ffff,0], true),
1356+
(UInt32[0x110000,0], false),
1357+
)
1358+
@test isvalid(UTF32String, val) == pass
1359+
end
12981360

12991361
# Issue #11203
1300-
@test is_valid_ascii(UInt8[]) == true
1301-
@test is_valid_utf8(UInt8[]) == true
1302-
@test is_valid_utf16(UInt16[]) == true
1303-
@test is_valid_utf32(UInt32[]) == true
1362+
@test isvalid(ASCIIString,UInt8[]) == true
1363+
@test isvalid(UTF8String, UInt8[]) == true
1364+
@test isvalid(UTF16String,UInt16[]) == true
1365+
@test isvalid(UTF32String,UInt32[]) == true
13041366

13051367
# Check UTF-8 characters
13061368
# Check ASCII range (true),
13071369
# then single continuation bytes and lead bytes with no following continuation bytes (false)
13081370
for (rng,flg) in ((0:0x7f, true), (0x80:0xff, false))
13091371
for byt in rng
1310-
@test is_valid_utf8(UInt8[byt]) == flg
1372+
@test isvalid(UTF8String, UInt8[byt]) == flg
13111373
end
13121374
end
13131375
# Check overlong lead bytes for 2-character sequences (false)
13141376
for byt = 0xc0:0xc1
1315-
@test is_valid_utf8(UInt8[byt,0x80]) == false
1377+
@test isvalid(UTF8String, UInt8[byt,0x80]) == false
13161378
end
13171379
# Check valid lead-in to two-byte sequences (true)
13181380
for byt = 0xc2:0xdf
13191381
for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
13201382
for cont in rng
1321-
@test is_valid_utf8(UInt8[byt, cont]) == flg
1383+
@test isvalid(UTF8String, UInt8[byt, cont]) == flg
13221384
end
13231385
end
13241386
end
13251387
# Check three-byte sequences
13261388
for r1 in (0xe0:0xec, 0xee:0xef)
13271389
for byt = r1
13281390
# Check for short sequence
1329-
@test is_valid_utf8(UInt8[byt]) == false
1391+
@test isvalid(UTF8String, UInt8[byt]) == false
13301392
for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))
13311393
for cont in rng
1332-
@test is_valid_utf8(UInt8[byt, cont]) == false
1333-
@test is_valid_utf8(UInt8[byt, cont, 0x80]) == flg
1394+
@test isvalid(UTF8String, UInt8[byt, cont]) == false
1395+
@test isvalid(UTF8String, UInt8[byt, cont, 0x80]) == flg
13341396
end
13351397
end
13361398
end
@@ -1339,8 +1401,8 @@ end
13391401
# Check for short sequence, or start of surrogate pair
13401402
for (rng,flg) in ((0x00:0x7f, false), (0x80:0x9f, true), (0xa0:0xff, false))
13411403
for cont in rng
1342-
@test is_valid_utf8(UInt8[0xed, cont]) == false
1343-
@test is_valid_utf8(UInt8[0xed, cont, 0x80]) == flg
1404+
@test isvalid(UTF8String, UInt8[0xed, cont]) == false
1405+
@test isvalid(UTF8String, UInt8[0xed, cont, 0x80]) == flg
13441406
end
13451407
end
13461408
# Check valid four-byte sequences
@@ -1354,22 +1416,22 @@ for byt = 0xf0:0xf4
13541416
end
13551417
for (rng,flg) in r0
13561418
for cont in rng
1357-
@test is_valid_utf8(UInt8[byt, cont]) == false
1358-
@test is_valid_utf8(UInt8[byt, cont, 0x80]) == false
1359-
@test is_valid_utf8(UInt8[byt, cont, 0x80, 0x80]) == flg
1419+
@test isvalid(UTF8String, UInt8[byt, cont]) == false
1420+
@test isvalid(UTF8String, UInt8[byt, cont, 0x80]) == false
1421+
@test isvalid(UTF8String, UInt8[byt, cont, 0x80, 0x80]) == flg
13601422
end
13611423
end
13621424
end
13631425
# Check five-byte sequences, should be invalid
13641426
for byt = 0xf8:0xfb
1365-
@test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80]) == false
1427+
@test isvalid(UTF8String, UInt8[byt, 0x80, 0x80, 0x80, 0x80]) == false
13661428
end
13671429
# Check six-byte sequences, should be invalid
13681430
for byt = 0xfc:0xfd
1369-
@test is_valid_utf8(UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
1431+
@test isvalid(UTF8String, UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
13701432
end
13711433
# Check seven-byte sequences, should be invalid
1372-
@test is_valid_utf8(UInt8[0xfe, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
1434+
@test isvalid(UTF8String, UInt8[0xfe, 0x80, 0x80, 0x80, 0x80, 0x80]) == false
13731435

13741436
# This caused JuliaLang/JSON.jl#82
13751437
@test first('\x00':'\x7f') === '\x00'

0 commit comments

Comments
 (0)