Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 2b44660

Browse files
committed Dec 15, 2014
add graphemes(s) function to iterate over graphemes (represented by substrings) of a string s
1 parent 3ff5870 commit 2b44660

11 files changed

+110
-9
lines changed
 

‎NEWS.md

+4
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,8 @@ Library improvements
100100

101101
* Efficient `mean` and `median` for ranges ([#8089]).
102102

103+
* `graphemes(s)` returns an iterator over grapheme substrings of `s` ([#9261]).
104+
103105
* Character predicates such as `islower()`, `isspace()`, etc. use utf8proc/libmojibake
104106
to provide uniform cross-platform behavior and up-to-date, locale-independent support
105107
for Unicode standards ([#5939]).
@@ -1132,4 +1134,6 @@ Too numerous to mention.
11321134
[#9133]: https://github.com/JuliaLang/julia/issues/9133
11331135
[#9144]: https://github.com/JuliaLang/julia/issues/9144
11341136
[#9249]: https://github.com/JuliaLang/julia/issues/9249
1137+
[#9261]: https://github.com/JuliaLang/julia/issues/9261
11351138
[#9271]: https://github.com/JuliaLang/julia/issues/9271
1139+
[#9294]: https://github.com/JuliaLang/julia/issues/9294

‎base/c.jl

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ dlclose(p::Ptr) = if p!=C_NULL; ccall(:uv_dlclose,Void,(Ptr{Void},),p); end
3939
cfunction(f::Function, r, a) =
4040
ccall(:jl_function_ptr, Ptr{Void}, (Any, Any, Any), f, r, a)
4141

42+
typealias Cbool UInt8
4243
if ccall(:jl_is_char_signed, Any, ())
4344
typealias Cchar Int8
4445
else

‎base/exports.jl

+2
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ export
119119
Zip,
120120

121121
# Ccall types
122+
Cbool,
122123
Cchar,
123124
Cdouble,
124125
Cfloat,
@@ -822,6 +823,7 @@ export
822823
escape_string,
823824
float32_isvalid,
824825
float64_isvalid,
826+
graphemes,
825827
hex,
826828
hex2bytes,
827829
ind2chr,

‎base/string.jl

-1
Original file line numberDiff line numberDiff line change
@@ -1729,4 +1729,3 @@ pointer{T<:ByteString}(x::SubString{T}, i::Integer) = pointer(x.string.data) + x
17291729
pointer(x::Union(UTF16String,UTF32String), i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data))
17301730
pointer{T<:Union(UTF16String,UTF32String)}(x::SubString{T}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.data))
17311731
pointer{T<:Union(UTF16String,UTF32String)}(x::SubString{T}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.data))
1732-

‎base/utf8.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ function getindex(s::UTF8String, r::UnitRange{Int})
114114
if !is_utf8_start(d[i])
115115
i = nextind(s,i)
116116
end
117-
if j > endof(s)
117+
if j > length(d)
118118
throw(BoundsError())
119119
end
120120
j = nextind(s,j)-1

‎base/utf8proc.jl

+58-4
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
# Various Unicode functionality from the utf8proc library
22
module UTF8proc
33

4-
import Base: show, showcompact, ==, string, symbol, isless
4+
import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert
5+
6+
export isgraphemebreak
57

68
# also exported by Base:
7-
export normalize_string, is_valid_char, is_assigned_char,
9+
export normalize_string, graphemes, is_valid_char, is_assigned_char,
810
islower, isupper, isalpha, isdigit, isnumber, isalnum,
911
iscntrl, ispunct, isspace, isprint, isgraph, isblank
1012

@@ -60,6 +62,8 @@ const UTF8PROC_CHARBOUND = (1<<11)
6062
const UTF8PROC_LUMP = (1<<12)
6163
const UTF8PROC_STRIPMARK = (1<<13)
6264

65+
############################################################################
66+
6367
let
6468
const p = Array(Ptr{UInt8}, 1)
6569
global utf8proc_map
@@ -110,6 +114,8 @@ function normalize_string(s::AbstractString, nf::Symbol)
110114
throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD")))
111115
end
112116

117+
############################################################################
118+
113119
# returns UTF8PROC_CATEGORY code in 1:30 giving Unicode category
114120
function category_code(c)
115121
uint32(c) > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs
@@ -118,8 +124,6 @@ end
118124

119125
is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN
120126

121-
# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?
122-
123127
## libc character class predicates ##
124128

125129
islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL)
@@ -168,4 +172,54 @@ for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph",
168172
end
169173
end
170174

175+
############################################################################
176+
# iterators for grapheme segmentation
177+
178+
isgraphemebreak(c1::Char, c2::Char) = Bool(ccall(:utf8proc_grapheme_break, Cbool, (Char, Char),
179+
c1, c2))
180+
181+
immutable GraphemeIterator{S<:AbstractString}
182+
s::S # original string (for generation of SubStrings)
183+
end
184+
graphemes(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
185+
186+
eltype{S}(::GraphemeIterator{S}) = SubString{S}
187+
188+
function length(g::GraphemeIterator)
189+
c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
190+
n = 0
191+
for c in g.s
192+
n += isgraphemebreak(c0, c)
193+
c0 = c
194+
end
195+
return n
196+
end
197+
198+
start(g::GraphemeIterator) = start(g.s)
199+
done(g::GraphemeIterator, i) = done(g.s, i)
200+
201+
function next(g::GraphemeIterator, i)
202+
s = g.s
203+
j = i
204+
c0, k = next(s, i)
205+
while !done(s, k) # loop until next grapheme is s[i:j]
206+
c, ℓ = next(s, k)
207+
isgraphemebreak(c0, c) && break
208+
j = k
209+
k = ℓ
210+
c0 = c
211+
end
212+
return (s[i:j], k)
213+
end
214+
215+
==(g1::GraphemeIterator, g2::GraphemeIterator) = g1.s == g2.s
216+
hash(g::GraphemeIterator, h::UInt) = hash(g.s, h)
217+
isless(g1::GraphemeIterator, g2::GraphemeIterator) = isless(g1.s, g2.s)
218+
219+
convert{S<:AbstractString}(::Type{S}, g::GraphemeIterator) = convert(S, g.s)
220+
221+
show{S}(io::IO, g::GraphemeIterator{S}) = print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
222+
223+
############################################################################
224+
171225
end # module

‎deps/libmojibake

Submodule libmojibake updated from df71da4 to 86447ad

‎doc/manual/calling-c-and-fortran-code.rst

+2
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,8 @@ Julia type with the same name, prefixed by C. This can help for writing portable
223223

224224
**System-independent:**
225225

226+
+------------------------+-------------------+--------------------------------+
227+
| ``bool`` | ``Cbool`` | ``UInt8`` |
226228
+------------------------+-------------------+--------------------------------+
227229
| ``unsigned char`` | ``Cuchar`` | ``UInt8`` |
228230
+------------------------+-------------------+--------------------------------+

‎doc/stdlib/base.rst

+8
Original file line numberDiff line numberDiff line change
@@ -1415,6 +1415,14 @@ Strings
14151415

14161416
For example, NFKC corresponds to the options ``compose=true, compat=true, stable=true``.
14171417

1418+
.. function:: graphemes(s) -> iterator over substrings of s
1419+
1420+
Returns an iterator over substrings of ``s`` that correspond to
1421+
the extended graphemes in the string, as defined by Unicode UAX #29.
1422+
(Roughly, these are what users would perceive as single characters,
1423+
even though they may contain more than one codepoint; for example
1424+
a letter combined with an accent mark is a single grapheme.)
1425+
14181426
.. function:: is_valid_ascii(s) -> Bool
14191427

14201428
Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid ASCII, false otherwise.

‎test/strings.jl

+6-1
Original file line numberDiff line numberDiff line change
@@ -1267,6 +1267,11 @@ Base.done(jt::i9178, n) = (jt.ndone += 1 ; n > 3)
12671267
Base.next(jt::i9178, n) = (jt.nnext += 1 ; ("$(jt.nnext),$(jt.ndone)", n+1))
12681268
@test join(i9178(0,0), ";") == "1,1;2,2;3,3;4,4"
12691269

1270+
# make sure substrings handle last code unit even if not start of codepoint
1271+
let s = "x\u0302"
1272+
@test s[1:3] == s
1273+
end
1274+
12701275
# reverseind
12711276
for T in (ASCIIString, UTF8String, UTF16String, UTF32String)
12721277
for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
@@ -1288,4 +1293,4 @@ for T in (ASCIIString, UTF8String, UTF16String, UTF32String)
12881293
end
12891294
end
12901295
end
1291-
end
1296+
end

‎test/unicode.jl

+27-1
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,35 @@ else
9393
end
9494

9595
# check utf8proc handling of CN category constants
96-
9796
let c_ll = 'β', c_cn = '\u038B'
9897
@test Base.UTF8proc.category_code(c_ll) == Base.UTF8proc.UTF8PROC_CATEGORY_LL
9998
# check codepoint with category code CN
10099
@test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN
101100
end
101+
102+
# graphemes
103+
let grphtest = (("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h",
104+
"β","l","a","h",
105+
"b\u0302","l","á","h"]),
106+
("", UTF8String[]),
107+
("x\u0302", ["x\u0302"]),
108+
("\U1d4c1\u0302", ["\U1d4c1\u0302"]),
109+
("\U1d4c1\u0302\U1d4c1\u0300", ["\U1d4c1\u0302",
110+
"\U1d4c1\u0300"]),
111+
("x",["x"]),
112+
("abc",["a","b","c"]))
113+
for T in (utf8,utf16,utf32)
114+
for nf in (:NFC, :NFD)
115+
for (s, g) in grphtest
116+
s_ = T(normalize_string(s, nf))
117+
g_ = map(s -> normalize_string(s, nf), g)
118+
grph = collect(graphemes(s_))
119+
@test grph == g_
120+
@test length(graphemes(s_)) == length(grph)
121+
end
122+
S = [T(normalize_string(s)) for (s,g) in grphtest]
123+
G = map(graphemes, S)
124+
@test map(graphemes, sort!(S)) == sort!(G)
125+
end
126+
end
127+
end

0 commit comments

Comments
 (0)
Please sign in to comment.