add graphemes(s) function to iterate over graphemes (represented by substrings) of a string s

stevengj · stevengj · commit 2b4466018cfc · 2014-12-15T14:40:55.000-05:00
diff --git a/NEWS.md b/NEWS.md
@@ -100,6 +100,8 @@ Library improvements
 
   * Efficient `mean` and `median` for ranges ([#8089]).
 
+  * `graphemes(s)` returns an iterator over grapheme substrings of `s` ([#9261]).
+
   * Character predicates such as `islower()`, `isspace()`, etc. use utf8proc/libmojibake
     to provide uniform cross-platform behavior and up-to-date, locale-independent support
     for Unicode standards ([#5939]).
@@ -1132,4 +1134,6 @@ Too numerous to mention.
 [#9133]: https://github.com/JuliaLang/julia/issues/9133
 [#9144]: https://github.com/JuliaLang/julia/issues/9144
 [#9249]: https://github.com/JuliaLang/julia/issues/9249
+[#9261]: https://github.com/JuliaLang/julia/issues/9261
 [#9271]: https://github.com/JuliaLang/julia/issues/9271
+[#9294]: https://github.com/JuliaLang/julia/issues/9294
diff --git a/base/c.jl b/base/c.jl
@@ -39,6 +39,7 @@ dlclose(p::Ptr) = if p!=C_NULL; ccall(:uv_dlclose,Void,(Ptr{Void},),p); end
 cfunction(f::Function, r, a) =
     ccall(:jl_function_ptr, Ptr{Void}, (Any, Any, Any), f, r, a)
 
+typealias Cbool UInt8
 if ccall(:jl_is_char_signed, Any, ())
     typealias Cchar Int8
 else
diff --git a/base/exports.jl b/base/exports.jl
@@ -119,6 +119,7 @@ export
     Zip,
 
 # Ccall types
+    Cbool,
     Cchar,
     Cdouble,
     Cfloat,
@@ -822,6 +823,7 @@ export
     escape_string,
     float32_isvalid,
     float64_isvalid,
+    graphemes,
     hex,
     hex2bytes,
     ind2chr,
diff --git a/base/string.jl b/base/string.jl
@@ -1729,4 +1729,3 @@ pointer{T<:ByteString}(x::SubString{T}, i::Integer) = pointer(x.string.data) + x
 pointer(x::Union(UTF16String,UTF32String), i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data))
 pointer{T<:Union(UTF16String,UTF32String)}(x::SubString{T}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.data))
 pointer{T<:Union(UTF16String,UTF32String)}(x::SubString{T}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.data))
-
diff --git a/base/utf8.jl b/base/utf8.jl
@@ -114,7 +114,7 @@ function getindex(s::UTF8String, r::UnitRange{Int})
     if !is_utf8_start(d[i])
         i = nextind(s,i)
     end
-    if j > endof(s)
+    if j > length(d)
         throw(BoundsError())
     end
     j = nextind(s,j)-1
diff --git a/base/utf8proc.jl b/base/utf8proc.jl
@@ -1,10 +1,12 @@
 # Various Unicode functionality from the utf8proc library
 module UTF8proc
 
-import Base: show, showcompact, ==, string, symbol, isless
+import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert
+
+export isgraphemebreak
 
 # also exported by Base:
-export normalize_string, is_valid_char, is_assigned_char,
+export normalize_string, graphemes, is_valid_char, is_assigned_char,
    islower, isupper, isalpha, isdigit, isnumber, isalnum,
    iscntrl, ispunct, isspace, isprint, isgraph, isblank
 
@@ -60,6 +62,8 @@ const UTF8PROC_CHARBOUND = (1<<11)
 const UTF8PROC_LUMP      = (1<<12)
 const UTF8PROC_STRIPMARK = (1<<13)
 
+############################################################################
+
 let
     const p = Array(Ptr{UInt8}, 1)
     global utf8proc_map
@@ -110,6 +114,8 @@ function normalize_string(s::AbstractString, nf::Symbol)
                     throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD")))
 end
 
+############################################################################
+
 # returns UTF8PROC_CATEGORY code in 1:30 giving Unicode category
 function category_code(c)
     uint32(c) > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs
@@ -118,8 +124,6 @@ end
 
 is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN
 
-# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?
-
 ## libc character class predicates ##
 
 islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL)
@@ -168,4 +172,54 @@ for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph",
     end
 end
 
+############################################################################
+# iterators for grapheme segmentation
+
+isgraphemebreak(c1::Char, c2::Char) =
+    Bool(ccall(:utf8proc_grapheme_break, Cbool, (Char, Char), c1, c2)) # C result converted to Bool
+
+immutable GraphemeIterator{S<:AbstractString} # iterator object returned by graphemes(s)
+    s::S # original string (for generation of SubStrings)
+end
+graphemes(s::AbstractString) = GraphemeIterator{typeof(s)}(s) # only wraps s; segmentation happens during iteration
+
+eltype{S}(::GraphemeIterator{S}) = SubString{S} # declared element type: a SubString of the wrapped string type S
+
+function length(g::GraphemeIterator)
+    prev = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
+    total = 0
+    for cur in g.s
+        total += isgraphemebreak(prev, cur) # Bool adds as 0 or 1: one count per break before cur
+        prev = cur
+    end
+    return total
+end
+
+start(g::GraphemeIterator) = start(g.s) # iteration state is the wrapped string's own iteration state
+done(g::GraphemeIterator, i) = done(g.s, i) # finished exactly when the wrapped string is exhausted
+
+function next(g::GraphemeIterator, i) # i: index in g.s where the grapheme to return starts
+    s = g.s
+    j = i # start index of the last codepoint known to belong to this grapheme
+    c0, k = next(s, i)
+    while !done(s, k) # loop until next grapheme is s[i:j]
+        c, ℓ = next(s, k)
+        isgraphemebreak(c0, c) && break
+        j = k
+        k = ℓ
+        c0 = c
+    end
+    return (SubString(s, i, j), k) # yield a SubString of s, as eltype declares; k starts the next grapheme
+end
+
+==(g1::GraphemeIterator, g2::GraphemeIterator) = g1.s == g2.s # equality, hashing and ordering all defer to the wrapped string
+hash(g::GraphemeIterator, h::UInt) = hash(g.s, h) # hashes the wrapped string, keeping hash consistent with == above
+isless(g1::GraphemeIterator, g2::GraphemeIterator) = isless(g1.s, g2.s) # lets collections of iterators be sorted like their strings
+
+convert{S<:AbstractString}(::Type{S}, g::GraphemeIterator) = convert(S, g.s) # converts the wrapped string to the requested string type
+
+show{S}(io::IO, g::GraphemeIterator{S}) = print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"") # note: length(g) walks all of g.s
+
+############################################################################
+
 end # module
diff --git a/deps/libmojibake b/deps/libmojibake
@@ -1 +1 @@
-Subproject commit df71da45dfbdf68bcc6fd656d1260d609c728ad7
+Subproject commit 86447ad060d6f4edf01f2a64b9598dfeeb6e6f7d
diff --git a/doc/manual/calling-c-and-fortran-code.rst b/doc/manual/calling-c-and-fortran-code.rst
@@ -223,6 +223,8 @@ Julia type with the same name, prefixed by C. This can help for writing portable
 
 **System-independent:**
 
++------------------------+-------------------+--------------------------------+
+| ``bool``               | ``Cbool``         | ``UInt8``                      |
 +------------------------+-------------------+--------------------------------+
 | ``unsigned char``      | ``Cuchar``        | ``UInt8``                      |
 +------------------------+-------------------+--------------------------------+
diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst
@@ -1415,6 +1415,14 @@ Strings
 
    For example, NFKC corresponds to the options ``compose=true, compat=true, stable=true``.
 
+.. function:: graphemes(s) -> iterator over substrings of s
+
+   Returns an iterator over substrings of ``s`` that correspond to
+   the extended graphemes in the string, as defined by Unicode UAX #29.
+   (Roughly, these grapheme clusters are what users perceive as single
+   characters, even though they may contain more than one codepoint; for
+   example, a letter combined with an accent mark is a single grapheme.)
+
 .. function:: is_valid_ascii(s) -> Bool
 
    Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid ASCII, false otherwise.
diff --git a/test/strings.jl b/test/strings.jl
@@ -1267,6 +1267,11 @@ Base.done(jt::i9178, n) = (jt.ndone += 1 ; n > 3)
 Base.next(jt::i9178, n) = (jt.nnext += 1 ; ("$(jt.nnext),$(jt.ndone)", n+1))
 @test join(i9178(0,0), ";") == "1,1;2,2;3,3;4,4"
 
+# make sure substrings handle last code unit even if not start of codepoint
+let s = "x\u0302"
+    @test s[1:3] == s # in UTF-8, index 3 is the second byte of U+0302, i.e. the final code unit of s
+end
+
 # reverseind
 for T in (ASCIIString, UTF8String, UTF16String, UTF32String)
     for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
@@ -1288,4 +1293,4 @@ for T in (ASCIIString, UTF8String, UTF16String, UTF32String)
             end
         end
     end
-end
+end
diff --git a/test/unicode.jl b/test/unicode.jl
@@ -93,9 +93,35 @@ else
 end
 
 # check utf8proc handling of CN category constants
-
 let c_ll = 'β', c_cn = '\u038B'
     @test Base.UTF8proc.category_code(c_ll) == Base.UTF8proc.UTF8PROC_CATEGORY_LL
     # check codepoint with category code CN
     @test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN
 end
+
+# graphemes
+let grphtest = (("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h",
+                                              "β","l","a","h",
+                                              "b\u0302","l","á","h"]),
+                ("", UTF8String[]),
+                ("x\u0302", ["x\u0302"]),
+                ("\U1d4c1\u0302", ["\U1d4c1\u0302"]),
+                ("\U1d4c1\u0302\U1d4c1\u0300", ["\U1d4c1\u0302",
+                                                "\U1d4c1\u0300"]),
+                ("x",["x"]),
+                ("abc",["a","b","c"]))
+    for T in (utf8,utf16,utf32)
+        for nf in (:NFC, :NFD)
+            for (s, g) in grphtest
+                s_ = T(normalize_string(s, nf))
+                g_ = map(s -> normalize_string(s, nf), g)
+                grph = collect(graphemes(s_))
+                @test grph == g_
+                @test length(graphemes(s_)) == length(grph)
+            end
+            S = [T(normalize_string(s, nf)) for (s,g) in grphtest] # use nf so ordering is checked under both normal forms
+            G = map(graphemes, S)
+            @test map(graphemes, sort!(S)) == sort!(G) # iterators must sort in the same order as their strings
+        end
+    end
+end