Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

moji char predicates #8233

Merged
merged 5 commits into from
Sep 13, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ Library improvements

* Efficient `mean` and `median` for ranges ([#8089]).

* Character predicates such as `islower()`, `isspace()`, etc. use utf8proc/libmojibake
to provide uniform cross-platform behavior and up-to-date, locale-independent support
for Unicode standards ([#5939]).


Julia v0.3.0 Release Notes
==========================

Expand Down Expand Up @@ -904,6 +909,7 @@ Too numerous to mention.
[#5832]: https://github.com/JuliaLang/julia/issues/5832
[#5927]: https://github.com/JuliaLang/julia/issues/5927
[#5936]: https://github.com/JuliaLang/julia/issues/5936
[#5939]: https://github.com/JuliaLang/julia/issues/5939
[#5970]: https://github.com/JuliaLang/julia/issues/5970
[#6056]: https://github.com/JuliaLang/julia/issues/6056
[#6057]: https://github.com/JuliaLang/julia/issues/6057
Expand Down
3 changes: 3 additions & 0 deletions base/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -174,3 +174,6 @@ export TcpSocket, UdpSocket, IpAddr
const TcpSocket = TCPSocket
const UdpSocket = UDPSocket
const IpAddr = IPAddr

# isblank is deprecated; callers are pointed at the explicit space-or-tab
# comparison instead (per character, or over every character of a string).
@deprecate isblank(c::Char) c == ' ' || c == '\t'
@deprecate isblank(s::String) all(c -> c == ' ' || c == '\t', s)
2 changes: 1 addition & 1 deletion base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -824,12 +824,12 @@ export
isalnum,
isalpha,
isascii,
isblank,
iscntrl,
isdigit,
isgraph,
islower,
ismatch,
isnumber,
isprint,
ispunct,
isspace,
Expand Down
16 changes: 2 additions & 14 deletions base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -541,22 +541,10 @@ strwidth(s::String) = (w=0; for c in s; w += charwidth(c); end; w)
# Method for byte strings: passes the raw bytes (s.data) to the C routine
# u8_strwidth and converts the Csize_t result with int().
strwidth(s::ByteString) = int(ccall(:u8_strwidth, Csize_t, (Ptr{Uint8},), s.data))
# TODO: implement and use u8_strnwidth that takes a length argument

## libc character class predicates ##

# A character is ASCII when its code point is below 0x80.
function isascii(c::Char)
    return c < 0x80
end

# An ASCIIString is ASCII by construction, so no scan is needed.
isascii(s::ASCIIString) = true

# Any other string is ASCII when every one of its characters is.
function isascii(s::String)
    return all(isascii, s)
end

# Generate one predicate per class name in the tuple below.
# For each name, `f` is the symbol is<name>; two methods are defined with @eval:
#   - the Char method calls the C function isw<name> through ccall, passing the
#     character as a Cwchar_t and converting the Int32 result with bool();
#   - the String method holds when every character satisfies the Char method.
for name = ("alnum", "alpha", "cntrl", "digit", "graph",
"lower", "print", "punct", "space", "upper")
f = symbol(string("is",name))
@eval ($f)(c::Char) = bool(ccall($(string("isw",name)), Int32, (Cwchar_t,), c))
@eval $f(s::String) = all($f, s)
end

# A blank character is a space or a horizontal tab.
function isblank(c::Char)
    return c == ' ' || c == '\t'
end

# A string is blank when all of its characters are (true for the empty string).
function isblank(s::String)
    for ch in s
        isblank(ch) || return false
    end
    return true
end

## generic string uses only endof and next ##

immutable GenericString <: String
Expand Down Expand Up @@ -987,7 +975,7 @@ end
function indentation(s::String)
count = 0
for c in s
if isblank(c)
if c == ' ' || c == '\t'
count += blank_width(c)
else
return count, false
Expand All @@ -1005,7 +993,7 @@ function unindent(s::String, indent::Int)
cut = 0
while !done(s,i)
c,i_ = next(s,i)
if cutting && isblank(c)
if cutting && (c == ' ' || c == '\t')
a = i_
cut += blank_width(c)
if cut == indent
Expand Down
65 changes: 64 additions & 1 deletion base/utf8proc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ module UTF8proc
import Base: show, showcompact, ==, string, symbol, isless

# also exported by Base:
export normalize_string, is_valid_char, is_assigned_char
export normalize_string, is_valid_char, is_assigned_char,
islower, isupper, isalpha, isdigit, isnumber, isalnum,
iscntrl, ispunct, isspace, isprint, isgraph, isblank

# whether codepoints are valid Unicode
is_valid_char(c) = bool(ccall(:utf8proc_codepoint_valid, Cchar, (Int32,), c))
Expand Down Expand Up @@ -117,6 +119,67 @@ end

is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN

# Return the category code of `c` as reported by utf8proc.
# Variant of category_code() that skips the special handling of the
# unassigned category CN; used by the character class predicates for
# improved performance.
function _catcode(c)
    # values above 0x10FFFF are not passed to the library; report code 0
    # instead (see utf8proc_get_property docs)
    c > 0x10FFFF && return 0x0000
    # read the Uint16 stored at the pointer returned by utf8proc_get_property
    return unsafe_load(ccall(:utf8proc_get_property, Ptr{Uint16}, (Int32,), c))
end

# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?


## character class predicates (libc-style names, implemented without libc) ##

# true when the category code of `c` is LL
function islower(c::Char)
    return UTF8PROC_CATEGORY_LL == _catcode(c)
end

# true for characters whose category code is LU (uppercase) or LT (titlecase)
function isupper(c::Char)
    code = _catcode(c)
    if code == UTF8PROC_CATEGORY_LU
        return true
    end
    return code == UTF8PROC_CATEGORY_LT
end

# true when the category code of `c` lies in the range LU..LO (inclusive)
function isalpha(c::Char)
    code = _catcode(c)
    return code >= UTF8PROC_CATEGORY_LU && code <= UTF8PROC_CATEGORY_LO
end

# only the ten ASCII digits '0'..'9' count as digits
function isdigit(c::Char)
    return c >= '0' && c <= '9'
end

# true when the category code of `c` lies in the range ND..NO (inclusive)
function isnumber(c::Char)
    code = _catcode(c)
    return code >= UTF8PROC_CATEGORY_ND && code <= UTF8PROC_CATEGORY_NO
end

# true when the category code of `c` falls in either the LU..LO range or the
# ND..NO range; the category is looked up only once
function isalnum(c::Char)
    code = _catcode(c)
    letter = UTF8PROC_CATEGORY_LU <= code && code <= UTF8PROC_CATEGORY_LO
    numeric = UTF8PROC_CATEGORY_ND <= code && code <= UTF8PROC_CATEGORY_NO
    return letter || numeric
end

# Following C++, only control characters from the Latin-1 subset return true:
# code points 0x00-0x1f and 0x7f-0x9f.
function iscntrl(c::Char)
    code = uint(c)
    return code <= 0x1f || (0x7f <= code && code <= 0x9f)
end

# true when the category code of `c` lies in the range PC..PO (inclusive)
function ispunct(c::Char)
    code = _catcode(c)
    return code >= UTF8PROC_CATEGORY_PC && code <= UTF8PROC_CATEGORY_PO
end

# Whitespace is: the ASCII space, anything in the range '\t'..'\r',
# 0x85 (the Unicode Next Line character, NEL), or any character whose
# category code is ZS.
function isspace(c::Char)
    # plain comparisons first; the category lookup runs only if they all fail
    if c == ' ' || ('\t' <= c && c <= '\r') || c == 0x85
        return true
    end
    return _catcode(c) == UTF8PROC_CATEGORY_ZS
end

# true when the category code of `c` lies in the range LU..ZS (inclusive)
function isprint(c::Char)
    code = _catcode(c)
    return code >= UTF8PROC_CATEGORY_LU && code <= UTF8PROC_CATEGORY_ZS
end

# true in principle if a printer would use ink:
# the category code of `c` lies in the range LU..SO (inclusive)
function isgraph(c::Char)
    code = _catcode(c)
    return code >= UTF8PROC_CATEGORY_LU && code <= UTF8PROC_CATEGORY_SO
end

# Give every character predicate listed here a String method: the string
# satisfies the predicate when each of its characters does (so the empty
# string always does). Scanning stops at the first character that fails.
for f in (:isalnum, :isalpha, :iscntrl, :isdigit, :isnumber, :isgraph,
          :islower, :isprint, :ispunct, :isspace, :isupper)
    @eval function $f(s::String)
        for ch in s
            $f(ch) || return false
        end
        return true
    end
end


end # module
Loading