Skip to content

Commit 1d07802

Browse files
committed
fix #2: add charwidth function
1 parent 50381b9 commit 1d07802

8 files changed

+10590
-10087
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
*.dylib
1010
*.dSYM
1111
*.txt
12+
*.ttf
13+
*.sfd
1214
*.out
1315
bench/bench
1416
bench/icu

.travis.yml

+5
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ compiler:
44
- clang
55
notifications:
66
email: false
7+
before_install:
8+
- sudo add-apt-repository ppa:staticfloat/julia-deps -y
9+
- sudo add-apt-repository ppa:staticfloat/juliareleases -y
10+
- sudo apt-get update -qq -y
11+
- sudo apt-get install libpcre3-dev julia fontforge -y
712
script:
813
- make prefix=`pwd`/local install
914
- make check

Makefile

+8-4
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ PERL=perl
77
MAKE=make
88
AR=ar
99
INSTALL=install
10+
JULIA=julia
1011

1112
# compiler settings
1213
cflags = -O2 -std=c99 -pedantic -Wall -fpic -DUTF8PROC_EXPORTS $(CFLAGS)
@@ -41,16 +42,19 @@ all: c-library
4142
c-library: libutf8proc.a libutf8proc.$(SHLIB_EXT)
4243

4344
clean:
44-
rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest data/UnicodeData.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt data/NormalizationTest.txt data/GraphemeBreakTest.txt
45+
rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT) test/normtest test/graphemetest data/UnicodeData.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt data/NormalizationTest.txt data/GraphemeBreakTest.txt data/CharWidths.txt data/unifont*.ttf data/unifont*.sfd
4546
$(MAKE) -C bench clean
4647

4748
update: utf8proc_data.c.new
4849
cp -f utf8proc_data.c.new utf8proc_data.c
4950

5051
# real targets
5152

52-
utf8proc_data.c.new: data/data_generator.rb data/UnicodeData.txt data/GraphemeBreakProperty.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt
53-
(cd data; $(RUBY) data_generator.rb < UnicodeData.txt) > utf8proc_data.c.new
53+
utf8proc_data.c.new: data/data_generator.rb data/UnicodeData.txt data/GraphemeBreakProperty.txt data/DerivedCoreProperties.txt data/CompositionExclusions.txt data/CaseFolding.txt data/CharWidths.txt
54+
(cd data; $(RUBY) data_generator.rb < UnicodeData.txt) > $@
55+
56+
data/CharWidths.txt: data/charwidths.jl libutf8proc.$(SHLIB_EXT)
57+
(cd data; $(JULIA) charwidths.jl) > $@
5458

5559
data/UnicodeData.txt:
5660
$(CURL) -o $@ -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
@@ -85,7 +89,7 @@ libutf8proc.$(MAJOR).dylib: utf8proc.o
8589
$(cc) -dynamiclib -o $@ $^ -install_name $(libdir)/$@ -Wl,-compatibility_version -Wl,$(MAJOR) -Wl,-current_version -Wl,$(MAJOR).$(MINOR).$(PATCH)
8690

8791
libutf8proc.dylib: libutf8proc.$(MAJOR).dylib
88-
ln -s libutf8proc.$(MAJOR).dylib $@
92+
ln -f -s libutf8proc.$(MAJOR).dylib $@
8993

9094
install: libutf8proc.a libutf8proc.$(SHLIB_EXT) libutf8proc.$(SHLIB_VERS_EXT)
9195
mkdir -m 755 -p $(includedir)

data/charwidths.jl

+157
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
# Following work by @jiahao, we compute character widths using a combination of
2+
# * advance widths from GNU Unifont (advance width 512 = 1 en)
3+
# * UAX 11: East Asian Width
4+
# * a few exceptions as needed
5+
# Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
6+
#
7+
# Requires Julia (obviously) and FontForge.
8+
9+
#############################################################################
10+
# Julia 0.3/0.4 compatibility (taken from Compat package)
11+
if VERSION < v"0.4.0-dev+1419"
12+
const UInt16 = Uint16
13+
end
14+
15+
#############################################################################
16+
# Widths from GNU Unifont
17+
18+
universion="7.0.06"
19+
for fontfile in ["unifont-$universion", "unifont_upper-$universion"]
20+
isfile("$fontfile.ttf") || download("http://unifoundry.com/pub/unifont-$universion/font-builds/$fontfile.ttf", "$fontfile.ttf")
21+
isfile("$fontfile.sfd") || run(`fontforge -lang=ff -c "Open(\"$fontfile.ttf\");Save(\"$fontfile.sfd\");Quit(0);"`)
22+
end
23+
24+
#Read sfdfile for character widths
25+
function parsesfd(filename::String, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
26+
state=:seekchar
27+
lineno = 0
28+
for line in readlines(open(filename))
29+
lineno += 1
30+
if state==:seekchar #StartChar: nonmarkingreturn
31+
if contains(line, "StartChar: ")
32+
codepoint = nothing
33+
width = nothing
34+
state = :readdata
35+
end
36+
elseif state==:readdata #Encoding: 65538 -1 2, Width: 1024
37+
contains(line, "Encoding:") && (codepoint = int(split(line)[3]))
38+
contains(line, "Width:") && (width = int(split(line)[2]))
39+
if codepoint!=nothing && width!=nothing && codepoint >= 0
40+
CharWidths[codepoint]=width
41+
state = :seekchar
42+
end
43+
end
44+
end
45+
CharWidths
46+
end
47+
CharWidths=parsesfd("unifont-$universion.sfd")
48+
CharWidths=parsesfd("unifont_upper-$universion.sfd", CharWidths)
49+
50+
# convert from advance width (512 units to the en) to character width
51+
for (c,v) in CharWidths
52+
CharWidths[c] = div(v, 512)
53+
end
54+
55+
#############################################################################
56+
# Widths from UAX #11: East Asian Width
57+
58+
isfile("EastAsianWidth.txt") || download("http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt", "EastAsianWidth.txt")
59+
for line in readlines(open("EastAsianWidth.txt"))
60+
#Strip comments
61+
line[1] == '#' && continue
62+
precomment = split(line, '#')[1]
63+
#Parse code point range and width code
64+
tokens = split(precomment, ';')
65+
length(tokens) >= 2 || continue
66+
charrange = tokens[1]
67+
width = strip(tokens[2])
68+
#Parse code point range into Julia UnitRange
69+
rangetokens = split(charrange, "..")
70+
charstart = uint32("0x"*rangetokens[1])
71+
charend = uint32("0x"*rangetokens[length(rangetokens)>1 ? 2 : 1])
72+
73+
#Assign widths
74+
for c in charstart:charend
75+
width=="N" && continue #Ignore neutral characters
76+
CharWidths[c]=(width=="W" || width=="F") ? 2 : #Wide or full
77+
(width=="Na"|| width=="H" || width=="A") ? 1 : #Narrow or half or ambiguous (default to narrow in non-East-Asian contexts, which we can assume to be the default)
78+
error("Unknown East Asian width code: $width for code point: $c")
79+
end
80+
end
81+
82+
#############################################################################
83+
# A few exceptions to the above cases, found by manual comparison
84+
# to other wcwidth functions.
85+
86+
# Use ../libutf8proc for category codes, rather than the one in Julia,
87+
# to minimize bootstrapping complexity when a new version of Unicode comes out.
88+
function catcode(c)
89+
uint(c) > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs
90+
return unsafe_load(ccall((:utf8proc_get_property,"../libutf8proc"), Ptr{UInt16}, (Int32,), c))
91+
end
92+
93+
94+
# use Base.UTF8proc module to get category code constants, since
95+
# we aren't going to change these in utf8proc.
96+
import Base.UTF8proc
97+
98+
# make sure format control characters (category Cf) have width 0,
99+
# except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
100+
for c in keys(CharWidths)
101+
if catcode(c)==UTF8proc.UTF8PROC_CATEGORY_CF &&
102+
c ∉ [0x0601,0x0602,0x0603,0x06dd]
103+
CharWidths[c]=0
104+
end
105+
end
106+
107+
#By definition, should have zero width (on the same line)
108+
#0x002028 '\u2028' category: Zl name: LINE SEPARATOR/
' category: Zl name: LINE SEPARATOR/
109+
#0x002029 '\u2029' category: Zp name: PARAGRAPH SEPARATOR/
' category: Zp name: PARAGRAPH SEPARATOR/
110+
CharWidths[0x2028]=0
111+
CharWidths[0x2029]=0
112+
113+
#By definition, should be narrow = width of 1 en space
114+
#0x00202f ' ' category: Zs name: NARROW NO-BREAK SPACE/
115+
CharWidths[0x202f]=1
116+
117+
#By definition, should be wide = width of 1 em space
118+
#0x002001 ' ' category: Zs name: EM QUAD/
119+
#0x002003 ' ' category: Zs name: EM SPACE/
120+
CharWidths[0x2001]=2
121+
CharWidths[0x2003]=2
122+
123+
#############################################################################
124+
# Non-printable control characters will be assigned a width of zero
125+
# (wcwidth returns -1 for such characters)
126+
127+
isprintable(c::Union(Char,Integer)) = c <= 0x10ffff && is_valid_char(c) && isprintable_category(catcode(c))
128+
isprintable_category(category) =
129+
!( category==UTF8proc.UTF8PROC_CATEGORY_CN # Unassigned
130+
|| category==UTF8proc.UTF8PROC_CATEGORY_CS # Surrogate
131+
|| category==UTF8proc.UTF8PROC_CATEGORY_CC # Control
132+
|| category==0 # Invalid
133+
)
134+
135+
# Question: should we just use Julia's isprint algorithm here? It is different,
136+
# though it is also based on the character category.
137+
138+
#############################################################################
139+
# Output (to a file or pipe) for processing by data_generator.rb
140+
# ... zero is the default width in data_generator.rb, but zero-width runs are still written out below.
141+
142+
firstc = 0x000000
143+
lastv = 0
144+
uhex(c) = uppercase(hex(c,4))
145+
for c in 0x0000:0x110000
146+
v = isprintable(c) ? get(CharWidths, c, 0) : 0
147+
if v != lastv || c == 0x110000
148+
v < 4 || error("invalid charwidth $v for $c")
149+
if firstc+1 < c
150+
println(uhex(firstc), "..", uhex(c-1), "; ", lastv)
151+
else
152+
println(uhex(firstc), "; ", lastv)
153+
end
154+
firstc = c
155+
lastv = v
156+
end
157+
end

data/data_generator.rb

+14-4
Original file line numberDiff line numberDiff line change
@@ -85,14 +85,23 @@
8585
end
8686
end
8787

88+
$charwidth_list = File.read("CharWidths.txt")
89+
$charwidth = Hash.new(0)
90+
$charwidth_list.each_line do |entry|
91+
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([0-9]+)/
92+
$1.hex.upto($2.hex) { |e2| $charwidth[e2] = $3.to_i }
93+
elsif entry =~ /^([0-9A-F]+)\s*;\s*([0-9]+)/
94+
$charwidth[$1.hex] = $2.to_i
95+
end
96+
end
97+
8898
$exclusions = File.read("CompositionExclusions.txt")[/# \(1\) Script Specifics.*?# Total code points:/m]
8999
$exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex }
90100

91101
$excl_version = File.read("CompositionExclusions.txt")[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m]
92102
$excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }
93103

94-
$case_folding_string = File.open("CaseFolding.txt").read
95-
104+
$case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read
96105
$case_folding = {}
97106
$case_folding_string.chomp.split("\n").each do |line|
98107
next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
@@ -172,7 +181,8 @@ def c_entry(comb1_indicies, comb2_indicies)
172181
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
173182
"#{$ignorable.include?(code)}, " <<
174183
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
175-
"#{$grapheme_boundclass[code]}},\n"
184+
"#{$grapheme_boundclass[code]}, " <<
185+
"#{$charwidth[code]}},\n"
176186
end
177187
end
178188

@@ -295,7 +305,7 @@ def c_entry(comb1_indicies, comb2_indicies)
295305
$stdout << "};\n\n"
296306

297307
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
298-
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
308+
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER, 0},\n"
299309
properties.each { |line|
300310
$stdout << line
301311
}

utf8proc.c

+6
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,12 @@ DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2) {
223223
utf8proc_get_property(c2)->boundclass);
224224
}
225225

226+
/* return a character width analogous to wcwidth (except portable and
227+
hopefully less buggy than most system wcwidth functions). */
228+
DLLEXPORT int utf8proc_charwidth(int32_t c) {
229+
return utf8proc_get_property(c)->charwidth;
230+
}
231+
226232
#define utf8proc_decompose_lump(replacement_uc) \
227233
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
228234
options & ~UTF8PROC_LUMP, last_boundclass)

utf8proc.h

+4
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ typedef struct utf8proc_property_struct {
181181
unsigned ignorable:1;
182182
unsigned control_boundary:1;
183183
unsigned boundclass:4;
184+
unsigned charwidth:2;
184185
} utf8proc_property_t;
185186

186187
#define UTF8PROC_CATEGORY_CN 0
@@ -388,6 +389,9 @@ DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2);
388389
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
389390
*/
390391

392+
DLLEXPORT int utf8proc_charwidth(int32_t c);
393+
/* Given a codepoint c, return a character width analogous to wcwidth(c). */
394+
391395
DLLEXPORT ssize_t utf8proc_map(
392396
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
393397
);

0 commit comments

Comments
 (0)