Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 2c66ba4

Browse files
charlievieth authored and pull[bot] committed on Nov 6, 2024
unicode: improve SimpleFold performance by 2x for non-foldable code points
Change SimpleFold to search the CaseRanges table only once when no folding is
specified for the rune (previously up to two searches could be performed).
This improves performance by 2x for runes that have no folds or are already
upper case. As a side effect this improves the performance of To by roughly 15%.

goos: darwin
goarch: arm64
pkg: unicode
cpu: Apple M1 Max
                     │ base.10.txt  │             new.10.txt              │
                     │    sec/op    │    sec/op     vs base               │
ToUpper-10             11.860n ± 1%    9.731n ± 1%  -17.95% (p=0.000 n=10)
ToLower-10              12.31n ± 1%    10.34n ± 1%  -16.00% (p=0.000 n=10)
SimpleFold/Upper-10     19.16n ± 0%    15.98n ± 1%  -16.64% (p=0.000 n=10)
SimpleFold/Lower-10     32.41n ± 1%    17.09n ± 1%  -47.27% (p=0.000 n=10)
SimpleFold/Fold-10      8.884n ± 4%    8.856n ± 8%        ~ (p=0.700 n=10)
SimpleFold/NoFold-10    30.87n ± 0%    15.49n ± 3%  -49.84% (p=0.000 n=10)
geomean                 17.09n         12.47n       -26.99%

Change-Id: I6e5c7554106842955aadeef7b266c4c7944d3a97
Reviewed-on: https://go-review.googlesource.com/c/go/+/454958
Reviewed-by: Ian Lance Taylor <[email protected]>
Reviewed-by: Dmitri Shuralyov <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Auto-Submit: Ian Lance Taylor <[email protected]>
1 parent 4a687aa commit 2c66ba4

File tree

2 files changed

+68
-25
lines changed

2 files changed

+68
-25
lines changed
 

‎src/unicode/letter.go

+42-25
Original file line number | Diff line number | Diff line change
@@ -206,41 +206,55 @@ func IsTitle(r rune) bool {
206206
return isExcludingLatin(Title, r)
207207
}
208208

209-
// to maps the rune using the specified case mapping.
210-
// It additionally reports whether caseRange contained a mapping for r.
211-
func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) {
212-
if _case < 0 || MaxCase <= _case {
213-
return ReplacementChar, false // as reasonable an error as any
214-
}
209+
// lookupCaseRange returns the CaseRange mapping for rune r or nil if no
// mapping exists for r.
//
// caseRange is binary searched, so it must be ordered by code point with
// non-overlapping [Lo, Hi] intervals. A non-nil result points at the
// matching element of caseRange itself, not at a copy.
func lookupCaseRange(r rune, caseRange []CaseRange) *CaseRange {
	// binary search over ranges
	lo := 0
	hi := len(caseRange)
	for lo < hi {
		// Halve through uint so the midpoint stays correct even if
		// lo+hi were to exceed the range of int.
		m := int(uint(lo+hi) >> 1)
		cr := &caseRange[m]
		if rune(cr.Lo) <= r && r <= rune(cr.Hi) {
			return cr
		}
		if r < rune(cr.Lo) {
			hi = m
		} else {
			lo = m + 1
		}
	}
	return nil
}
229+
230+
// convertCase converts r to _case using CaseRange cr.
//
// Neither argument is validated here: _case must be a valid index into
// cr.Delta, and the result is only meaningful when r lies within
// [cr.Lo, cr.Hi].
func convertCase(_case int, r rune, cr *CaseRange) rune {
	delta := cr.Delta[_case]
	if delta > MaxRune {
		// In an Upper-Lower sequence, which always starts with
		// an UpperCase letter, the real deltas always look like:
		//	{0, 1, 0}    UpperCase (Lower is next)
		//	{-1, 0, -1}  LowerCase (Upper, Title are previous)
		// The characters at even offsets from the beginning of the
		// sequence are upper case; the ones at odd offsets are lower.
		// The correct mapping can be done by clearing or setting the low
		// bit in the sequence offset.
		// The constants UpperCase and TitleCase are even while LowerCase
		// is odd so we take the low bit from _case.
		return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1))
	}
	return r + delta
}
248+
249+
// to maps the rune using the specified case mapping.
250+
// It additionally reports whether caseRange contained a mapping for r.
251+
func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) {
252+
if _case < 0 || MaxCase <= _case {
253+
return ReplacementChar, false // as reasonable an error as any
254+
}
255+
if cr := lookupCaseRange(r, caseRange); cr != nil {
256+
return convertCase(_case, r, cr), true
257+
}
244258
return r, false
245259
}
246260

@@ -364,8 +378,11 @@ func SimpleFold(r rune) rune {
364378
// No folding specified. This is a one- or two-element
365379
// equivalence class containing rune and ToLower(rune)
366380
// and ToUpper(rune) if they are different from rune.
367-
if l := ToLower(r); l != r {
368-
return l
381+
if cr := lookupCaseRange(r, CaseRanges); cr != nil {
382+
if l := convertCase(LowerCase, r, cr); l != r {
383+
return l
384+
}
385+
return convertCase(UpperCase, r, cr)
369386
}
370-
return ToUpper(r)
387+
return r
371388
}

‎src/unicode/letter_test.go

+26
Original file line number | Diff line number | Diff line change
@@ -642,3 +642,29 @@ func TestNegativeRune(t *testing.T) {
642642
}
643643
}
644644
}
645+
646+
// toUpperSink receives every result produced by BenchmarkToUpper so the
// measured call cannot be discarded as dead code.
var toUpperSink rune

// BenchmarkToUpper measures ToUpper on the lower-case Greek letter 'δ'.
func BenchmarkToUpper(b *testing.B) {
	for i := 0; i < b.N; i++ {
		toUpperSink = ToUpper('δ')
	}
}
651+
652+
// toLowerSink receives every result produced by BenchmarkToLower so the
// measured call cannot be discarded as dead code.
var toLowerSink rune

// BenchmarkToLower measures ToLower on the upper-case Greek letter 'Δ'.
func BenchmarkToLower(b *testing.B) {
	for i := 0; i < b.N; i++ {
		toLowerSink = ToLower('Δ')
	}
}
657+
658+
// simpleFoldSink receives every result produced by BenchmarkSimpleFold so
// the measured call cannot be discarded as dead code.
var simpleFoldSink rune

// BenchmarkSimpleFold measures SimpleFold in four sub-benchmarks, one per
// input rune: an upper-case letter ("Upper"), a lower-case letter ("Lower"),
// U+212A ("Fold") and a CJK ideograph ("NoFold").
func BenchmarkSimpleFold(b *testing.B) {
	bench := func(name string, r rune) {
		b.Run(name, func(b *testing.B) {
			for i := 0; i < b.N; i++ {
				simpleFoldSink = SimpleFold(r)
			}
		})
	}
	bench("Upper", 'Δ')
	bench("Lower", 'δ')
	bench("Fold", '\u212A')
	bench("NoFold", '習')
}

0 commit comments

Comments
 (0)
Please sign in to comment.