Skip to content

Commit 8304d10

Browse files
committed
cmd/compile: ppc64x intrinsics for math/bits
This adds math/bits intrinsics for OnesCount, Len, and TrailingZeros on ppc64x.

benchmark                       old ns/op     new ns/op     delta
BenchmarkLeadingZeros-16        4.26          1.71          -59.86%
BenchmarkLeadingZeros16-16      3.04          1.83          -39.80%
BenchmarkLeadingZeros32-16      3.31          1.82          -45.02%
BenchmarkLeadingZeros64-16      3.69          1.71          -53.66%
BenchmarkTrailingZeros-16       2.55          1.62          -36.47%
BenchmarkTrailingZeros32-16     2.55          1.77          -30.59%
BenchmarkTrailingZeros64-16     2.78          1.62          -41.73%
BenchmarkOnesCount-16           3.19          0.93          -70.85%
BenchmarkOnesCount32-16         2.55          1.18          -53.73%
BenchmarkOnesCount64-16         3.22          0.93          -71.12%

Update #18616

I also made a change to bits_test.go because, when debugging some failures, the output was not providing the right argument information.

Change-Id: Ia58d31d1777cf4582a4505f85b11a1202ca07d3e
Reviewed-on: https://go-review.googlesource.com/41630
Run-TryBot: Lynn Boger <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Carlos Eduardo Seo <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
1 parent a486409 commit 8304d10

File tree

7 files changed

+279
-17
lines changed

7 files changed

+279
-17
lines changed

src/cmd/compile/internal/gc/ssa.go

+22-12
Original file line number | Diff line number | Diff line change
@@ -2730,12 +2730,12 @@ func init() {
27302730
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
27312731
return s.newValue1(ssa.OpCtz64, types.Types[TINT], args[0])
27322732
},
2733-
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
2733+
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
27342734
addF("math/bits", "TrailingZeros32",
27352735
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
27362736
return s.newValue1(ssa.OpCtz32, types.Types[TINT], args[0])
27372737
},
2738-
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
2738+
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
27392739
addF("math/bits", "TrailingZeros16",
27402740
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
27412741
x := s.newValue1(ssa.OpZeroExt16to32, types.Types[TUINT32], args[0])
@@ -2776,7 +2776,7 @@ func init() {
27762776
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
27772777
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], args[0])
27782778
},
2779-
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
2779+
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
27802780
addF("math/bits", "Len32",
27812781
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
27822782
if s.config.PtrSize == 4 {
@@ -2785,7 +2785,7 @@ func init() {
27852785
x := s.newValue1(ssa.OpZeroExt32to64, types.Types[TUINT64], args[0])
27862786
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
27872787
},
2788-
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
2788+
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
27892789
addF("math/bits", "Len16",
27902790
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
27912791
if s.config.PtrSize == 4 {
@@ -2795,7 +2795,7 @@ func init() {
27952795
x := s.newValue1(ssa.OpZeroExt16to64, types.Types[TUINT64], args[0])
27962796
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
27972797
},
2798-
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
2798+
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
27992799
// Note: disabled on AMD64 because the Go code is faster!
28002800
addF("math/bits", "Len8",
28012801
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
@@ -2806,7 +2806,7 @@ func init() {
28062806
x := s.newValue1(ssa.OpZeroExt8to64, types.Types[TUINT64], args[0])
28072807
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
28082808
},
2809-
sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
2809+
sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
28102810

28112811
addF("math/bits", "Len",
28122812
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
@@ -2815,7 +2815,7 @@ func init() {
28152815
}
28162816
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], args[0])
28172817
},
2818-
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
2818+
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
28192819
// LeadingZeros is handled because it trivially calls Len.
28202820
addF("math/bits", "Reverse64",
28212821
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
@@ -2845,7 +2845,7 @@ func init() {
28452845
return s.newValue1(ssa.OpBitRev64, types.Types[TINT], args[0])
28462846
},
28472847
sys.ARM64)
2848-
makeOnesCount := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
2848+
makeOnesCountAMD64 := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
28492849
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
28502850
aux := s.lookupSymbol(n, &ssa.ExternSymbol{Sym: syslook("support_popcnt").Sym.Linksym()})
28512851
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), aux, s.sb)
@@ -2881,17 +2881,27 @@ func init() {
28812881
}
28822882
}
28832883
addF("math/bits", "OnesCount64",
2884-
makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount64),
2884+
makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount64),
28852885
sys.AMD64)
2886+
addF("math/bits", "OnesCount64",
2887+
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
2888+
return s.newValue1(ssa.OpPopCount64, types.Types[TINT], args[0])
2889+
},
2890+
sys.PPC64)
28862891
addF("math/bits", "OnesCount32",
2887-
makeOnesCount(ssa.OpPopCount32, ssa.OpPopCount32),
2892+
makeOnesCountAMD64(ssa.OpPopCount32, ssa.OpPopCount32),
28882893
sys.AMD64)
2894+
addF("math/bits", "OnesCount32",
2895+
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
2896+
return s.newValue1(ssa.OpPopCount32, types.Types[TINT], args[0])
2897+
},
2898+
sys.PPC64)
28892899
addF("math/bits", "OnesCount16",
2890-
makeOnesCount(ssa.OpPopCount16, ssa.OpPopCount16),
2900+
makeOnesCountAMD64(ssa.OpPopCount16, ssa.OpPopCount16),
28912901
sys.AMD64)
28922902
// Note: no OnesCount8, the Go implementation is faster - just a table load.
28932903
addF("math/bits", "OnesCount",
2894-
makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount32),
2904+
makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32),
28952905
sys.AMD64)
28962906

28972907
/******** sync/atomic ********/

src/cmd/compile/internal/ppc64/ssa.go

+1-1
Original file line number | Diff line number | Diff line change
@@ -596,7 +596,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
596596
p.To.Type = obj.TYPE_REG
597597
p.To.Reg = ppc64.REGTMP // Ignored; this is for the carry effect.
598598

599-
case ssa.OpPPC64NEG, ssa.OpPPC64FNEG, ssa.OpPPC64FSQRT, ssa.OpPPC64FSQRTS, ssa.OpPPC64FCTIDZ, ssa.OpPPC64FCTIWZ, ssa.OpPPC64FCFID, ssa.OpPPC64FRSP:
599+
case ssa.OpPPC64NEG, ssa.OpPPC64FNEG, ssa.OpPPC64FSQRT, ssa.OpPPC64FSQRTS, ssa.OpPPC64FCTIDZ, ssa.OpPPC64FCTIWZ, ssa.OpPPC64FCFID, ssa.OpPPC64FRSP, ssa.OpPPC64CNTLZD, ssa.OpPPC64CNTLZW, ssa.OpPPC64POPCNTD, ssa.OpPPC64POPCNTW, ssa.OpPPC64POPCNTB:
600600
r := v.Reg()
601601
p := s.Prog(v.Op.Asm())
602602
p.To.Type = obj.TYPE_REG

src/cmd/compile/internal/ssa/gen/PPC64.rules

+11
Original file line number | Diff line number | Diff line change
@@ -244,6 +244,17 @@
244244
// (Addr {sym} base) -> (ADDconst {sym} base)
245245
(OffPtr [off] ptr) -> (ADD (MOVDconst <typ.Int64> [off]) ptr)
246246

247+
(Ctz64 x) -> (POPCNTD (ANDN <types.Int64> (ADDconst <types.Int64> [-1] x) x))
248+
(Ctz32 x) -> (POPCNTW (MOVWZreg (ANDN <types.Int> (ADDconst <types.Int> [-1] x) x)))
249+
250+
(BitLen64 x) -> (SUB (MOVDconst [64]) (CNTLZD <types.Int> x))
251+
(BitLen32 x) -> (SUB (MOVDconst [32]) (CNTLZW <types.Int> x))
252+
253+
(PopCount64 x) -> (POPCNTD x)
254+
(PopCount32 x) -> (POPCNTW (MOVWZreg x))
255+
(PopCount16 x) -> (POPCNTW (MOVHZreg x))
256+
(PopCount8 x) -> (POPCNTB (MOVBreg x))
257+
247258
(And64 x y) -> (AND x y)
248259
(And32 x y) -> (AND x y)
249260
(And16 x y) -> (AND x y)

src/cmd/compile/internal/ssa/gen/PPC64Ops.go

+7
Original file line number | Diff line number | Diff line change
@@ -198,6 +198,13 @@ func init() {
198198
{name: "ROTLconst", argLength: 1, reg: gp11, asm: "ROTL", aux: "Int64"}, // arg0 rotate left by auxInt bits
199199
{name: "ROTLWconst", argLength: 1, reg: gp11, asm: "ROTLW", aux: "Int64"}, // uint32(arg0) rotate left by auxInt bits
200200

201+
{name: "CNTLZD", argLength: 1, reg: gp11, asm: "CNTLZD", clobberFlags: true}, // count leading zeros
202+
{name: "CNTLZW", argLength: 1, reg: gp11, asm: "CNTLZW", clobberFlags: true}, // count leading zeros (32 bit)
203+
204+
{name: "POPCNTD", argLength: 1, reg: gp11, asm: "POPCNTD"}, // number of set bits in arg0
205+
{name: "POPCNTW", argLength: 1, reg: gp11, asm: "POPCNTW"}, // number of set bits in each word of arg0 placed in corresponding word
206+
{name: "POPCNTB", argLength: 1, reg: gp11, asm: "POPCNTB"}, // number of set bits in each byte of arg0 placed in corresponding byte
207+
201208
{name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV"}, // arg0/arg1
202209
{name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0/arg1
203210

src/cmd/compile/internal/ssa/opGen.go

+72
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)