Skip to content

Commit 6b99fb5

Browse files
committed
cmd/compile: use sparse algorithm for phis in large program
This adds a sparse method for locating nearest ancestors in a dominator tree, and checks blocks with more than one predecessor for differences and inserts phi functions where there are.

Uses reversed post order to cut number of passes, running it from first def to last use ("last use" for paramout and mem is end-of-program; last use for a phi input from a backedge is the source of the back edge)

Includes a cutover from old algorithm to new to avoid paying large constant factor for small programs. This keeps normal builds running at about the same time, while not running over-long on large machine-generated inputs.

Add "phase" flags for ssa/build -- ssa/build/stats prints number of blocks, values (before and after linking references and inserting phis, so expansion can be measured), and their product; the product governs the cutover, where a good value seems to be somewhere between 1 and 5 million.

Among the files compiled by make.bash, this is the shape of the tail of the distribution for #blocks, #vars, and their product:

         #blocks   #vars      product
  max       6171   28180  173,898,780
  99.9%     1641    6548   10,401,878
  99%        463    1909      873,721
  95%        152     639       95,235
  90%         84     359       30,021

The old algorithm is indeed usually fastest, for 99%ile values of usually.

The fix to LookupVarOutgoing ( https://go-review.googlesource.com/#/c/22790/ ) deals with some of the same problems addressed by this CL, but on at least one bug ( #15537 ) this change is still a significant help.

With this CL:

  /tmp/gopath$ rm -rf pkg bin
  /tmp/gopath$ time go get -v -gcflags -memprofile=y.mprof \
      github.com/gogo/protobuf/test/theproto3/combos/...
  ...
  real 4m35.200s
  user 13m16.644s
  sys 0m36.712s

and pprof reports 3.4GB allocated in one of the larger profiles

With tip:

  /tmp/gopath$ rm -rf pkg bin
  /tmp/gopath$ time go get -v -gcflags -memprofile=y.mprof \
      github.com/gogo/protobuf/test/theproto3/combos/...
  ...
  real 10m36.569s
  user 25m52.286s
  sys 4m3.696s

and pprof reports 8.3GB allocated in the same larger profile

With this CL, most of the compilation time on the benchmarked input is spent in register/stack allocation (cumulative 53%) and in the sparse lookup algorithm itself (cumulative 20%).

Fixes #15537.

Change-Id: Ia0299dda6a291534d8b08e5f9883216ded677a00
Reviewed-on: https://go-review.googlesource.com/22342
Reviewed-by: Keith Randall <[email protected]>
Run-TryBot: David Chase <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
1 parent 466cae6 commit 6b99fb5

16 files changed

+1194
-52
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
// Copyright 2016 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package gc
6+
7+
import (
8+
"cmd/compile/internal/ssa"
9+
"fmt"
10+
"math"
11+
)
12+
13+
// sparseDefState contains a Go map from ONAMEs (*Node) to sparse definition trees, and
14+
// a search helper for the CFG's dominator tree in which those definitions are embedded.
15+
// Once initialized, given a use of an ONAME within a block, the ssa definition for
16+
// that ONAME can be discovered in time roughly proportional to the log of the number
17+
// of SSA definitions of that ONAME (thus avoiding pathological quadratic behavior for
18+
// very large programs). The helper contains state (a dominator tree numbering) common
19+
// to all the sparse definition trees, as well as some necessary data obtained from
20+
// the ssa package.
21+
//
22+
// This algorithm has improved asymptotic complexity, but the constant factor is
23+
// rather large and thus it is only preferred for very large inputs containing
24+
// 1000s of blocks and variables.
25+
type sparseDefState struct {
26+
helper *ssa.SparseTreeHelper // contains one copy of information needed to do sparse mapping
27+
defmapForOname map[*Node]*onameDefs // for each ONAME, its definition set (normal and phi)
28+
}
29+
30+
// onameDefs contains a record of definitions (ordinary and implied phi function) for a single OName.
31+
// stm is the set of definitions for the OName.
32+
// firstdef and lastuse are postorder block numberings that
33+
// conservatively bracket the entire lifetime of the OName.
34+
type onameDefs struct {
35+
stm *ssa.SparseTreeMap
36+
// firstdef and lastuse define an interval in the postorder numbering
37+
// that is guaranteed to include the entire lifetime of an ONAME.
38+
// In the postorder numbering, math.MaxInt32 is before anything,
39+
// and 0 is after-or-equal all exit nodes and infinite loops.
40+
firstdef int32 // the first definition of this ONAME *in the postorder numbering*
41+
lastuse int32 // the last use of this ONAME *in the postorder numbering*
42+
}
43+
44+
// defsFor finds or creates-and-inserts-in-map the definition information
45+
// (sparse tree and live range) for a given OName.
46+
func (m *sparseDefState) defsFor(n *Node) *onameDefs {
47+
d := m.defmapForOname[n]
48+
if d != nil {
49+
return d
50+
}
51+
// Reminder: firstdef/lastuse are postorder indices, not block indices,
52+
// so these default values define an empty interval, not the entire one.
53+
d = &onameDefs{stm: m.helper.NewTree(), firstdef: 0, lastuse: math.MaxInt32}
54+
m.defmapForOname[n] = d
55+
return d
56+
}
57+
58+
// Insert adds a definition at b (with specified before/within/after adjustment)
59+
// to sparse tree onameDefs. The lifetime is extended as necessary.
60+
func (m *sparseDefState) Insert(tree *onameDefs, b *ssa.Block, adjust int32) {
61+
bponum := m.helper.Ponums[b.ID]
62+
if bponum > tree.firstdef {
63+
tree.firstdef = bponum
64+
}
65+
tree.stm.Insert(b, adjust, b, m.helper)
66+
}
67+
68+
// Use updates tree to record a use within b, extending the lifetime as necessary.
69+
func (m *sparseDefState) Use(tree *onameDefs, b *ssa.Block) {
70+
bponum := m.helper.Ponums[b.ID]
71+
if bponum < tree.lastuse {
72+
tree.lastuse = bponum
73+
}
74+
}
75+
76+
// locatePotentialPhiFunctions finds all the places where phi functions
77+
// will be inserted into a program and records those and ordinary definitions
78+
// in a "map" (not a Go map) that given an OName and use site, returns the
79+
// SSA definition for that OName that will reach the use site (that is,
80+
// the use site's nearest def/phi site in the dominator tree.)
81+
func (s *state) locatePotentialPhiFunctions(fn *Node) *sparseDefState {
82+
// s.config.SparsePhiCutoff() is compared with product of numblocks and numvalues,
83+
// if product is smaller than cutoff, use old non-sparse method.
84+
// cutoff == 0 implies all sparse
85+
// cutoff == uint(-1) implies all non-sparse
86+
if uint64(s.f.NumValues())*uint64(s.f.NumBlocks()) < s.config.SparsePhiCutoff() {
87+
return nil
88+
}
89+
90+
helper := ssa.NewSparseTreeHelper(s.f)
91+
po := helper.Po // index by block.ID to obtain postorder # of block.
92+
trees := make(map[*Node]*onameDefs)
93+
dm := &sparseDefState{defmapForOname: trees, helper: helper}
94+
95+
// Process params, taking note of their special lifetimes
96+
b := s.f.Entry
97+
for _, n := range fn.Func.Dcl {
98+
switch n.Class {
99+
case PPARAM, PPARAMOUT:
100+
t := dm.defsFor(n)
101+
dm.Insert(t, b, ssa.AdjustBefore) // define param at entry block
102+
if n.Class == PPARAMOUT {
103+
dm.Use(t, po[0]) // Explicitly use PPARAMOUT at very last block
104+
}
105+
default:
106+
}
107+
}
108+
109+
// Process memory variable.
110+
t := dm.defsFor(&memVar)
111+
dm.Insert(t, b, ssa.AdjustBefore) // define memory at entry block
112+
dm.Use(t, po[0]) // Explicitly use memory at last block
113+
114+
// Next load the map w/ basic definitions for ONames recorded per-block
115+
// Iterate over po to avoid unreachable blocks.
116+
for i := len(po) - 1; i >= 0; i-- {
117+
b := po[i]
118+
m := s.defvars[b.ID]
119+
for n := range m { // no specified order, but per-node trees are independent.
120+
t := dm.defsFor(n)
121+
dm.Insert(t, b, ssa.AdjustWithin)
122+
}
123+
}
124+
125+
// Find last use of each variable
126+
for _, v := range s.fwdRefs {
127+
b := v.Block
128+
name := v.Aux.(*Node)
129+
t := dm.defsFor(name)
130+
dm.Use(t, b)
131+
}
132+
133+
for _, t := range trees {
134+
// iterating over names in the outer loop
135+
for change := true; change; {
136+
change = false
137+
for i := t.firstdef; i >= t.lastuse; i-- {
138+
// Iterating in reverse of post-order reduces number of 'change' iterations;
139+
// all possible forward flow goes through each time.
140+
b := po[i]
141+
// Within tree t, would a use at b require a phi function to ensure a single definition?
142+
// TODO: perhaps more efficient to record specific use sites instead of range?
143+
if len(b.Preds) < 2 {
144+
continue // no phi possible
145+
}
146+
phi := t.stm.Find(b, ssa.AdjustWithin, helper) // Look for defs in earlier block or AdjustBefore in this one.
147+
if phi != nil && phi.(*ssa.Block) == b {
148+
continue // has a phi already in this block.
149+
}
150+
var defseen interface{}
151+
// Do preds see different definitions? if so, need a phi function.
152+
for _, e := range b.Preds {
153+
p := e.Block()
154+
dm.Use(t, p) // always count phi pred as "use"; no-op except for loop edges, which matter.
155+
x := t.stm.Find(p, ssa.AdjustAfter, helper) // Look for defs reaching or within predecessors.
156+
if defseen == nil {
157+
defseen = x
158+
}
159+
if defseen != x || x == nil { // TODO: too conservative at loops, does better if x == nil -> continue
160+
// Need to insert a phi function here because predecessors's definitions differ.
161+
change = true
162+
// Phi insertion is at AdjustBefore, visible with find in same block at AdjustWithin or AdjustAfter.
163+
dm.Insert(t, b, ssa.AdjustBefore)
164+
break
165+
}
166+
}
167+
}
168+
}
169+
}
170+
return dm
171+
}
172+
173+
// FindBetterDefiningBlock tries to find a better block for a definition of OName name
174+
// reaching (or within) p than p itself. If it cannot, it returns p instead.
175+
// This aids in more efficient location of phi functions, since it can skip over
176+
// branch code that might contain a definition of name if it actually does not.
177+
func (m *sparseDefState) FindBetterDefiningBlock(name *Node, p *ssa.Block) *ssa.Block {
178+
if m == nil {
179+
return p
180+
}
181+
t := m.defmapForOname[name]
182+
// For now this is fail-soft, since the old algorithm still works using the unimproved block.
183+
if t == nil {
184+
return p
185+
}
186+
x := t.stm.Find(p, ssa.AdjustAfter, m.helper)
187+
if x == nil {
188+
return p
189+
}
190+
b := x.(*ssa.Block)
191+
if b == nil {
192+
return p
193+
}
194+
return b
195+
}
196+
197+
func (d *onameDefs) String() string {
198+
return fmt.Sprintf("onameDefs:first=%d,last=%d,tree=%s", d.firstdef, d.lastuse, d.stm.String())
199+
}

src/cmd/compile/internal/gc/ssa.go

+14-4
Original file line numberDiff line numberDiff line change
@@ -218,8 +218,16 @@ func buildssa(fn *Node) *ssa.Func {
218218
return nil
219219
}
220220

221+
prelinkNumvars := s.f.NumValues()
222+
sparseDefState := s.locatePotentialPhiFunctions(fn)
223+
221224
// Link up variable uses to variable definitions
222-
s.linkForwardReferences()
225+
s.linkForwardReferences(sparseDefState)
226+
227+
if ssa.BuildStats > 0 {
228+
s.f.LogStat("build", s.f.NumBlocks(), "blocks", prelinkNumvars, "vars_before",
229+
s.f.NumValues(), "vars_after", prelinkNumvars*s.f.NumBlocks(), "ssa_phi_loc_cutoff_score")
230+
}
223231

224232
// Don't carry reference this around longer than necessary
225233
s.exitCode = Nodes{}
@@ -3741,7 +3749,8 @@ func (s *state) mem() *ssa.Value {
37413749
return s.variable(&memVar, ssa.TypeMem)
37423750
}
37433751

3744-
func (s *state) linkForwardReferences() {
3752+
func (s *state) linkForwardReferences(dm *sparseDefState) {
3753+
37453754
// Build SSA graph. Each variable on its first use in a basic block
37463755
// leaves a FwdRef in that block representing the incoming value
37473756
// of that variable. This function links that ref up with possible definitions,
@@ -3756,13 +3765,13 @@ func (s *state) linkForwardReferences() {
37563765
for len(s.fwdRefs) > 0 {
37573766
v := s.fwdRefs[len(s.fwdRefs)-1]
37583767
s.fwdRefs = s.fwdRefs[:len(s.fwdRefs)-1]
3759-
s.resolveFwdRef(v)
3768+
s.resolveFwdRef(v, dm)
37603769
}
37613770
}
37623771

37633772
// resolveFwdRef modifies v to be the variable's value at the start of its block.
37643773
// v must be a FwdRef op.
3765-
func (s *state) resolveFwdRef(v *ssa.Value) {
3774+
func (s *state) resolveFwdRef(v *ssa.Value, dm *sparseDefState) {
37663775
b := v.Block
37673776
name := v.Aux.(*Node)
37683777
v.Aux = nil
@@ -3801,6 +3810,7 @@ func (s *state) resolveFwdRef(v *ssa.Value) {
38013810
args := argstore[:0]
38023811
for _, e := range b.Preds {
38033812
p := e.Block()
3813+
p = dm.FindBetterDefiningBlock(name, p) // try sparse improvement on p
38043814
args = append(args, s.lookupVarOutgoing(p, v.Type, name, v.Line))
38053815
}
38063816

src/cmd/compile/internal/ssa/check.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ func checkFunc(f *Func) {
316316
}
317317

318318
// domCheck reports whether x dominates y (including x==y).
319-
func domCheck(f *Func, sdom sparseTree, x, y *Block) bool {
319+
func domCheck(f *Func, sdom SparseTree, x, y *Block) bool {
320320
if !sdom.isAncestorEq(f.Entry, y) {
321321
// unreachable - ignore
322322
return true

src/cmd/compile/internal/ssa/compile.go

+19-2
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,14 @@ func Compile(f *Func) {
8686
// Surround timing information w/ enough context to allow comparisons.
8787
time := tEnd.Sub(tStart).Nanoseconds()
8888
if p.time {
89-
f.logStat("TIME(ns)", time)
89+
f.LogStat("TIME(ns)", time)
9090
}
9191
if p.mem {
9292
var mEnd runtime.MemStats
9393
runtime.ReadMemStats(&mEnd)
9494
nBytes := mEnd.TotalAlloc - mStart.TotalAlloc
9595
nAllocs := mEnd.Mallocs - mStart.Mallocs
96-
f.logStat("TIME(ns):BYTES:ALLOCS", time, nBytes, nAllocs)
96+
f.LogStat("TIME(ns):BYTES:ALLOCS", time, nBytes, nAllocs)
9797
}
9898
}
9999
if checkEnabled {
@@ -124,6 +124,10 @@ var checkEnabled = false
124124
var IntrinsicsDebug int
125125
var IntrinsicsDisable bool
126126

127+
// Settings for the "build" phase. PhaseOption stores the value given for
// the flags "debug", "test" and "stats" in BuildDebug, BuildTest and
// BuildStats respectively.
var (
	BuildDebug int
	BuildTest  int
	BuildStats int
)
130+
127131
// PhaseOption sets the specified flag in the specified ssa phase,
128132
// returning empty string if this was successful or a string explaining
129133
// the error if it was not.
@@ -174,6 +178,19 @@ func PhaseOption(phase, flag string, val int) string {
174178
}
175179
return ""
176180
}
181+
if phase == "build" {
182+
switch flag {
183+
case "debug":
184+
BuildDebug = val
185+
case "test":
186+
BuildTest = val
187+
case "stats":
188+
BuildStats = val
189+
default:
190+
return fmt.Sprintf("Did not find a flag matching %s in -d=ssa/%s debug option", flag, phase)
191+
}
192+
return ""
193+
}
177194

178195
underphase := strings.Replace(phase, "_", " ", -1)
179196
var re *regexp.Regexp

src/cmd/compile/internal/ssa/config.go

+36-13
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,24 @@ import (
99
"crypto/sha1"
1010
"fmt"
1111
"os"
12+
"strconv"
1213
"strings"
1314
)
1415

1516
type Config struct {
16-
arch string // "amd64", etc.
17-
IntSize int64 // 4 or 8
18-
PtrSize int64 // 4 or 8
19-
lowerBlock func(*Block) bool // lowering function
20-
lowerValue func(*Value, *Config) bool // lowering function
21-
registers []Register // machine registers
22-
fe Frontend // callbacks into compiler frontend
23-
HTML *HTMLWriter // html writer, for debugging
24-
ctxt *obj.Link // Generic arch information
25-
optimize bool // Do optimization
26-
noDuffDevice bool // Don't use Duff's device
27-
curFunc *Func
17+
arch string // "amd64", etc.
18+
IntSize int64 // 4 or 8
19+
PtrSize int64 // 4 or 8
20+
lowerBlock func(*Block) bool // lowering function
21+
lowerValue func(*Value, *Config) bool // lowering function
22+
registers []Register // machine registers
23+
fe Frontend // callbacks into compiler frontend
24+
HTML *HTMLWriter // html writer, for debugging
25+
ctxt *obj.Link // Generic arch information
26+
optimize bool // Do optimization
27+
noDuffDevice bool // Don't use Duff's device
28+
sparsePhiCutoff uint64 // Sparse phi location algorithm used above this #blocks*#variables score
29+
curFunc *Func
2830

2931
// TODO: more stuff. Compiler flags of interest, ...
3032

@@ -159,10 +161,27 @@ func NewConfig(arch string, fe Frontend, ctxt *obj.Link, optimize bool) *Config
159161

160162
c.logfiles = make(map[string]*os.File)
161163

164+
// cutoff is compared with product of numblocks and numvalues,
165+
// if product is smaller than cutoff, use old non-sparse method.
166+
// cutoff == 0 implies all sparse.
167+
// cutoff == -1 implies none sparse.
168+
// Good cutoff values seem to be O(million) depending on constant factor cost of sparse.
169+
// TODO: get this from a flag, not an environment variable
170+
c.sparsePhiCutoff = 2500000 // 0 for testing. // 2500000 determined with crude experiments w/ make.bash
171+
ev := os.Getenv("GO_SSA_PHI_LOC_CUTOFF")
172+
if ev != "" {
173+
v, err := strconv.ParseInt(ev, 10, 64)
174+
if err != nil {
175+
fe.Fatalf(0, "Environment variable GO_SSA_PHI_LOC_CUTOFF (value '%s') did not parse as a number", ev)
176+
}
177+
c.sparsePhiCutoff = uint64(v) // convert -1 to maxint, for never use sparse
178+
}
179+
162180
return c
163181
}
164182

165-
func (c *Config) Frontend() Frontend { return c.fe }
183+
func (c *Config) Frontend() Frontend { return c.fe }
184+
// SparsePhiCutoff returns the #blocks*#variables score above which the
// sparse phi location algorithm is used.
func (c *Config) SparsePhiCutoff() uint64 {
	return c.sparsePhiCutoff
}
166185

167186
// NewFunc returns a new, empty function object.
168187
// Caller must call f.Free() before calling NewFunc again.
@@ -259,3 +278,7 @@ func (c *Config) DebugHashMatch(evname, name string) bool {
259278
}
260279
return false
261280
}
281+
282+
func (c *Config) DebugNameMatch(evname, name string) bool {
283+
return os.Getenv(evname) == name
284+
}

0 commit comments

Comments
 (0)