Skip to content

Commit 41dbcc1

Browse files
committed
runtime: Remove write barriers during STW.
The GC assumes that there will be no asynchronous write barriers when the world is stopped. This keeps the synchronization between write barriers and the GC simple. However, currently, there are a few places in runtime code where this assumption does not hold. The GC stops the world by collecting all Ps, which stops all user Go code, but small parts of the runtime can run without a P. For example, the code that releases a P must still deschedule its G onto a runnable queue before stopping. Similarly, when a G returns from a long-running syscall, it must run code to reacquire a P. Currently, this code can contain write barriers. This can lead to the GC collecting reachable objects if something like the following sequence of events happens: 1. GC stops the world by collecting all Ps. 2. G #1 returns from a syscall (for example), tries to install a pointer to object X, and calls greyobject on X. 3. greyobject on G #1 marks X, but does not yet add it to a write buffer. At this point, X is effectively black, not grey, even though it may point to white objects. 4. GC reaches X through some other path and calls greyobject on X, but greyobject does nothing because X is already marked. 5. GC completes. 6. greyobject on G #1 adds X to a work buffer, but it's too late. 7. Objects that were reachable only through X are incorrectly collected. To fix this, we check the invariant that no asynchronous write barriers happen when the world is stopped by checking that write barriers always have a P, and modify all currently known sources of these writes to disable the write barrier. In all modified cases this is safe because the object in question will always be reachable via some other path. Some of the trace code was turned off, in particular the code that traces returning from a syscall. The GC assumes that as far as the heap is concerned the thread is stopped when it is in a syscall. Upon returning the trace code must not do any heap writes for the same reasons discussed above. 
Fixes #10098 Fixes #9953 Fixes #9951 Fixes #9884 May relate to #9610 #9771 Change-Id: Ic2e70b7caffa053e56156838eb8d89503e3c0c8a Reviewed-on: https://go-review.googlesource.com/7504 Reviewed-by: Austin Clements <[email protected]>
1 parent ce9b512 commit 41dbcc1

File tree

4 files changed

+122
-34
lines changed

4 files changed

+122
-34
lines changed

src/runtime/mbarrier.go

+17-1
Original file line numberDiff line numberDiff line change
@@ -83,15 +83,31 @@ func needwb() bool {
8383
return gcphase == _GCmark || gcphase == _GCmarktermination || mheap_.shadow_enabled
8484
}
8585

86+
// Write barrier calls must not happen during critical GC and scheduler
87+
// related operations. In particular there are times when the GC assumes
88+
// that the world is stopped but scheduler related code is still being
89+
// executed, dealing with syscalls, dealing with putting gs on runnable
90+
// queues and so forth. This code can not execute write barriers because
91+
// the GC might drop them on the floor. Stopping the world involves removing
92+
// the p associated with an m. We use the fact that m.p == nil to indicate
93+
// that we are in one of these critical sections and throw if the write is of
94+
// a pointer to a heap object.
95+
// The p, m, and g pointers are the pointers that are used by the scheduler
96+
// and need to be operated on without write barriers. We use
97+
// the setPNoWriteBarrier, setMNoWriteBarrier and setGNoWriteBarrier to
98+
// avoid having to do the write barrier.
8699
//go:nosplit
87100
func writebarrierptr_nostore1(dst *uintptr, src uintptr) {
88101
mp := acquirem()
89102
if mp.inwb || mp.dying > 0 {
90103
releasem(mp)
91104
return
92105
}
93-
mp.inwb = true
94106
systemstack(func() {
107+
if mp.p == nil && memstats.enablegc && !mp.inwb && inheap(src) {
108+
throw("writebarrierptr_nostore1 called with mp.p == nil")
109+
}
110+
mp.inwb = true
95111
gcmarkwb_m(dst, src)
96112
})
97113
mp.inwb = false

src/runtime/netpoll.go

+10-6
Original file line numberDiff line numberDiff line change
@@ -274,21 +274,25 @@ func net_runtime_pollUnblock(pd *pollDesc) {
274274
}
275275

276276
// make pd ready, newly runnable goroutines (if any) are returned in rg/wg
277+
// May run during STW, so write barriers are not allowed.
278+
// Eliminating WB calls using setGNoWriteBarrier is safe since the gs are
279+
// reachable through allg.
280+
//go:nowritebarrier
277281
func netpollready(gpp **g, pd *pollDesc, mode int32) {
278282
var rg, wg *g
279283
if mode == 'r' || mode == 'r'+'w' {
280-
rg = netpollunblock(pd, 'r', true)
284+
setGNoWriteBarrier(&rg, netpollunblock(pd, 'r', true))
281285
}
282286
if mode == 'w' || mode == 'r'+'w' {
283-
wg = netpollunblock(pd, 'w', true)
287+
setGNoWriteBarrier(&wg, netpollunblock(pd, 'w', true))
284288
}
285289
if rg != nil {
286-
rg.schedlink = *gpp
287-
*gpp = rg
290+
setGNoWriteBarrier(&rg.schedlink, *gpp)
291+
setGNoWriteBarrier(gpp, rg)
288292
}
289293
if wg != nil {
290-
wg.schedlink = *gpp
291-
*gpp = wg
294+
setGNoWriteBarrier(&wg.schedlink, *gpp)
295+
setGNoWriteBarrier(gpp, wg)
292296
}
293297
}
294298

src/runtime/proc1.go

+56-20
Original file line numberDiff line numberDiff line change
@@ -720,7 +720,7 @@ func mstart1() {
720720
initsig()
721721
}
722722

723-
if _g_.m.mstartfn != nil {
723+
if _g_.m.mstartfn != 0 {
724724
fn := *(*func())(unsafe.Pointer(&_g_.m.mstartfn))
725725
fn()
726726
}
@@ -971,17 +971,22 @@ func unlockextra(mp *m) {
971971
}
972972

973973
// Create a new m. It will start off with a call to fn, or else the scheduler.
974+
// fn needs to be static and not a heap allocated closure.
975+
// May run during STW, so write barriers are not allowed.
976+
//go:nowritebarrier
974977
func newm(fn func(), _p_ *p) {
975978
mp := allocm(_p_)
976-
mp.nextp = _p_
977-
mp.mstartfn = *(*unsafe.Pointer)(unsafe.Pointer(&fn))
978-
979+
// procresize made _p_ reachable through allp, which doesn't change during GC, so WB can be eliminated
980+
setPNoWriteBarrier(&mp.nextp, _p_)
981+
// Store &fn as a uintptr since it is not heap allocated so the WB can be eliminated
982+
mp.mstartfn = *(*uintptr)(unsafe.Pointer(&fn))
979983
if iscgo {
980984
var ts cgothreadstart
981985
if _cgo_thread_start == nil {
982986
throw("_cgo_thread_start missing")
983987
}
984-
ts.g = mp.g0
988+
// mp is reachable via allm and mp.g0 never changes, so WB can be eliminated.
989+
setGNoWriteBarrier(&ts.g, mp.g0)
985990
ts.tls = (*uint64)(unsafe.Pointer(&mp.tls[0]))
986991
ts.fn = unsafe.Pointer(funcPC(mstart))
987992
asmcgocall(_cgo_thread_start, unsafe.Pointer(&ts))
@@ -1029,6 +1034,8 @@ func mspinning() {
10291034

10301035
// Schedules some M to run the p (creates an M if necessary).
10311036
// If p==nil, tries to get an idle P, if no idle P's does nothing.
1037+
// May run during STW, so write barriers are not allowed.
1038+
//go:nowritebarrier
10321039
func startm(_p_ *p, spinning bool) {
10331040
lock(&sched.lock)
10341041
if _p_ == nil {
@@ -1058,7 +1065,8 @@ func startm(_p_ *p, spinning bool) {
10581065
throw("startm: m has p")
10591066
}
10601067
mp.spinning = spinning
1061-
mp.nextp = _p_
1068+
// procresize made _p_ reachable through allp, which doesn't change during GC, so WB can be eliminated
1069+
setPNoWriteBarrier(&mp.nextp, _p_)
10621070
notewakeup(&mp.park)
10631071
}
10641072

@@ -1139,6 +1147,8 @@ func stoplockedm() {
11391147
}
11401148

11411149
// Schedules the locked m to run the locked gp.
1150+
// May run during STW, so write barriers are not allowed.
1151+
//go:nowritebarrier
11421152
func startlockedm(gp *g) {
11431153
_g_ := getg()
11441154

@@ -1152,7 +1162,8 @@ func startlockedm(gp *g) {
11521162
// directly handoff current P to the locked m
11531163
incidlelocked(-1)
11541164
_p_ := releasep()
1155-
mp.nextp = _p_
1165+
// procresize made _p_ reachable through allp, which doesn't change during GC, so WB can be eliminated
1166+
setPNoWriteBarrier(&mp.nextp, _p_)
11561167
notewakeup(&mp.park)
11571168
stopm()
11581169
}
@@ -1805,7 +1816,11 @@ func exitsyscall(dummy int32) {
18051816
for oldp != nil && oldp.syscalltick == _g_.m.syscalltick {
18061817
osyield()
18071818
}
1808-
systemstack(traceGoSysExit)
1819+
// This can't be done since the GC may be running and this code
1820+
// will invoke write barriers.
1821+
// TODO: Figure out how to get traceGoSysExit into the trace log or
1822+
// it is likely not to work as expected.
1823+
// systemstack(traceGoSysExit)
18091824
}
18101825

18111826
_g_.m.locks--
@@ -2569,6 +2584,8 @@ func procresize(nprocs int32) *p {
25692584
}
25702585

25712586
// Associate p and the current m.
2587+
// May run during STW, so write barriers are not allowed.
2588+
//go:nowritebarrier
25722589
func acquirep(_p_ *p) {
25732590
_g_ := getg()
25742591

@@ -2583,9 +2600,12 @@ func acquirep(_p_ *p) {
25832600
print("acquirep: p->m=", _p_.m, "(", id, ") p->status=", _p_.status, "\n")
25842601
throw("acquirep: invalid p state")
25852602
}
2586-
_g_.m.mcache = _p_.mcache
2587-
_g_.m.p = _p_
2588-
_p_.m = _g_.m
2603+
// _p_.mcache holds the mcache and _p_ is in allp, so WB can be eliminated
2604+
setMcacheNoWriteBarrier(&_g_.m.mcache, _p_.mcache)
2605+
// _p_ is in allp so WB can be eliminated
2606+
setPNoWriteBarrier(&_g_.m.p, _p_)
2607+
// m is in _g_.m and is reachable through allg, so WB can be eliminated
2608+
setMNoWriteBarrier(&_p_.m, _g_.m)
25892609
_p_.status = _Prunning
25902610

25912611
if trace.enabled {
@@ -2991,34 +3011,44 @@ func schedtrace(detailed bool) {
29913011

29923012
// Put mp on midle list.
29933013
// Sched must be locked.
3014+
// May run during STW, so write barriers are not allowed.
3015+
//go:nowritebarrier
29943016
func mput(mp *m) {
2995-
mp.schedlink = sched.midle
2996-
sched.midle = mp
3017+
// sched.midle is reachable via allm, so WB can be eliminated.
3018+
setMNoWriteBarrier(&mp.schedlink, sched.midle)
3019+
// mp is reachable via allm, so WB can be eliminated.
3020+
setMNoWriteBarrier(&sched.midle, mp)
29973021
sched.nmidle++
29983022
checkdead()
29993023
}
30003024

30013025
// Try to get an m from midle list.
30023026
// Sched must be locked.
3027+
// May run during STW, so write barriers are not allowed.
3028+
//go:nowritebarrier
30033029
func mget() *m {
30043030
mp := sched.midle
30053031
if mp != nil {
3006-
sched.midle = mp.schedlink
3032+
// mp.schedlink is reachable via mp, which is on allm, so WB can be eliminated.
3033+
setMNoWriteBarrier(&sched.midle, mp.schedlink)
30073034
sched.nmidle--
30083035
}
30093036
return mp
30103037
}
30113038

30123039
// Put gp on the global runnable queue.
30133040
// Sched must be locked.
3041+
// May run during STW, so write barriers are not allowed.
3042+
//go:nowritebarrier
30143043
func globrunqput(gp *g) {
30153044
gp.schedlink = nil
30163045
if sched.runqtail != nil {
3017-
sched.runqtail.schedlink = gp
3046+
// gp is on allg, so these three WBs can be eliminated.
3047+
setGNoWriteBarrier(&sched.runqtail.schedlink, gp)
30183048
} else {
3019-
sched.runqhead = gp
3049+
setGNoWriteBarrier(&sched.runqhead, gp)
30203050
}
3021-
sched.runqtail = gp
3051+
setGNoWriteBarrier(&sched.runqtail, gp)
30223052
sched.runqsize++
30233053
}
30243054

@@ -3071,18 +3101,24 @@ func globrunqget(_p_ *p, max int32) *g {
30713101

30723102
// Put p on the _Pidle list.
30733103
// Sched must be locked.
3104+
// May run during STW, so write barriers are not allowed.
3105+
//go:nowritebarrier
30743106
func pidleput(_p_ *p) {
3075-
_p_.link = sched.pidle
3076-
sched.pidle = _p_
3107+
// sched.pidle, _p_.link and _p_ are reachable via allp, so WB can be eliminated.
3108+
setPNoWriteBarrier(&_p_.link, sched.pidle)
3109+
setPNoWriteBarrier(&sched.pidle, _p_)
30773110
xadd(&sched.npidle, 1) // TODO: fast atomic
30783111
}
30793112

30803113
// Try to get a p from the _Pidle list.
30813114
// Sched must be locked.
3115+
// May run during STW, so write barriers are not allowed.
3116+
//go:nowritebarrier
30823117
func pidleget() *p {
30833118
_p_ := sched.pidle
30843119
if _p_ != nil {
3085-
sched.pidle = _p_.link
3120+
// _p_.link is reachable via a _p_ in allp, so WB can be eliminated.
3121+
setPNoWriteBarrier(&sched.pidle, _p_.link)
30863122
xadd(&sched.npidle, -1) // TODO: fast atomic
30873123
}
30883124
return _p_

src/runtime/runtime2.go

+39-7
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,38 @@ func (gp guintptr) ptr() *g {
117117
return (*g)(unsafe.Pointer(gp))
118118
}
119119

120+
// ps, ms, gs, and mcache are structures that must be manipulated at a level
121+
// lower than that of the normal Go language. For example the routine that
122+
// stops the world removes the p from the m structure informing the GC that
123+
// this P is stopped and then it moves the g to the global runnable queue.
124+
// If write barriers were allowed to happen at this point not only does
125+
// the GC think the thread is stopped but the underlying structures
126+
// like a p or m are not in a state that is not coherent enough to
127+
// support the write barrier actions.
128+
// This is particularly painful since a partially executed write barrier
129+
// may mark the object but be delinquent in informing the GC that the
130+
// object needs to be scanned.
131+
132+
// setGNoWriteBarrier does *gdst = gval without a write barrier.
133+
func setGNoWriteBarrier(gdst **g, gval *g) {
134+
*(*uintptr)(unsafe.Pointer(gdst)) = uintptr(unsafe.Pointer(gval))
135+
}
136+
137+
// setMNoWriteBarrier does *mdst = mval without a write barrier.
138+
func setMNoWriteBarrier(mdst **m, mval *m) {
139+
*(*uintptr)(unsafe.Pointer(mdst)) = uintptr(unsafe.Pointer(mval))
140+
}
141+
142+
// setPNoWriteBarrier does *pdst = pval without a write barrier.
143+
func setPNoWriteBarrier(pdst **p, pval *p) {
144+
*(*uintptr)(unsafe.Pointer(pdst)) = uintptr(unsafe.Pointer(pval))
145+
}
146+
147+
// setMcacheNoWriteBarrier does *mcachedst = mcacheval without a write barrier.
148+
func setMcacheNoWriteBarrier(mcachedst **mcache, mcacheval *mcache) {
149+
*(*uintptr)(unsafe.Pointer(mcachedst)) = uintptr(unsafe.Pointer(mcacheval))
150+
}
151+
120152
type gobuf struct {
121153
// The offsets of sp, pc, and g are known to (hard-coded in) libmach.
122154
sp uintptr
@@ -233,13 +265,13 @@ type m struct {
233265
morebuf gobuf // gobuf arg to morestack
234266

235267
// Fields not known to debuggers.
236-
procid uint64 // for debuggers, but offset not hard-coded
237-
gsignal *g // signal-handling g
238-
tls [4]uintptr // thread-local storage (for x86 extern register)
239-
mstartfn unsafe.Pointer // todo go func()
240-
curg *g // current running goroutine
241-
caughtsig *g // goroutine running during fatal signal
242-
p *p // attached p for executing go code (nil if not executing go code)
268+
procid uint64 // for debuggers, but offset not hard-coded
269+
gsignal *g // signal-handling g
270+
tls [4]uintptr // thread-local storage (for x86 extern register)
271+
mstartfn uintptr // TODO: type as func(); note: this is a non-heap allocated func()
272+
curg *g // current running goroutine
273+
caughtsig *g // goroutine running during fatal signal
274+
p *p // attached p for executing go code (nil if not executing go code)
243275
nextp *p
244276
id int32
245277
mallocing int32

0 commit comments

Comments
 (0)