Skip to content

Commit faa7a7e

Browse files
committed
runtime: implement GC stack barriers
This commit implements stack barriers to minimize the amount of stack re-scanning that must be done during mark termination.

Currently the GC scans stacks of active goroutines twice during every GC cycle: once at the beginning during root discovery and once at the end during mark termination. The second scan happens while the world is stopped and guarantees that we've seen all of the roots (since there are no write barriers on writes to local stack variables). However, this means pause time is proportional to stack size. In particularly recursive programs, this can drive pause time up past our 10ms goal (e.g., it takes about 150ms to scan a 50MB heap).

Re-scanning the entire stack is rarely necessary, especially for large stacks, because usually most of the frames on the stack were not active between the first and second scans and hence any changes to these frames (via non-escaping pointers passed down the stack) were tracked by write barriers.

To efficiently track how far a stack has been unwound since the first scan (and, hence, how much needs to be re-scanned), this commit introduces stack barriers. During the first scan, at exponentially spaced points in each stack, the scan overwrites return PCs with the PC of the stack barrier function. When "returned" to, the stack barrier function records how far the stack has unwound and jumps to the original return PC for that point in the stack. Then the second scan only needs to proceed as far as the lowest barrier that hasn't been hit.

For deeply recursive programs, this substantially reduces mark termination time (and hence pause time). For the goscheme example linked in issue #10898, prior to this change, mark termination times were typically between 100 and 500ms; with this change, mark termination times are typically between 10 and 20ms. As a result of the reduced stack scanning work, this reduces overall execution time of the goscheme example by 20%.

Fixes #10898.

The effect of this on programs that are not deeply recursive is minimal:

name old time/op new time/op delta
BinaryTree17 3.16s ± 2% 3.26s ± 1% +3.31% (p=0.000 n=19+19)
Fannkuch11 2.42s ± 1% 2.48s ± 1% +2.24% (p=0.000 n=17+19)
FmtFprintfEmpty 50.0ns ± 3% 49.8ns ± 1% ~ (p=0.534 n=20+19)
FmtFprintfString 173ns ± 0% 175ns ± 0% +1.49% (p=0.000 n=16+19)
FmtFprintfInt 170ns ± 1% 175ns ± 1% +2.97% (p=0.000 n=20+19)
FmtFprintfIntInt 288ns ± 0% 295ns ± 0% +2.73% (p=0.000 n=16+19)
FmtFprintfPrefixedInt 242ns ± 1% 252ns ± 1% +4.13% (p=0.000 n=18+18)
FmtFprintfFloat 324ns ± 0% 323ns ± 0% -0.36% (p=0.000 n=20+19)
FmtManyArgs 1.14µs ± 0% 1.12µs ± 1% -1.01% (p=0.000 n=18+19)
GobDecode 8.88ms ± 1% 8.87ms ± 0% ~ (p=0.480 n=19+18)
GobEncode 6.80ms ± 1% 6.85ms ± 0% +0.82% (p=0.000 n=20+18)
Gzip 363ms ± 1% 363ms ± 1% ~ (p=0.077 n=18+20)
Gunzip 90.6ms ± 0% 90.0ms ± 1% -0.71% (p=0.000 n=17+18)
HTTPClientServer 51.5µs ± 1% 50.8µs ± 1% -1.32% (p=0.000 n=18+18)
JSONEncode 17.0ms ± 0% 17.1ms ± 0% +0.40% (p=0.000 n=18+17)
JSONDecode 61.8ms ± 0% 63.8ms ± 1% +3.11% (p=0.000 n=18+17)
Mandelbrot200 3.84ms ± 0% 3.84ms ± 1% ~ (p=0.583 n=19+19)
GoParse 3.71ms ± 1% 3.72ms ± 1% ~ (p=0.159 n=18+19)
RegexpMatchEasy0_32 100ns ± 0% 100ns ± 1% -0.19% (p=0.033 n=17+19)
RegexpMatchEasy0_1K 342ns ± 1% 331ns ± 0% -3.41% (p=0.000 n=19+19)
RegexpMatchEasy1_32 82.5ns ± 0% 81.7ns ± 0% -0.98% (p=0.000 n=18+18)
RegexpMatchEasy1_1K 505ns ± 0% 494ns ± 1% -2.16% (p=0.000 n=18+18)
RegexpMatchMedium_32 137ns ± 1% 137ns ± 1% -0.24% (p=0.048 n=20+18)
RegexpMatchMedium_1K 41.6µs ± 0% 41.3µs ± 1% -0.57% (p=0.004 n=18+20)
RegexpMatchHard_32 2.11µs ± 0% 2.11µs ± 1% +0.20% (p=0.037 n=17+19)
RegexpMatchHard_1K 63.9µs ± 2% 63.3µs ± 0% -0.99% (p=0.000 n=20+17)
Revcomp 560ms ± 1% 522ms ± 0% -6.87% (p=0.000 n=18+16)
Template 75.0ms ± 0% 75.1ms ± 1% +0.18% (p=0.013 n=18+19)
TimeParse 358ns ± 1% 364ns ± 0% +1.74% (p=0.000 n=20+15)
TimeFormat 360ns ± 0% 372ns ± 0% +3.55% (p=0.000 n=20+18)

Change-Id: If8a9bfae6c128d15a4f405e02bcfa50129df82a2
Reviewed-on: https://go-review.googlesource.com/10314
Reviewed-by: Russ Cox <[email protected]>
Run-TryBot: Austin Clements <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
1 parent 724f829 commit faa7a7e

13 files changed

+481
-23
lines changed

src/runtime/asm_386.s

+32-2
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,22 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0
341341
MOVL $0, DX
342342
JMP runtime·morestack(SB)
343343

344+
TEXT runtime·stackBarrier(SB),NOSPLIT,$0
345+
// We came here via a RET to an overwritten return PC.
346+
// AX may be live. Other registers are available.
347+
348+
// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
349+
get_tls(CX)
350+
MOVL g(CX), CX
351+
MOVL (g_stkbar+slice_array)(CX), DX
352+
MOVL g_stkbarPos(CX), BX
353+
IMULL $stkbar__size, BX // Too big for SIB.
354+
MOVL stkbar_savedLRVal(DX)(BX*1), BX
355+
// Record that this stack barrier was hit.
356+
ADDL $1, g_stkbarPos(CX)
357+
// Jump to the original return PC.
358+
JMP BX
359+
344360
// reflectcall: call a function with the given argument list
345361
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
346362
// we don't have variable-sized frames, so we use a small number
@@ -860,17 +876,31 @@ TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
860876
INT $3
861877
RET
862878

863-
TEXT runtime·getcallerpc(SB),NOSPLIT,$0-8
879+
TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
864880
MOVL argp+0(FP),AX // addr of first arg
865881
MOVL -4(AX),AX // get calling pc
882+
CMPL AX, runtime·stackBarrierPC(SB)
883+
JNE nobar
884+
// Get original return PC.
885+
CALL runtime·nextBarrierPC(SB)
886+
MOVL 0(SP), AX
887+
nobar:
866888
MOVL AX, ret+4(FP)
867889
RET
868890

869-
TEXT runtime·setcallerpc(SB),NOSPLIT,$0-8
891+
TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
870892
MOVL argp+0(FP),AX // addr of first arg
871893
MOVL pc+4(FP), BX
894+
MOVL -4(AX), CX
895+
CMPL CX, runtime·stackBarrierPC(SB)
896+
JEQ setbar
872897
MOVL BX, -4(AX) // set calling pc
873898
RET
899+
setbar:
900+
// Set the stack barrier return PC.
901+
MOVL BX, 0(SP)
902+
CALL runtime·setNextBarrierPC(SB)
903+
RET
874904

875905
TEXT runtime·getcallersp(SB), NOSPLIT, $0-8
876906
MOVL argp+0(FP), AX

src/runtime/asm_amd64.s

+32-2
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,22 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
336336
MOVL $0, DX
337337
JMP runtime·morestack(SB)
338338

339+
TEXT runtime·stackBarrier(SB),NOSPLIT,$0
340+
// We came here via a RET to an overwritten return PC.
341+
// AX may be live. Other registers are available.
342+
343+
// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
344+
get_tls(CX)
345+
MOVQ g(CX), CX
346+
MOVQ (g_stkbar+slice_array)(CX), DX
347+
MOVQ g_stkbarPos(CX), BX
348+
IMULQ $stkbar__size, BX // Too big for SIB.
349+
MOVQ stkbar_savedLRVal(DX)(BX*1), BX
350+
// Record that this stack barrier was hit.
351+
ADDQ $1, g_stkbarPos(CX)
352+
// Jump to the original return PC.
353+
JMP BX
354+
339355
// reflectcall: call a function with the given argument list
340356
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
341357
// we don't have variable-sized frames, so we use a small number
@@ -860,17 +876,31 @@ TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
860876
INT $3
861877
RET
862878

863-
TEXT runtime·getcallerpc(SB),NOSPLIT,$0-16
879+
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
864880
MOVQ argp+0(FP),AX // addr of first arg
865881
MOVQ -8(AX),AX // get calling pc
882+
CMPQ AX, runtime·stackBarrierPC(SB)
883+
JNE nobar
884+
// Get original return PC.
885+
CALL runtime·nextBarrierPC(SB)
886+
MOVQ 0(SP), AX
887+
nobar:
866888
MOVQ AX, ret+8(FP)
867889
RET
868890

869-
TEXT runtime·setcallerpc(SB),NOSPLIT,$0-16
891+
TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
870892
MOVQ argp+0(FP),AX // addr of first arg
871893
MOVQ pc+8(FP), BX
894+
MOVQ -8(AX), CX
895+
CMPQ CX, runtime·stackBarrierPC(SB)
896+
JEQ setbar
872897
MOVQ BX, -8(AX) // set calling pc
873898
RET
899+
setbar:
900+
// Set the stack barrier return PC.
901+
MOVQ BX, 0(SP)
902+
CALL runtime·setNextBarrierPC(SB)
903+
RET
874904

875905
TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
876906
MOVQ argp+0(FP), AX

src/runtime/asm_amd64p32.s

+33-2
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,23 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
289289
MOVL $0, DX
290290
JMP runtime·morestack(SB)
291291

292+
TEXT runtime·stackBarrier(SB),NOSPLIT,$0
293+
// We came here via a RET to an overwritten return PC.
294+
// AX may be live. Other registers are available.
295+
296+
// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
297+
get_tls(CX)
298+
MOVL g(CX), CX
299+
MOVL (g_stkbar+slice_array)(CX), DX
300+
MOVL g_stkbarPos(CX), BX
301+
IMULL $stkbar__size, BX // Too big for SIB.
302+
ADDL DX, BX
303+
MOVL stkbar_savedLRVal(BX), BX
304+
// Record that this stack barrier was hit.
305+
ADDL $1, g_stkbarPos(CX)
306+
// Jump to the original return PC.
307+
JMP BX
308+
292309
// reflectcall: call a function with the given argument list
293310
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
294311
// we don't have variable-sized frames, so we use a small number
@@ -616,17 +633,31 @@ TEXT runtime·memclr(SB),NOSPLIT,$0-8
616633
STOSB
617634
RET
618635

619-
TEXT runtime·getcallerpc(SB),NOSPLIT,$0-12
636+
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-12
620637
MOVL argp+0(FP),AX // addr of first arg
621638
MOVL -8(AX),AX // get calling pc
639+
CMPL AX, runtime·stackBarrierPC(SB)
640+
JNE nobar
641+
// Get original return PC.
642+
CALL runtime·nextBarrierPC(SB)
643+
MOVL 0(SP), AX
644+
nobar:
622645
MOVL AX, ret+8(FP)
623646
RET
624647

625-
TEXT runtime·setcallerpc(SB),NOSPLIT,$0-8
648+
TEXT runtime·setcallerpc(SB),NOSPLIT,$8-8
626649
MOVL argp+0(FP),AX // addr of first arg
627650
MOVL pc+4(FP), BX // pc to set
651+
MOVL -8(AX), CX
652+
CMPL CX, runtime·stackBarrierPC(SB)
653+
JEQ setbar
628654
MOVQ BX, -8(AX) // set calling pc
629655
RET
656+
setbar:
657+
// Set the stack barrier return PC.
658+
MOVL BX, 0(SP)
659+
CALL runtime·setNextBarrierPC(SB)
660+
RET
630661

631662
TEXT runtime·getcallersp(SB),NOSPLIT,$0-12
632663
MOVL argp+0(FP), AX

src/runtime/asm_arm.s

+37-4
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,23 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$-4-0
309309
MOVW $0, R7
310310
B runtime·morestack(SB)
311311

312+
TEXT runtime·stackBarrier(SB),NOSPLIT,$0
313+
// We came here via a RET to an overwritten LR.
314+
// R0 may be live. Other registers are available.
315+
316+
// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
317+
MOVW (g_stkbar+slice_array)(g), R4
318+
MOVW g_stkbarPos(g), R5
319+
MOVW $stkbar__size, R6
320+
MUL R5, R6
321+
ADD R4, R6
322+
MOVW stkbar_savedLRVal(R6), R6
323+
// Record that this stack barrier was hit.
324+
ADD $1, R5
325+
MOVW R5, g_stkbarPos(g)
326+
// Jump to the original return PC.
327+
B (R6)
328+
312329
// reflectcall: call a function with the given argument list
313330
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
314331
// we don't have variable-sized frames, so we use a small number
@@ -645,14 +662,30 @@ TEXT setg<>(SB),NOSPLIT,$-4-0
645662
MOVW g, R0
646663
RET
647664

648-
TEXT runtime·getcallerpc(SB),NOSPLIT,$-4-8
649-
MOVW 0(R13), R0
665+
TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
666+
MOVW 8(R13), R0 // LR saved by caller
667+
MOVW runtime·stackBarrierPC(SB), R1
668+
CMP R0, R1
669+
BNE nobar
670+
// Get original return PC.
671+
BL runtime·nextBarrierPC(SB)
672+
MOVW 4(R13), R0
673+
nobar:
650674
MOVW R0, ret+4(FP)
651675
RET
652676

653-
TEXT runtime·setcallerpc(SB),NOSPLIT,$-4-8
677+
TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
654678
MOVW pc+4(FP), R0
655-
MOVW R0, 0(R13)
679+
MOVW 8(R13), R1
680+
MOVW runtime·stackBarrierPC(SB), R2
681+
CMP R1, R2
682+
BEQ setbar
683+
MOVW R0, 8(R13) // set LR in caller
684+
RET
685+
setbar:
686+
// Set the stack barrier return PC.
687+
MOVW R0, 4(R13)
688+
BL runtime·setNextBarrierPC(SB)
656689
RET
657690

658691
TEXT runtime·getcallersp(SB),NOSPLIT,$-4-8

src/runtime/asm_arm64.s

+37-4
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,23 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$-4-0
307307
MOVW $0, R26
308308
B runtime·morestack(SB)
309309

310+
TEXT runtime·stackBarrier(SB),NOSPLIT,$0
311+
// We came here via a RET to an overwritten LR.
312+
// R0 may be live (see return0). Other registers are available.
313+
314+
// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
315+
MOVD (g_stkbar+slice_array)(g), R4
316+
MOVD g_stkbarPos(g), R5
317+
MOVD $stkbar__size, R6
318+
MUL R5, R6
319+
ADD R4, R6
320+
MOVD stkbar_savedLRVal(R6), R6
321+
// Record that this stack barrier was hit.
322+
ADD $1, R5
323+
MOVD R5, g_stkbarPos(g)
324+
// Jump to the original return PC.
325+
B (R6)
326+
310327
// reflectcall: call a function with the given argument list
311328
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
312329
// we don't have variable-sized frames, so we use a small number
@@ -743,14 +760,30 @@ TEXT setg_gcc<>(SB),NOSPLIT,$8
743760
MOVD savedR27-8(SP), R27
744761
RET
745762

746-
TEXT runtime·getcallerpc(SB),NOSPLIT,$-8-16
747-
MOVD 0(RSP), R0
763+
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
764+
MOVD 16(RSP), R0 // LR saved by caller
765+
MOVD runtime·stackBarrierPC(SB), R1
766+
CMP R0, R1
767+
BNE nobar
768+
// Get original return PC.
769+
BL runtime·nextBarrierPC(SB)
770+
MOVD 8(RSP), R0
771+
nobar:
748772
MOVD R0, ret+8(FP)
749773
RET
750774

751-
TEXT runtime·setcallerpc(SB),NOSPLIT,$-8-16
775+
TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
752776
MOVD pc+8(FP), R0
753-
MOVD R0, 0(RSP) // set calling pc
777+
MOVD 16(RSP), R1
778+
MOVD runtime·stackBarrierPC(SB), R2
779+
CMP R1, R2
780+
BEQ setbar
781+
MOVD R0, 16(RSP) // set LR in caller
782+
RET
783+
setbar:
784+
// Set the stack barrier return PC.
785+
MOVD R0, 8(RSP)
786+
BL runtime·setNextBarrierPC(SB)
754787
RET
755788

756789
TEXT runtime·getcallersp(SB),NOSPLIT,$0-16

src/runtime/asm_ppc64x.s

+38-4
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,24 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$-8-0
304304
MOVD R0, R11
305305
BR runtime·morestack(SB)
306306

307+
TEXT runtime·stackBarrier(SB),NOSPLIT,$0
308+
// We came here via a RET to an overwritten LR.
309+
// R3 may be live. Other registers are available.
310+
311+
// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
312+
MOVD (g_stkbar+slice_array)(g), R4
313+
MOVD g_stkbarPos(g), R5
314+
MOVD $stkbar__size, R6
315+
MULLD R5, R6
316+
ADD R4, R6
317+
MOVD stkbar_savedLRVal(R6), R6
318+
// Record that this stack barrier was hit.
319+
ADD $1, R5
320+
MOVD R5, g_stkbarPos(g)
321+
// Jump to the original return PC.
322+
MOVD R6, CTR
323+
BR (CTR)
324+
307325
// reflectcall: call a function with the given argument list
308326
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
309327
// we don't have variable-sized frames, so we use a small number
@@ -883,15 +901,31 @@ TEXT setg_gcc<>(SB),NOSPLIT,$-8-0
883901
MOVD R4, LR
884902
RET
885903

886-
TEXT runtime·getcallerpc(SB),NOSPLIT,$-8-16
887-
MOVD 0(R1), R3
904+
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
905+
MOVD 16(R1), R3 // LR saved by caller
906+
MOVD runtime·stackBarrierPC(SB), R4
907+
CMP R3, R4
908+
BNE nobar
909+
// Get original return PC.
910+
BL runtime·nextBarrierPC(SB)
911+
MOVD 8(R1), R3
912+
nobar:
888913
MOVD R3, ret+8(FP)
889914
RETURN
890915

891-
TEXT runtime·setcallerpc(SB),NOSPLIT,$-8-16
916+
TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
892917
MOVD pc+8(FP), R3
893-
MOVD R3, 0(R1) // set calling pc
918+
MOVD 16(R1), R4
919+
MOVD runtime·stackBarrierPC(SB), R5
920+
CMP R4, R5
921+
BEQ setbar
922+
MOVD R3, 16(R1) // set LR in caller
894923
RETURN
924+
setbar:
925+
// Set the stack barrier return PC.
926+
MOVD R3, 8(R1)
927+
BL runtime·setNextBarrierPC(SB)
928+
RET
895929

896930
TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
897931
MOVD argp+0(FP), R3

src/runtime/mbarrier.go

+17-1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ import "unsafe"
2727
// slot is the destination (dst) in go code
2828
// ptr is the value that goes into the slot (src) in the go code
2929
//
30+
//
31+
// Dealing with memory ordering:
32+
//
3033
// Dijkstra pointed out that maintaining the no black to white
3134
// pointers means that white to white pointers not need
3235
// to be noted by the write barrier. Furthermore if either
@@ -54,7 +57,20 @@ import "unsafe"
5457
// Peterson/Dekker algorithms for mutual exclusion). Rather than require memory
5558
// barriers, which will slow down both the mutator and the GC, we always grey
5659
// the ptr object regardless of the slot's color.
57-
//go:nowritebarrier
60+
//
61+
//
62+
// Stack writes:
63+
//
64+
// The compiler omits write barriers for writes to the current frame,
65+
// but if a stack pointer has been passed down the call stack, the
66+
// compiler will generate a write barrier for writes through that
67+
// pointer (because it doesn't know it's not a heap pointer).
68+
//
69+
// One might be tempted to ignore the write barrier if slot points
70+
// into the stack. Don't do it! Mark termination only re-scans
71+
// frames that have potentially been active since the concurrent scan,
72+
// so it depends on write barriers to track changes to pointers in
73+
// stack frames that have not been active.
//go:nowritebarrier
5874
func gcmarkwb_m(slot *uintptr, ptr uintptr) {
5975
switch gcphase {
6076
default:

0 commit comments

Comments
 (0)