Skip to content

Commit 6922b5b

Browse files
committed
Move GC debugging bits to a separate file
1 parent f6c6aa0 commit 6922b5b

File tree

3 files changed

+532
-530
lines changed

3 files changed

+532
-530
lines changed

src/Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ $(julia_flisp.boot): julia-parser.scm julia-syntax.scm \
8484
$(BUILDDIR)/ast.o $(BUILDDIR)/ast.dbg.obj: $(BUILDDIR)/julia_flisp.boot.inc flisp/*.h
8585
$(BUILDDIR)/codegen.o $(BUILDDIR)/codegen.dbg.obj: intrinsics.cpp cgutils.cpp ccall.cpp abi_*.cpp
8686
$(BUILDDIR)/builtins.o $(BUILDDIR)/builtins.dbg.obj: table.c
87+
$(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: gc-debug.c
8788

8889
$(BUILDDIR)/support/libsupport.a: support/*.h support/*.c
8990
$(MAKE) -C support BUILDDIR='$(abspath $(BUILDDIR)/support)'

src/gc-debug.c

+331
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,331 @@
1+
// This file is a part of Julia. License is MIT: http://julialang.org/license
2+
3+
#ifdef GC_DEBUG_ENV
4+
#include <inttypes.h>
5+
#include <stdio.h>
6+
#endif
7+
8+
void jl_(void *jl_value);
9+
10+
// mark verification
11+
#ifdef GC_VERIFY
12+
static jl_value_t* lostval = 0;
13+
static arraylist_t lostval_parents;
14+
static arraylist_t lostval_parents_done;
15+
static int verifying;
16+
17+
// Queue `parent` in lostval_parents unless it is already known.
//
// `parent` is skipped when it appears in lostval_parents_done (already
// searched for) or in lostval_parents (already queued), so each object is
// recorded at most once.
static void add_lostval_parent(jl_value_t* parent)
{
    for (int i = 0; i < lostval_parents_done.len; i++) {
        if ((jl_value_t*)lostval_parents_done.items[i] == parent)
            return;
    }
    for (int i = 0; i < lostval_parents.len; i++) {
        if ((jl_value_t*)lostval_parents.items[i] == parent)
            return;
    }
    arraylist_push(&lostval_parents, parent);
}
29+
30+
// Report on JL_STDOUT when `v` is the value currently being searched for
// (lostval) and is non-null: prints the address, the source location of the
// macro use, and the type of `v`.
//
// `v` is evaluated more than once, so it must not have side effects.
// NOTE(review): the expansion ends in `;` after `while(0)`, so it is already
// a complete statement. Kept as-is because call sites are outside this file.
#define verify_val(v) do {                                              \
        if (lostval == (jl_value_t*)(v) && (v) != 0) {                  \
            jl_printf(JL_STDOUT,                                        \
                      "Found lostval %p at %s:%d oftype: ",             \
                      (void*)(lostval), __FILE__, __LINE__);            \
            jl_static_show(JL_STDOUT, jl_typeof(v));                    \
            jl_printf(JL_STDOUT, "\n");                                 \
        }                                                               \
    } while(0);
39+
40+
41+
// Report a parent of the value currently being searched for (lostval).
//
// If the pointer stored at `slot` equals lostval and `obj` is not lostval
// itself, print a description to JL_STDOUT and record `obj` through
// add_lostval_parent().
//   ty   - pointer-sized tag printed with %p as the first "Found parent" field
//   obj  - object that owns the slot
//   slot - address of the field being checked (read as jl_value_t**)
//   ...  - printf-style format string and arguments describing the slot;
//          at least the format string is required
//
// `obj` and `slot` are evaluated more than once, so they must not have side
// effects. Uses standard C99 variadic macro syntax rather than the GNU-only
// `args...` form.
// NOTE(review): the expansion ends in `;` after `while(0)`; kept as-is
// because call sites are outside this file.
#define verify_parent(ty, obj, slot, ...) do {                          \
        if (*(jl_value_t**)(slot) == lostval &&                         \
            (jl_value_t*)(obj) != lostval) {                            \
            jl_printf(JL_STDOUT, "Found parent %p %p at %s:%d\n",       \
                      (void*)(ty), (void*)(obj), __FILE__, __LINE__);   \
            jl_printf(JL_STDOUT, "\tloc %p : ", (void*)(slot));         \
            jl_printf(JL_STDOUT, __VA_ARGS__);                          \
            jl_printf(JL_STDOUT, "\n");                                 \
            jl_printf(JL_STDOUT, "\ttype: ");                           \
            jl_static_show(JL_STDOUT, jl_typeof(obj));                  \
            jl_printf(JL_STDOUT, "\n");                                 \
            add_lostval_parent((jl_value_t*)(obj));                     \
        }                                                               \
    } while(0);

// Fixed-arity spellings of verify_parent(), matching the no-op versions
// defined when GC_VERIFY is off.
#define verify_parent1(ty,obj,slot,arg1) verify_parent(ty,obj,slot,arg1)
#define verify_parent2(ty,obj,slot,arg1,arg2) verify_parent(ty,obj,slot,arg1,arg2)
58+
59+
/*
60+
How to debug a missing write barrier :
61+
(or rather how I do it, if you know of a better way update this)
62+
First, reproduce it with GC_VERIFY. It does change the allocation profile so if the error
63+
is rare enough this may not be straightforward. If the backtracking goes well you should know
64+
which object and which of its slots was written to without being caught by the write
65+
barrier. Most times this allows you to take a guess. If this type of object is modified
66+
by C code directly, look for missing jl_gc_wb() on pointer updates. Be aware that there are
67+
innocent looking functions which allocate (and thus trigger marking) only on special cases.
68+
69+
If you can't find it, you can try the following:
70+
- Ensure that should_timeout() is deterministic instead of clock based.
71+
- Once you have a completely deterministic program which crashes on gc_verify, the addresses
72+
should stay constant between different runs (with same binary, same environment ...).
73+
Do not forget to turn off ASLR (linux: echo 0 > /proc/sys/kernel/randomize_va_space).
74+
At this point you should be able to run under gdb and use a hw watch to look for writes
75+
at the exact addr of the slot (use something like watch *slot_addr if *slot_addr == val).
76+
- If it went well you are now stopped at the exact point the problem is happening.
77+
Backtraces in JIT'd code won't work for me (but I'm not sure they should) so in that
78+
case you can try to jl_throw(something) from gdb.
79+
*/
80+
// this does not yet detect missing writes from marked to marked_noesc
81+
// the error is caught at the first long collection
82+
static arraylist_t bits_save[4];
83+
84+
// set all mark bits to bits
85+
// record the state of the region and can replay it in restore()
86+
// restore _must_ be called as this will overwrite parts of the
87+
// freelist in pools
88+
static void clear_mark(int bits)
89+
{
90+
gcval_t *pv;
91+
if (!verifying) {
92+
for (int i = 0; i < 4; i++) {
93+
bits_save[i].len = 0;
94+
}
95+
}
96+
void *current_heap = NULL;
97+
bigval_t *bigs[2];
98+
bigs[0] = big_objects;
99+
bigs[1] = big_objects_marked;
100+
for (int i = 0; i < 2; i++) {
101+
bigval_t *v = bigs[i];
102+
while (v != NULL) {
103+
void* gcv = &v->header;
104+
if (!verifying) arraylist_push(&bits_save[gc_bits(gcv)], gcv);
105+
gc_bits(gcv) = bits;
106+
v = v->next;
107+
}
108+
}
109+
for (int h = 0; h < REGION_COUNT; h++) {
110+
region_t* region = regions[h];
111+
if (!region) break;
112+
for (int pg_i = 0; pg_i < REGION_PG_COUNT/32; pg_i++) {
113+
uint32_t line = region->freemap[pg_i];
114+
if (!!~line) {
115+
for (int j = 0; j < 32; j++) {
116+
if (!((line >> j) & 1)) {
117+
gcpage_t *pg = page_metadata(&region->pages[pg_i*32 + j][0] + GC_PAGE_OFFSET);
118+
pool_t *pool = &norm_pools[pg->pool_n];
119+
pv = (gcval_t*)(pg->data + GC_PAGE_OFFSET);
120+
char *lim = (char*)pv + GC_PAGE_SZ - GC_PAGE_OFFSET - pool->osize;
121+
while ((char*)pv <= lim) {
122+
if (!verifying) arraylist_push(&bits_save[gc_bits(pv)], pv);
123+
gc_bits(pv) = bits;
124+
pv = (gcval_t*)((char*)pv + pool->osize);
125+
}
126+
}
127+
}
128+
}
129+
}
130+
}
131+
}
132+
133+
static void restore(void)
134+
{
135+
for(int b = 0; b < 4; b++) {
136+
for(int i = 0; i < bits_save[b].len; i++) {
137+
gc_bits(bits_save[b].items[i]) = b;
138+
}
139+
}
140+
}
141+
142+
static void gc_verify_track(void)
143+
{
144+
do {
145+
arraylist_push(&lostval_parents_done, lostval);
146+
jl_printf(JL_STDERR, "Now looking for %p =======\n", lostval);
147+
clear_mark(GC_CLEAN);
148+
pre_mark();
149+
post_mark(&finalizer_list, 1);
150+
post_mark(&finalizer_list_marked, 1);
151+
if (lostval_parents.len == 0) {
152+
jl_printf(JL_STDERR, "Could not find the missing link. We missed a toplevel root. This is odd.\n");
153+
break;
154+
}
155+
jl_value_t* lostval_parent = NULL;
156+
for(int i = 0; i < lostval_parents.len; i++) {
157+
lostval_parent = (jl_value_t*)lostval_parents.items[i];
158+
int clean_len = bits_save[GC_CLEAN].len;
159+
for(int j = 0; j < clean_len + bits_save[GC_QUEUED].len; j++) {
160+
void* p = bits_save[j >= clean_len ? GC_QUEUED : GC_CLEAN].items[j >= clean_len ? j - clean_len : j];
161+
if (jl_valueof(p) == lostval_parent) {
162+
lostval = lostval_parent;
163+
lostval_parent = NULL;
164+
break;
165+
}
166+
}
167+
if (lostval_parent != NULL) break;
168+
}
169+
if (lostval_parent == NULL) { // all parents of lostval were also scheduled for deletion
170+
lostval = arraylist_pop(&lostval_parents);
171+
}
172+
else {
173+
jl_printf(JL_STDERR, "Missing write barrier found !\n");
174+
jl_printf(JL_STDERR, "%p was written a reference to %p that was not recorded\n", lostval_parent, lostval);
175+
jl_printf(JL_STDERR, "(details above)\n");
176+
lostval = NULL;
177+
}
178+
restore();
179+
} while(lostval != NULL);
180+
}
181+
182+
static void gc_verify(void)
183+
{
184+
lostval = NULL;
185+
lostval_parents.len = 0;
186+
lostval_parents_done.len = 0;
187+
check_timeout = 0;
188+
clear_mark(GC_CLEAN);
189+
verifying = 1;
190+
pre_mark();
191+
post_mark(&finalizer_list, 1);
192+
post_mark(&finalizer_list_marked, 1);
193+
int clean_len = bits_save[GC_CLEAN].len;
194+
for(int i = 0; i < clean_len + bits_save[GC_QUEUED].len; i++) {
195+
gcval_t* v = (gcval_t*)bits_save[i >= clean_len ? GC_QUEUED : GC_CLEAN].items[i >= clean_len ? i - clean_len : i];
196+
if (gc_marked(v)) {
197+
jl_printf(JL_STDERR, "Error. Early free of 0x%lx type :", (uptrint_t)v);
198+
jl_(jl_typeof(jl_valueof(v)));
199+
jl_printf(JL_STDERR, "val : ");
200+
jl_(jl_valueof(v));
201+
jl_printf(JL_STDERR, "Let's try to backtrack the missing write barrier :\n");
202+
lostval = jl_valueof(v);
203+
break;
204+
}
205+
}
206+
if (lostval == NULL) {
207+
verifying = 0;
208+
restore(); // we did not miss anything
209+
return;
210+
}
211+
restore();
212+
gc_verify_track();
213+
abort();
214+
}
215+
216+
#else
217+
// GC_VERIFY is off: the verification macros expand to nothing and their
// arguments are never evaluated.
#define verify_val(v)
#define verify_parent1(ty,obj,slot,arg1)
#define verify_parent2(ty,obj,slot,arg1,arg2)
220+
#endif
221+
222+
#ifdef GC_DEBUG_ENV
223+
224+
// Counter plus trigger window for one category of debug event.
// gc_debug_alloc_check() fires when num is in [min, max] and
// (num - min) is a multiple of interv.
typedef struct {
    uint64_t num;    // events counted so far

    uint64_t min;    // first count at which the check may fire
    uint64_t interv; // distance between firings; 0 disables the check
    uint64_t max;    // last count at which the check may fire
} AllocNum;
231+
232+
DLLEXPORT struct {
233+
int init;
234+
int sweep_mask;
235+
AllocNum pool;
236+
AllocNum other;
237+
AllocNum print;
238+
} gc_debug_env = {0, GC_MARKED_NOESC,
239+
{0, 0, 0, 0},
240+
{0, 0, 0, 0},
241+
{0, 0, 0, 0}};
242+
243+
// Configure `num` from the environment variable JL_GC_ALLOC_<name>.
//
// The value has the form "min:interv:max" in decimal. Trailing fields may be
// omitted, in which case interv defaults to 1 and max to the largest
// uint64_t value. If the variable is unset, `num` is left untouched.
static void gc_debug_alloc_init(AllocNum *num, const char *name)
{
    // Not very generic and robust but good enough for a
    // debug option
    char buff[128];
    int len = snprintf(buff, sizeof(buff), "JL_GC_ALLOC_%s", name);
    if (len < 0 || (size_t)len >= sizeof(buff))
        return; // name does not fit: do not look up a truncated variable name
    char *env = getenv(buff);
    if (!env)
        return;
    num->interv = 1;
    num->max = (uint64_t)-1ll;
    // Fields that fail to parse keep the values set above, so the result of
    // sscanf is deliberately not checked.
    sscanf(env, "%" SCNu64 ":%" SCNu64 ":%" SCNu64,
           &num->min, &num->interv, &num->max);
}
257+
258+
// Count one event on `num` and report whether the event should trigger.
//
// Returns 1 when the new count lies in [min, max] and is a whole number of
// intervals past min, and 0 otherwise. An interval of 0 disables the check
// and also guards the modulo below. The counter is incremented on every call.
static int gc_debug_alloc_check(AllocNum *num)
{
    num->num++;
    if (num->interv == 0 || num->num < num->min || num->num > num->max)
        return 0;
    return ((num->num - num->min) % num->interv) == 0;
}
265+
266+
static void gc_debug_init()
267+
{
268+
if (__likely(gc_debug_env.init))
269+
return;
270+
gc_debug_env.init = 1;
271+
char *env = getenv("JL_GC_NO_GENERATIONAL");
272+
if (env && strcmp(env, "0") != 0) {
273+
gc_debug_env.sweep_mask = GC_MARKED;
274+
}
275+
gc_debug_alloc_init(&gc_debug_env.pool, "POOL");
276+
gc_debug_alloc_init(&gc_debug_env.other, "OTHER");
277+
gc_debug_alloc_init(&gc_debug_env.print, "PRINT");
278+
}
279+
280+
static inline int gc_debug_check_pool()
281+
{
282+
gc_debug_init();
283+
return gc_debug_alloc_check(&gc_debug_env.pool);
284+
}
285+
286+
static inline int gc_debug_check_other()
287+
{
288+
gc_debug_init();
289+
return gc_debug_alloc_check(&gc_debug_env.other);
290+
}
291+
292+
void gc_debug_print_status()
293+
{
294+
uint64_t pool_count = gc_debug_env.pool.num;
295+
uint64_t other_count = gc_debug_env.other.num;
296+
jl_printf(JL_STDOUT,
297+
"Allocations: %" PRIu64 " "
298+
"(Pool: %" PRIu64 "; Other: %" PRIu64 "); GC: %d\n",
299+
pool_count + other_count, pool_count, other_count,
300+
n_pause);
301+
}
302+
303+
static inline void gc_debug_print()
304+
{
305+
gc_debug_init();
306+
if (!gc_debug_alloc_check(&gc_debug_env.print))
307+
return;
308+
gc_debug_print_status();
309+
}
310+
311+
#else
312+
313+
// GC_DEBUG_ENV is off: the debug hooks compile to no-ops.

// Never triggers.
static inline int gc_debug_check_other(void)
{
    return 0;
}

// Never triggers.
static inline int gc_debug_check_pool(void)
{
    return 0;
}

// Prints nothing.
static inline void gc_debug_print(void)
{
}

// Nothing to initialize.
static inline void gc_debug_init(void)
{
}
330+
331+
#endif

0 commit comments

Comments
 (0)