3 files changed, +9 -4 lines changed
File 1 of 3:

@@ -10,7 +10,7 @@
 # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
 batch_size = 12
 block_size = 1024
-gradient_accumulation_steps = 5
+gradient_accumulation_steps = 5 * 8

 # this makes total number of tokens be 300B
 max_iters = 600000
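The arithmetic in the comment above can be sanity-checked directly, and the 300B figure follows from max_iters; a quick check in plain Python, with the values copied from this config and the 8 GPUs being the assumed DDP world size:

# tokens processed per optimizer step: micro-batch * context length * grad-accum steps * GPUs
tokens_per_iter = 12 * 1024 * 5 * 8
assert tokens_per_iter == 491_520

# over the full run this comes to ~295B tokens, i.e. roughly the quoted 300B
total_tokens = tokens_per_iter * 600_000
print(f"{total_tokens:,}")  # 294,912,000,000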
File 2 of 3:

@@ -14,6 +14,7 @@
 wandb_run_name = 'mini-gpt'

 dataset = 'shakespeare_char'
+gradient_accumulation_steps = 1
 batch_size = 64
 block_size = 256 # context of up to 256 previous characters

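The explicit gradient_accumulation_steps = 1 keeps this tiny character-level config from inheriting the new 5 * 8 default set in the training script (third file below). For scale, its per-iteration token count, assuming a single GPU:

# grad-accum steps * world size * micro-batch * context length
tokens_per_iter = 1 * 1 * 64 * 256
print(f"{tokens_per_iter:,}")  # 16,384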
File 3 of 3:

@@ -45,7 +45,7 @@
 wandb_run_name = 'gpt2' # 'run' + str(time.time())
 # data
 dataset = 'openwebtext'
-gradient_accumulation_steps = 5 # used to simulate larger batch sizes
+gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
 batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
 block_size = 1024
 # model
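Note the changed meaning of this default: the configured number is now the accumulation count for the whole run (8 GPUs assumed) rather than per process, and single-GPU runs no longer multiply by 8 to "simulate 8 gpus" (the removed line in the next hunk). Either way an 8-GPU run ends up doing 5 micro-steps per process; a rough before/after check, with a world size of 8 taken as an assumption:

# old scheme: config held the per-process count, each DDP process used it directly
per_process_old = 5

# new scheme: config holds the global count, divided across local GPUs at startup
per_process_new = (5 * 8) // 8   # 8 = assumed number of local GPUs

assert per_process_old == per_process_new == 5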
@@ -84,16 +84,20 @@
     init_process_group(backend=backend)
     ddp_rank = int(os.environ['RANK'])
     ddp_local_rank = int(os.environ['LOCAL_RANK'])
+    ddp_world_size = int(os.environ['WORLD_SIZE'])
     device = f'cuda:{ddp_local_rank}'
     torch.cuda.set_device(device)
     master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
     seed_offset = ddp_rank # each process gets a different seed
+    assert gradient_accumulation_steps % torch.cuda.device_count() == 0
+    gradient_accumulation_steps //= torch.cuda.device_count()
 else:
     # if not ddp, we are running on a single gpu, and one process
     master_process = True
     seed_offset = 0
-    gradient_accumulation_steps *= 8 # simulate 8 gpus
-print("total number of tokens per iteration:", batch_size * block_size * gradient_accumulation_steps)
+    ddp_world_size = 1
+tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
+print(f"tokens per iteration will be: {tokens_per_iter:,}")

 if master_process:
     os.makedirs(out_dir, exist_ok=True)
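Putting the pieces together: under DDP the global accumulation count from the config is split evenly across the local GPUs (the assert rejects values that don't divide cleanly), and tokens_per_iter now reports the true global batch in tokens. A standalone sketch of the same bookkeeping, with the world size and device count hard-coded for illustration rather than read from the environment or torch:

batch_size = 12
block_size = 1024
gradient_accumulation_steps = 5 * 8   # global value from the config

ddp = True
device_count = 8                      # stand-in for torch.cuda.device_count()
if ddp:
    ddp_world_size = 8                # stand-in for int(os.environ['WORLD_SIZE'])
    assert gradient_accumulation_steps % device_count == 0
    gradient_accumulation_steps //= device_count   # -> 5 micro-steps per process
else:
    ddp_world_size = 1

tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")   # 491,520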