
Commit 21f9bff

Merge pull request #225 from otaviogood/grad_accum
Fix for gradient_accumulation_steps training slow
2 parents d9f4735 + a6a708c

File tree

3 files changed: +9 -4 lines


Diff for: config/train_gpt2.py (+1 -1)

@@ -10,7 +10,7 @@
 # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
 batch_size = 12
 block_size = 1024
-gradient_accumulation_steps = 5
+gradient_accumulation_steps = 5 * 8

 # this makes total number of tokens be 300B
 max_iters = 600000
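A quick check of the arithmetic quoted in the config comment (a minimal sketch in Python, not part of the commit): with the new convention, gradient_accumulation_steps is the total across all GPUs, so the tokens processed per optimizer step come out to the 491,520 in the comment.

# sanity check of the tokens-per-iteration figure quoted in the config comment
batch_size = 12
block_size = 1024
gradient_accumulation_steps = 5 * 8   # now the total across all 8 GPUs
tokens_per_iter = gradient_accumulation_steps * batch_size * block_size
print(f"{tokens_per_iter:,}")         # 491,520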

Diff for: config/train_shakespeare_char.py (+1)

@@ -14,6 +14,7 @@
 wandb_run_name = 'mini-gpt'

 dataset = 'shakespeare_char'
+gradient_accumulation_steps = 1
 batch_size = 64
 block_size = 256 # context of up to 256 previous characters
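
Since the default in train.py now assumes 8 GPUs (5 * 8), this single-GPU config overrides it back to 1. A quick check of the resulting tokens per iteration, using the formula added to train.py below (assuming ddp_world_size = 1):

# values taken from this config; ddp_world_size assumed to be 1 on a single GPU
tokens_per_iter = 1 * 1 * 64 * 256   # grad_accum * world_size * batch_size * block_size
print(tokens_per_iter)               # 16384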

Diff for: train.py (+7 -3)

@@ -45,7 +45,7 @@
 wandb_run_name = 'gpt2' # 'run' + str(time.time())
 # data
 dataset = 'openwebtext'
-gradient_accumulation_steps = 5 # used to simulate larger batch sizes
+gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
 batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
 block_size = 1024
 # model

@@ -84,16 +84,20 @@
     init_process_group(backend=backend)
     ddp_rank = int(os.environ['RANK'])
     ddp_local_rank = int(os.environ['LOCAL_RANK'])
+    ddp_world_size = int(os.environ['WORLD_SIZE'])
     device = f'cuda:{ddp_local_rank}'
     torch.cuda.set_device(device)
     master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
     seed_offset = ddp_rank # each process gets a different seed
+    assert gradient_accumulation_steps % torch.cuda.device_count() == 0
+    gradient_accumulation_steps //= torch.cuda.device_count()
 else:
     # if not ddp, we are running on a single gpu, and one process
     master_process = True
     seed_offset = 0
-    gradient_accumulation_steps *= 8 # simulate 8 gpus
-print("total number of tokens per iteration:", batch_size * block_size * gradient_accumulation_steps)
+    ddp_world_size = 1
+tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
+print(f"tokens per iteration will be: {tokens_per_iter:,}")

 if master_process:
     os.makedirs(out_dir, exist_ok=True)
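To summarize the behavioral change: gradient_accumulation_steps is now interpreted as a global value and divided across the GPUs, so the effective batch size no longer silently multiplies with the number of processes. A minimal sketch of that rule (split_grad_accum is a hypothetical helper, not code from the commit, but it uses the same divisibility check as the assert in the diff):

# minimal sketch of the new scaling rule; not code from the commit
def split_grad_accum(total_steps: int, num_gpus: int) -> int:
    # the configured value is the total across all GPUs; each rank runs its share
    assert total_steps % num_gpus == 0
    return total_steps // num_gpus

per_rank = split_grad_accum(5 * 8, num_gpus=8)   # -> 5 micro-steps per GPU
tokens_per_iter = per_rank * 8 * 12 * 1024       # grad_accum * world_size * batch * block
print(per_rank, f"{tokens_per_iter:,}")          # 5 491,520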
