How to Speed Up Transformer Training Using NVIDIA Apex (FusedAdam, FusedLayerNorm) and Native torch.amp

print("\n### SECTION D: end-to-end Transformer (vanilla fp32 vs Apex fused + AMP) ###")

# Hyper-parameters for the end-to-end benchmark below.
VOCAB = 2000   # token id range; also the width of the output projection
D = 256        # embedding / model width
NHEAD = 4      # attention heads per block
LAYERS = 4     # number of stacked Transformer blocks
SEQ = 128      # tokens per training sequence
BATCH = 32     # sequences per batch
STEPS = 60     # timed optimizer steps per configuration
class Block(torch.nn.Module):
    """Pre-norm Transformer block: self-attention, then an MLP, each wrapped in a residual.

    Args:
        d: Model width used by the attention, MLP and normalization layers.
        nhead: Number of attention heads.
        norm_cls: Callable invoked as ``norm_cls(d)`` to build each normalization layer.
    """

    def __init__(self, d, nhead, norm_cls):
        super().__init__()
        hidden = 4 * d
        # Creation order (attn, ff, n1, n2) is kept stable so seeded
        # initialization and state_dict keys do not change.
        self.attn = torch.nn.MultiheadAttention(d, nhead, batch_first=True)
        self.ff = torch.nn.Sequential(
            torch.nn.Linear(d, hidden),
            torch.nn.GELU(),
            torch.nn.Linear(hidden, d),
        )
        self.n1 = norm_cls(d)
        self.n2 = norm_cls(d)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        normed = self.n1(x)
        # need_weights=False: only the attention output is used, not the weights.
        attn_out = self.attn(normed, normed, normed, need_weights=False)[0]
        x = x + attn_out
        mlp_out = self.ff(self.n2(x))
        return x + mlp_out
class TinyTransformer(torch.nn.Module):
    """Small token model: embedding -> stacked Blocks -> final norm -> vocab projection.

    Sizes are taken from the module-level constants VOCAB, D, NHEAD and LAYERS.
    The forward pass adds no positional encoding and passes no attention mask.

    Args:
        norm_cls: Callable invoked as ``norm_cls(width)`` for every normalization
            layer, both inside each Block and before the output head.
    """

    def __init__(self, norm_cls):
        super().__init__()
        self.emb = torch.nn.Embedding(VOCAB, D)
        self.blocks = torch.nn.ModuleList()
        for _ in range(LAYERS):
            self.blocks.append(Block(D, NHEAD, norm_cls))
        self.norm = norm_cls(D)
        self.head = torch.nn.Linear(D, VOCAB)

    def forward(self, idx):
        """Map token ids ``idx`` to per-position logits over the vocabulary."""
        hidden = self.emb(idx)
        for layer in self.blocks:
            hidden = layer(hidden)
        logits = self.head(self.norm(hidden))
        return logits
# Fixed synthetic batch: a seeded CPU generator makes the token ids reproducible
# before they are moved to the benchmark device.
g = torch.Generator(device="cpu")
g.manual_seed(0)
data = torch.randint(0, VOCAB, (BATCH, SEQ + 1), generator=g)
data = data.to(DEV)
# Next-token objective: targets are the inputs shifted by one position.
inp = data[:, :-1]
tgt = data[:, 1:]
lossfn = torch.nn.CrossEntropyLoss()
def run_training(use_apex):
    """Train a fresh TinyTransformer on the fixed batch and time STEPS optimizer steps.

    Args:
        use_apex: When true, run under fp16 autocast with gradient scaling and use
            FusedLayerNorm / FusedAdam wherever APEX_OK and the matching flag
            (HAS_FLN / HAS_AMP_C) are set; each component falls back to its
            PyTorch equivalent independently. When false, run without autocast
            using nn.LayerNorm and AdamW.

    Returns:
        Tuple ``(final_loss, tokens_per_second, elapsed_seconds)`` measured over
        the STEPS timed iterations (the 5 warm-up steps are excluded).

    Note:
        Timing relies on ``torch.cuda.synchronize()``, so a CUDA device is assumed.
    """
    torch.manual_seed(0)  # same initial weights for every configuration
    apex_enabled = use_apex and APEX_OK
    norm_cls = FusedLayerNorm if (apex_enabled and HAS_FLN) else torch.nn.LayerNorm
    model = TinyTransformer(norm_cls).to(DEV)

    # FusedAdam defaults to weight_decay=0 whereas AdamW defaults to 0.01. Pin the
    # same value on both so the two runs optimize the same objective and their
    # final losses are comparable.
    lr, weight_decay = 3e-4, 0.01
    if apex_enabled and HAS_AMP_C:
        optimizer = FusedAdam(model.parameters(), lr=lr, weight_decay=weight_decay,
                              adam_w_mode=True)
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr,
                                      weight_decay=weight_decay)

    # A disabled scaler is a pass-through, so one code path serves both modes.
    scaler = torch.amp.GradScaler("cuda", enabled=use_apex)

    def one_step():
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast("cuda", dtype=torch.float16, enabled=use_apex):
            logits = model(inp)
            loss = lossfn(logits.reshape(-1, VOCAB), tgt.reshape(-1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # Detach so the caller does not keep the autograd graph alive.
        return loss.detach()

    # Warm-up: exclude one-time startup costs from the measurement.
    for _ in range(5):
        one_step()

    # CUDA kernels launch asynchronously; synchronize on both sides of the timer.
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(STEPS):
        loss = one_step()
    torch.cuda.synchronize()
    dt = time.perf_counter() - t0
    return loss.item(), (STEPS * BATCH * SEQ) / dt, dt
# Baseline first, then the Apex + AMP configuration on the same seed and data.
loss_v, tps_v, dt_v = run_training(use_apex=False)
print(f"  vanilla (fp32, nn.LayerNorm, AdamW)        : "
      f"{dt_v:5.2f}s  | {tps_v:9.0f} tok/s | final loss {loss_v:.3f}")
if APEX_OK and (HAS_AMP_C or HAS_FLN):
    loss_a, tps_a, dt_a = run_training(use_apex=True)
    # run_training falls back per component, so this branch can run with only one
    # fused kernel available. Name what was actually used instead of always
    # claiming both. With both present the line is identical to the old output.
    _norm_name = "FusedLayerNorm" if HAS_FLN else "nn.LayerNorm"
    _opt_name = "FusedAdam" if HAS_AMP_C else "AdamW"
    _apex_label = f"  apex   (fp16, {_norm_name}, {_opt_name})"
    # Pad to column 45 so the colon lines up with the vanilla row.
    print(f"{_apex_label:<45}: "
          f"{dt_a:5.2f}s  | {tps_a:9.0f} tok/s | final loss {loss_a:.3f}")
    print(f"  ----> speedup: {tps_a / tps_v:0.2f}x throughput")
else:
    print("  apex path SKIPPED (no fused kernels built)")
print("\n" + "=" * 78)
print("DONE. Key takeaways:")
print("  - FusedAdam/FusedLayerNorm/FusedRMSNorm are the still-relevant Apex pieces;")
print("    speedups grow with model size & parameter count (tiny demo understates it).")
print("  - apex.amp is deprecated -> prefer torch.amp.autocast + torch.amp.GradScaler.")
print("  - FusedAdam composes cleanly with native torch.amp (Section D).")
print("  - On real workloads, also try a larger model and bf16 autocast (no scaler needed).")
print("=" * 78)

Source link