maths-cs-ai-compendium-zh/images/training_memory_breakdown.svg

<!--
  Training memory breakdown (per GPU): a stacked bar with one segment per
  memory consumer, leader-line annotations to the right, and a worked example
  in the box at the bottom.

  Layout notes:
  * viewBox matches the intrinsic 700x320 size so the image scales when it is
    embedded at a different width/height.
  * Segment heights follow the 1 : 1 : 2 ratio stated in the annotations for
    params : gradients : Adam states (46 : 46 : 92 px). The activations
    segment (46 px) is illustrative only, since its size varies with batch size.
  * All segments are clipped to the rounded outline, so only the outer corners
    of the bar are rounded and no fill pokes past the outline.
-->
<svg width="700" height="320" viewBox="0 0 700 320" xmlns="http://www.w3.org/2000/svg" role="img" aria-labelledby="tmb-title tmb-desc">
  <title id="tmb-title">Training Memory Breakdown (per GPU)</title>
  <desc id="tmb-desc">Stacked bar of per-GPU training memory. From bottom to top: model parameters (1x params), gradients (1x params), Adam optimizer states (2x params), and activations, which vary with batch size. Example: 7B params with FP32 Adam need 112 GB for params, gradients and optimizer states.</desc>

  <defs>
    <!-- Same geometry as the bar outline; used to round only the outer corners of the stack -->
    <clipPath id="tmb-bar-clip">
      <rect x="150" y="45" width="120" height="230" rx="4"/>
    </clipPath>
  </defs>

  <text x="350" y="22" fill="#333" font-size="14" font-weight="bold" text-anchor="middle">Training Memory Breakdown (per GPU)</text>

  <!-- Stacked bar chart: segments span y=45..275 and are listed bottom to top -->
  <g clip-path="url(#tmb-bar-clip)">
    <!-- Parameters (bottom): 1x params -->
    <rect x="150" y="229" width="120" height="46" fill="#3498db" opacity="0.7"/>
    <!-- Gradients: 1x params -->
    <rect x="150" y="183" width="120" height="46" fill="#e74c3c" opacity="0.7"/>
    <!-- Optimizer states: 2x params, so twice the height of the params segment -->
    <rect x="150" y="91" width="120" height="92" fill="#9b59b6" opacity="0.7"/>
    <!-- Activations (top): height is illustrative -->
    <rect x="150" y="45" width="120" height="46" fill="#f39c12" opacity="0.7"/>
  </g>

  <!-- Bar outline, drawn after the fills so its stroke is not covered by them -->
  <rect x="150" y="45" width="120" height="230" fill="none" stroke="#333" stroke-width="1.5" rx="4"/>

  <!-- Segment labels, centred in each segment -->
  <text x="210" y="257" fill="white" font-size="11" font-weight="bold" text-anchor="middle">Params</text>
  <text x="210" y="211" fill="white" font-size="11" font-weight="bold" text-anchor="middle">Gradients</text>
  <text x="210" y="135" fill="white" font-size="11" font-weight="bold" text-anchor="middle">Optimizer</text>
  <text x="210" y="149" fill="white" font-size="10" text-anchor="middle">States</text>
  <text x="210" y="72" fill="white" font-size="11" font-weight="bold" text-anchor="middle">Activations</text>

  <!-- Annotations: each leader line starts at the vertical centre of its segment -->
  <line x1="270" y1="252" x2="310" y2="252" stroke="#3498db" stroke-width="1.2"/>
  <text x="320" y="249" fill="#3498db" font-size="10">Model weights: 1x params</text>
  <text x="320" y="263" fill="#3498db" font-size="9">(FP32: 4 bytes each, BF16: 2 bytes)</text>

  <line x1="270" y1="206" x2="310" y2="206" stroke="#e74c3c" stroke-width="1.2"/>
  <text x="320" y="203" fill="#e74c3c" font-size="10">Gradients: 1x params</text>
  <text x="320" y="217" fill="#e74c3c" font-size="9">(same size as weights)</text>

  <line x1="270" y1="137" x2="310" y2="137" stroke="#9b59b6" stroke-width="1.2"/>
  <text x="320" y="129" fill="#9b59b6" font-size="10">Adam states: 2x params</text>
  <text x="320" y="143" fill="#9b59b6" font-size="9">(first moment + second moment)</text>
  <text x="320" y="157" fill="#9b59b6" font-size="9">(always FP32 for stability)</text>

  <line x1="270" y1="68" x2="310" y2="68" stroke="#f39c12" stroke-width="1.2"/>
  <text x="320" y="60" fill="#f39c12" font-size="10">Activations: varies with batch size</text>
  <text x="320" y="74" fill="#f39c12" font-size="9">(saved for backward pass)</text>
  <text x="320" y="88" fill="#f39c12" font-size="9">(reduced by gradient checkpointing)</text>

  <!-- Example calculation: 4 bytes each for weights, gradients, first moment and second moment -->
  <rect x="100" y="285" width="500" height="30" rx="6" fill="#f5f5f5" stroke="#333" stroke-width="1"/>
  <text x="350" y="305" fill="#333" font-size="10" text-anchor="middle">Example: 7B params, FP32 Adam = 7B * (4+4+4+4) = 112 GB just for params/grads/optimizer</text>
</svg>