Deployed 2536c93 with MkDocs version: 1.6.1

This commit is contained in:
2026-05-03 11:47:30 +08:00
commit 1ff86b66fc
418 changed files with 617336 additions and 0 deletions
+47
View File
@@ -0,0 +1,47 @@
<svg width="700" height="320" viewBox="0 0 700 320" xmlns="http://www.w3.org/2000/svg" role="img" aria-labelledby="mem-title mem-desc">
  <!--
    Training memory breakdown diagram: one stacked bar with four segments
    (params, gradients, optimizer states, activations), call-out annotations
    to the right, and a worked example in a box along the bottom.

    viewBox mirrors width/height so the drawing scales when sized via CSS.
    title/desc + role="img" give assistive technology a name and a summary.
  -->
  <title id="mem-title">Training Memory Breakdown (per GPU)</title>
  <desc id="mem-desc">Stacked bar showing, from bottom to top: model parameters (1x params), gradients (1x params), Adam optimizer states (2x params), and activations (varies with batch size). Example: 7B params with FP32 Adam needs 7B * 16 bytes = 112 GB just for params, gradients and optimizer states.</desc>
  <defs>
    <!-- Same rounded rectangle as the bar outline; clipping the segments to it
         makes the bar's outer corners follow the outline while the joins
         between segments stay square. -->
    <clipPath id="mem-bar-clip">
      <rect x="150" y="45" width="120" height="230" rx="4"/>
    </clipPath>
  </defs>
  <text x="350" y="22" fill="#333" font-size="14" font-weight="bold" text-anchor="middle">Training Memory Breakdown (per GPU)</text>
  <!-- Stacked bar chart.
       The bar is 230px tall (y=45..275) and is split into 46px units so the
       segment heights match the annotations: params 1 unit, gradients 1 unit,
       Adam states 2 units. The activations band is illustrative only (its
       real size varies) and takes the remaining 46px. -->
  <g clip-path="url(#mem-bar-clip)" opacity="0.7">
    <!-- Parameters (bottom): 1 unit -->
    <rect x="150" y="229" width="120" height="46" fill="#3498db"/>
    <!-- Gradients: 1 unit -->
    <rect x="150" y="183" width="120" height="46" fill="#e74c3c"/>
    <!-- Optimizer states: 2 units -->
    <rect x="150" y="91" width="120" height="92" fill="#9b59b6"/>
    <!-- Activations (top): illustrative height -->
    <rect x="150" y="45" width="120" height="46" fill="#f39c12"/>
  </g>
  <!-- Bar outline, drawn after the fills so its stroke is not tinted by them -->
  <rect x="150" y="45" width="120" height="230" fill="none" stroke="#333" stroke-width="1.5" rx="4"/>
  <!-- Segment labels (kept outside the translucent group so they stay fully opaque) -->
  <text x="210" y="256" fill="white" font-size="11" font-weight="bold" text-anchor="middle">Params</text>
  <text x="210" y="210" fill="white" font-size="11" font-weight="bold" text-anchor="middle">Gradients</text>
  <text x="210" y="134" fill="white" font-size="11" font-weight="bold" text-anchor="middle">Optimizer</text>
  <text x="210" y="148" fill="white" font-size="10" text-anchor="middle">States</text>
  <text x="210" y="72" fill="white" font-size="11" font-weight="bold" text-anchor="middle">Activations</text>
  <!-- Annotations: each leader line starts at the vertical centre of its segment -->
  <line x1="270" y1="252" x2="310" y2="252" stroke="#3498db" stroke-width="1.2"/>
  <text x="320" y="249" fill="#3498db" font-size="10">Model weights: 1x params</text>
  <text x="320" y="263" fill="#3498db" font-size="9">(FP32: 4 bytes each, BF16: 2 bytes)</text>
  <line x1="270" y1="206" x2="310" y2="206" stroke="#e74c3c" stroke-width="1.2"/>
  <text x="320" y="203" fill="#e74c3c" font-size="10">Gradients: 1x params</text>
  <text x="320" y="217" fill="#e74c3c" font-size="9">(same size as weights)</text>
  <line x1="270" y1="137" x2="310" y2="137" stroke="#9b59b6" stroke-width="1.2"/>
  <text x="320" y="129" fill="#9b59b6" font-size="10">Adam states: 2x params</text>
  <text x="320" y="143" fill="#9b59b6" font-size="9">(first moment + second moment)</text>
  <text x="320" y="157" fill="#9b59b6" font-size="9">(always FP32 for stability)</text>
  <line x1="270" y1="68" x2="310" y2="68" stroke="#f39c12" stroke-width="1.2"/>
  <text x="320" y="61" fill="#f39c12" font-size="10">Activations: varies with batch size</text>
  <text x="320" y="75" fill="#f39c12" font-size="9">(saved for backward pass)</text>
  <text x="320" y="89" fill="#f39c12" font-size="9">(reduced by gradient checkpointing)</text>
  <!-- Example calculation: 4 bytes each for weights, gradients, and the two Adam moments -->
  <rect x="100" y="285" width="500" height="30" rx="6" fill="#f5f5f5" stroke="#333" stroke-width="1"/>
  <text x="350" y="305" fill="#333" font-size="10" text-anchor="middle">Example: 7B params, FP32 Adam = 7B * (4+4+4+4) = 112 GB just for params/grads/optimizer</text>
</svg>

After

Width:  |  Height:  |  Size: 2.8 KiB