maths-cs-ai-compendium-zh/images/audio_spectrogram_transformer.svg

<svg xmlns="http://www.w3.org/2000/svg" width="700" height="290" viewBox="0 0 700 290" role="img" aria-labelledby="title-ast desc-ast" font-family="Arial, Helvetica, sans-serif">
  <!--
    Audio Spectrogram Transformer (AST) pipeline diagram.

    Layout: a single left-to-right row of stages centred on y=92, joined by
    arrows that use the #arrow-ast marker; a row of short stage captions at
    y=185; and an explanatory note box along the bottom.

    Conventions used below:
    * viewBox mirrors width/height so the figure scales when it is resized.
    * Translucent tints are written as an opaque hex fill plus fill-opacity
      rather than rgba(), because rgba() is not part of the SVG 1.1 colour
      syntax and some non-browser renderers ignore it.
    * The title/desc pair below gives the figure an accessible name.
  -->
  <title id="title-ast">Audio Spectrogram Transformer (AST)</title>
  <desc id="desc-ast">Left-to-right pipeline: a mel spectrogram drawn as a patch grid, flattened into a patch sequence, linearly projected with positional embeddings and a prepended [CLS] token, passed through a Transformer encoder of L layers, then a classification head reading from [CLS] that outputs a predicted label.</desc>

  <defs>
    <!-- Arrow head; refX=8 puts the tip exactly on the end point of the line. -->
    <marker id="arrow-ast" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <polygon points="0,0 8,3 0,6" fill="#333"/>
    </marker>
  </defs>

  <!-- Visible heading -->
  <text x="350" y="22" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">Audio Spectrogram Transformer (AST)</text>

  <!-- Stage 1: mel spectrogram, an 80x100 box divided into 20x20 cells -->
  <rect x="20" y="42" width="80" height="100" rx="4" fill="#f39c12" fill-opacity="0.08" stroke="#f39c12" stroke-width="1.2"/>
  <text x="60" y="155" text-anchor="middle" font-size="8" fill="#333">Mel Spectrogram</text>
  <!-- Faint grid: four horizontal and three vertical dividers.
       stroke-opacity is inherited, so each line is blended individually. -->
  <g stroke="#f39c12" stroke-width="0.4" stroke-opacity="0.4">
    <line x1="20" y1="62" x2="100" y2="62"/>
    <line x1="20" y1="82" x2="100" y2="82"/>
    <line x1="20" y1="102" x2="100" y2="102"/>
    <line x1="20" y1="122" x2="100" y2="122"/>
    <line x1="40" y1="42" x2="40" y2="142"/>
    <line x1="60" y1="42" x2="60" y2="142"/>
    <line x1="80" y1="42" x2="80" y2="142"/>
  </g>
  <!-- A few darker cells hint that the grid is cut into separate patches -->
  <g fill="#f39c12" fill-opacity="0.15">
    <rect x="20" y="42" width="20" height="20"/>
    <rect x="60" y="62" width="20" height="20"/>
    <rect x="40" y="102" width="20" height="20"/>
    <rect x="80" y="82" width="20" height="20"/>
    <rect x="20" y="122" width="20" height="20"/>
  </g>
  <text x="60" y="167" text-anchor="middle" font-size="7" fill="#666">(patch grid)</text>

  <line x1="100" y1="92" x2="125" y2="92" stroke="#333" stroke-width="1.2" marker-end="url(#arrow-ast)"/>

  <!-- Stage 2: patches flattened into a sequence -->
  <rect x="128" y="52" width="50" height="80" rx="4" fill="#3498db" fill-opacity="0.08" stroke="#3498db" stroke-width="1.2"/>
  <text x="153" y="86" text-anchor="middle" font-size="8" fill="#333">Flatten</text>
  <text x="153" y="96" text-anchor="middle" font-size="8" fill="#333">Patches</text>
  <!-- Two bars above and two below the caption stand in for the sequence -->
  <rect x="137" y="60" width="32" height="8" rx="1" fill="#f39c12" fill-opacity="0.2" stroke="#f39c12" stroke-width="0.5"/>
  <rect x="137" y="70" width="32" height="8" rx="1" fill="#f39c12" fill-opacity="0.15" stroke="#f39c12" stroke-width="0.5"/>
  <rect x="137" y="105" width="32" height="8" rx="1" fill="#f39c12" fill-opacity="0.2" stroke="#f39c12" stroke-width="0.5"/>
  <rect x="137" y="115" width="32" height="8" rx="1" fill="#f39c12" fill-opacity="0.15" stroke="#f39c12" stroke-width="0.5"/>

  <line x1="178" y1="92" x2="203" y2="92" stroke="#333" stroke-width="1.2" marker-end="url(#arrow-ast)"/>

  <!-- Stage 3: linear projection plus positional embedding (box spans x=206..281) -->
  <rect x="206" y="55" width="75" height="74" rx="6" fill="#3498db" fill-opacity="0.12" stroke="#3498db" stroke-width="1.5"/>
  <text x="243" y="78" text-anchor="middle" font-size="8" fill="#333">Linear</text>
  <text x="243" y="89" text-anchor="middle" font-size="8" fill="#333">Projection</text>
  <text x="243" y="103" text-anchor="middle" font-size="7" fill="#3498db">+ pos emb</text>

  <!-- [CLS] token chip. The chip sits near the left edge of the stage box and
       the "prepended" label is end-anchored just inside the right edge, so the
       label cannot spill past the box border whatever the fallback font width. -->
  <rect x="210" y="112" width="28" height="14" rx="3" fill="#27ae60" fill-opacity="0.15" stroke="#27ae60" stroke-width="1"/>
  <text x="224" y="122" text-anchor="middle" font-size="7" font-weight="bold" fill="#27ae60">[CLS]</text>
  <line x1="224" y1="112" x2="224" y2="105" stroke="#27ae60" stroke-width="0.8" stroke-dasharray="2,2"/>
  <text x="277" y="122" text-anchor="end" font-size="7" fill="#666">prepended</text>

  <line x1="281" y1="92" x2="310" y2="92" stroke="#333" stroke-width="1.2" marker-end="url(#arrow-ast)"/>

  <!-- Stage 4: Transformer encoder -->
  <rect x="313" y="42" width="130" height="100" rx="6" fill="#9b59b6" fill-opacity="0.1" stroke="#9b59b6" stroke-width="1.8"/>
  <text x="378" y="65" text-anchor="middle" font-size="10" font-weight="bold" fill="#333">Transformer</text>
  <text x="378" y="78" text-anchor="middle" font-size="10" font-weight="bold" fill="#333">Encoder</text>

  <!-- Inner rows: the contents of one layer, and the layer-count note -->
  <rect x="330" y="88" width="96" height="16" rx="3" fill="#9b59b6" fill-opacity="0.1" stroke="#9b59b6" stroke-width="0.8"/>
  <text x="378" y="100" text-anchor="middle" font-size="7" fill="#666">Multi-Head Attn + FFN</text>
  <rect x="330" y="108" width="96" height="16" rx="3" fill="#9b59b6" fill-opacity="0.1" stroke="#9b59b6" stroke-width="0.8"/>
  <text x="378" y="120" text-anchor="middle" font-size="7" fill="#666">x L layers</text>

  <line x1="443" y1="92" x2="472" y2="92" stroke="#333" stroke-width="1.2" marker-end="url(#arrow-ast)"/>

  <!-- Stage 5: classification head -->
  <rect x="475" y="62" width="80" height="60" rx="6" fill="#27ae60" fill-opacity="0.12" stroke="#27ae60" stroke-width="1.5"/>
  <text x="515" y="82" text-anchor="middle" font-size="9" fill="#333">Classification</text>
  <text x="515" y="95" text-anchor="middle" font-size="9" fill="#333">Head</text>
  <text x="515" y="110" text-anchor="middle" font-size="7" fill="#666">(from [CLS])</text>

  <line x1="555" y1="92" x2="580" y2="92" stroke="#333" stroke-width="1.2" marker-end="url(#arrow-ast)"/>

  <!-- Stage 6: predicted label -->
  <rect x="583" y="72" width="75" height="40" rx="6" fill="#e74c3c" fill-opacity="0.1" stroke="#e74c3c" stroke-width="1.5"/>
  <text x="620" y="90" text-anchor="middle" font-size="9" fill="#333">Predicted</text>
  <text x="620" y="103" text-anchor="middle" font-size="9" fill="#333">Label</text>

  <!-- One-word captions, each centred under its stage and tinted to match it -->
  <text x="60" y="185" text-anchor="middle" font-size="7" fill="#f39c12">2D input</text>
  <text x="153" y="185" text-anchor="middle" font-size="7" fill="#3498db">sequence</text>
  <text x="243" y="185" text-anchor="middle" font-size="7" fill="#3498db">embed</text>
  <text x="378" y="185" text-anchor="middle" font-size="7" fill="#9b59b6">encode</text>
  <text x="515" y="185" text-anchor="middle" font-size="7" fill="#27ae60">classify</text>

  <!-- Explanatory note box -->
  <rect x="30" y="200" width="640" height="55" rx="6" fill="#f5f5f5" stroke="#333" stroke-width="1"/>
  <text x="50" y="220" font-size="10" font-weight="bold" fill="#333">AST adapts ViT (chapter 08) directly to audio spectrograms</text>
  <text x="50" y="236" font-size="9" fill="#666">No convolutions needed. The spectrogram is split into 16x16 patches, linearly projected, and processed</text>
  <text x="50" y="248" font-size="9" fill="#666">by a standard Transformer encoder. Pre-training on ImageNet transfers surprisingly well to audio.</text>
</svg>