Files
maths-cs-ai-compendium-zh/images/text_diffusion.svg
T

106 lines
6.4 KiB
XML

<svg xmlns="http://www.w3.org/2000/svg" width="720" height="240" viewBox="0 0 720 240" role="img" aria-labelledby="tdiff-title tdiff-desc">
  <!--
    Two-row diagram of masked text diffusion.
    Top row (left to right): the forward process replaces tokens with [M] step by step.
    Bottom row (right to left): the reverse process fills the [M] tokens back in.
    A side panel on the right lists the key advantage.
    The viewBox mirrors width/height so the drawing scales instead of clipping when resized.
  -->
  <title id="tdiff-title">Text Diffusion: Forward Corruption and Reverse Denoising</title>
  <desc id="tdiff-desc">Top row, forward process: the sentence "The cat sat on the mat" at t = 0 is progressively replaced by [M] mask tokens at t = T/3 and t = 2T/3 until every token is masked at t = T. Bottom row, reverse process: starting from the fully masked sequence at t = T, tokens are filled in over successive denoising steps until the clean sentence is recovered at t = 0. A side panel notes the key advantage: all tokens are generated in parallel, which enables easy infilling and global planning.</desc>

  <defs>
    <!-- Arrowheads: a 7x5 triangle whose tip (7, 2.5) is the reference point, so the tip lands on the line end. -->
    <!-- Red head for the forward (corruption) arrows; matches their #e74c3c stroke. -->
    <marker id="tdiff-arrow-red" markerWidth="7" markerHeight="5" refX="7" refY="2.5" orient="auto">
      <polygon points="0 0, 7 2.5, 0 5" fill="#e74c3c"/>
    </marker>
    <!-- Green head for the reverse (denoising) arrows; matches their #27ae60 stroke. -->
    <marker id="tdiff-arrow-green" markerWidth="7" markerHeight="5" refX="7" refY="2.5" orient="auto">
      <polygon points="0 0, 7 2.5, 0 5" fill="#27ae60"/>
    </marker>
  </defs>

  <!-- Figure heading -->
  <text x="360" y="22" fill="#333" font-size="14" font-weight="bold" text-anchor="middle">Text Diffusion: Forward Corruption and Reverse Denoising</text>

  <!-- Forward process (top row, read left to right) -->
  <!-- NOTE(review): opacity on the box rects fades the stroke together with the fill; use fill-opacity if a solid border is wanted. -->
  <text x="360" y="48" fill="#e74c3c" font-size="10" font-weight="bold" text-anchor="middle">Forward process: gradually corrupt text</text>

  <!-- t = 0: clean sentence -->
  <g>
    <rect x="20" y="58" width="120" height="55" rx="6" fill="#27ae60" opacity="0.1" stroke="#27ae60" stroke-width="1.5"/>
    <text x="80" y="73" fill="#27ae60" font-size="9" text-anchor="middle" font-weight="bold">t = 0 (clean)</text>
    <text x="80" y="88" fill="#333" font-size="10" text-anchor="middle">The cat sat</text>
    <text x="80" y="103" fill="#333" font-size="10" text-anchor="middle">on the mat</text>
  </g>
  <line x1="140" y1="85" x2="165" y2="85" stroke="#e74c3c" stroke-width="1.5" marker-end="url(#tdiff-arrow-red)"/>

  <!-- t = T/3: two tokens masked -->
  <g>
    <rect x="170" y="58" width="120" height="55" rx="6" fill="#f39c12" opacity="0.1" stroke="#f39c12" stroke-width="1.5"/>
    <text x="230" y="73" fill="#f39c12" font-size="9" text-anchor="middle" font-weight="bold">t = T/3</text>
    <text x="230" y="88" fill="#333" font-size="10" text-anchor="middle">The <tspan fill="#e74c3c">[M]</tspan> sat</text>
    <text x="230" y="103" fill="#333" font-size="10" text-anchor="middle">on <tspan fill="#e74c3c">[M]</tspan> mat</text>
  </g>
  <line x1="290" y1="85" x2="315" y2="85" stroke="#e74c3c" stroke-width="1.5" marker-end="url(#tdiff-arrow-red)"/>

  <!-- t = 2T/3: only one token left unmasked -->
  <g>
    <rect x="320" y="58" width="120" height="55" rx="6" fill="#e74c3c" opacity="0.1" stroke="#e74c3c" stroke-width="1.5"/>
    <text x="380" y="73" fill="#e74c3c" font-size="9" text-anchor="middle" font-weight="bold">t = 2T/3</text>
    <text x="380" y="88" fill="#e74c3c" font-size="10" text-anchor="middle">[M] [M] sat</text>
    <text x="380" y="103" fill="#e74c3c" font-size="10" text-anchor="middle">[M] [M] [M]</text>
  </g>
  <line x1="440" y1="85" x2="465" y2="85" stroke="#e74c3c" stroke-width="1.5" marker-end="url(#tdiff-arrow-red)"/>

  <!-- t = T: every token masked -->
  <g>
    <rect x="470" y="58" width="120" height="55" rx="6" fill="#e74c3c" opacity="0.2" stroke="#e74c3c" stroke-width="2"/>
    <text x="530" y="73" fill="#e74c3c" font-size="9" text-anchor="middle" font-weight="bold">t = T (noise)</text>
    <text x="530" y="88" fill="#e74c3c" font-size="10" text-anchor="middle">[M] [M] [M]</text>
    <text x="530" y="103" fill="#e74c3c" font-size="10" text-anchor="middle">[M] [M] [M]</text>
  </g>

  <!-- Reverse process (bottom row, read right to left) -->
  <text x="360" y="140" fill="#27ae60" font-size="10" font-weight="bold" text-anchor="middle">Reverse process: learned denoising (generation)</text>

  <!-- t = T: start from the fully masked sequence -->
  <g>
    <rect x="470" y="150" width="120" height="55" rx="6" fill="#e74c3c" opacity="0.2" stroke="#e74c3c" stroke-width="2"/>
    <text x="530" y="165" fill="#e74c3c" font-size="9" text-anchor="middle" font-weight="bold">t = T</text>
    <text x="530" y="180" fill="#e74c3c" font-size="10" text-anchor="middle">[M] [M] [M]</text>
    <text x="530" y="195" fill="#e74c3c" font-size="10" text-anchor="middle">[M] [M] [M]</text>
  </g>
  <line x1="470" y1="177" x2="445" y2="177" stroke="#27ae60" stroke-width="1.5" marker-end="url(#tdiff-arrow-green)"/>

  <!-- First denoising step: three tokens filled in -->
  <g>
    <rect x="320" y="150" width="120" height="55" rx="6" fill="#f39c12" opacity="0.1" stroke="#f39c12" stroke-width="1.5"/>
    <text x="380" y="165" fill="#f39c12" font-size="9" text-anchor="middle" font-weight="bold">denoise</text>
    <text x="380" y="180" fill="#333" font-size="10" text-anchor="middle"><tspan fill="#e74c3c">[M]</tspan> cat <tspan fill="#e74c3c">[M]</tspan></text>
    <text x="380" y="195" fill="#333" font-size="10" text-anchor="middle">on the <tspan fill="#e74c3c">[M]</tspan></text>
  </g>
  <line x1="320" y1="177" x2="295" y2="177" stroke="#27ae60" stroke-width="1.5" marker-end="url(#tdiff-arrow-green)"/>

  <!-- Second denoising step: a single mask remains -->
  <g>
    <rect x="170" y="150" width="120" height="55" rx="6" fill="#27ae60" opacity="0.08" stroke="#27ae60" stroke-width="1.5"/>
    <text x="230" y="165" fill="#27ae60" font-size="9" text-anchor="middle" font-weight="bold">denoise</text>
    <text x="230" y="180" fill="#333" font-size="10" text-anchor="middle">The cat sat</text>
    <text x="230" y="195" fill="#333" font-size="10" text-anchor="middle">on the <tspan fill="#e74c3c">[M]</tspan></text>
  </g>
  <line x1="170" y1="177" x2="145" y2="177" stroke="#27ae60" stroke-width="1.5" marker-end="url(#tdiff-arrow-green)"/>

  <!-- t = 0: clean generated output -->
  <g>
    <rect x="20" y="150" width="120" height="55" rx="6" fill="#27ae60" opacity="0.15" stroke="#27ae60" stroke-width="2"/>
    <text x="80" y="165" fill="#27ae60" font-size="9" text-anchor="middle" font-weight="bold">t = 0 (clean)</text>
    <text x="80" y="180" fill="#27ae60" font-size="10" text-anchor="middle" font-weight="bold">The cat sat</text>
    <text x="80" y="195" fill="#27ae60" font-size="10" text-anchor="middle" font-weight="bold">on the mat</text>
  </g>

  <!-- Side panel: key advantage -->
  <rect x="600" y="60" width="110" height="145" rx="6" fill="#f5f5f5" stroke="#ddd" stroke-width="1"/>
  <text x="655" y="82" fill="#333" font-size="9" text-anchor="middle" font-weight="bold">Key advantage</text>
  <text x="655" y="100" fill="#555" font-size="9" text-anchor="middle">All tokens</text>
  <text x="655" y="114" fill="#555" font-size="9" text-anchor="middle">generated in</text>
  <text x="655" y="128" fill="#27ae60" font-size="10" text-anchor="middle" font-weight="bold">parallel</text>
  <!-- Divider between the two panel statements -->
  <line x1="615" y1="140" x2="695" y2="140" stroke="#ddd" stroke-width="1"/>
  <text x="655" y="158" fill="#555" font-size="9" text-anchor="middle">Enables easy</text>
  <text x="655" y="172" fill="#555" font-size="9" text-anchor="middle">infilling and</text>
  <text x="655" y="186" fill="#555" font-size="9" text-anchor="middle">global planning</text>

  <!-- Footer caption -->
  <text x="360" y="232" fill="#666" font-size="10" text-anchor="middle">MDLM uses masking as the corruption; D3PM uses general token transitions.</text>
</svg>