Files
flykhan 2536c937e3 feat: 完整中文翻译 maths-cs-ai-compendium(数学·计算机科学·AI 知识大全)
翻译自英文原版 maths-cs-ai-compendium,共 20 章全部完成。

第01章 向量 | 第02章 矩阵 | 第03章 微积分
第04章 统计学 | 第05章 概率论 | 第06章 机器学习
第07章 计算语言学 | 第08章 计算机视觉 | 第09章 音频与语音
第10章 多模态学习 | 第11章 自主系统 | 第12章 图神经网络
第13章 计算与操作系统 | 第14章 数据结构与算法
第15章 生产级软件工程 | 第16章 SIMD与GPU编程
第17章 AI推理 | 第18章 ML系统设计
第19章 应用人工智能 | 第20章 前沿人工智能

翻译说明:
- 所有数学公式 $...$ / $$...$$、代码块、图片引用完整保留
- mkdocs.yml 配置中文导航 + language: zh
- README.md 已翻译为中文(兼 docs/index.md)
- docs/ 目录包含指向各章文件的 symlink
- 约 29,000 行中文内容,排除 .cache/ 构建缓存
2026-05-03 10:23:20 +08:00

139 lines
8.6 KiB
XML

<!--
  3D VQ-VAE video tokenisation pipeline diagram (750 x 300 user units).

  Reading order, left to right along the y=108 centre line:
    video frame stack, 3D conv encoder, 3D latent grid, 3D quantiser
    (with a small codebook icon), 3D conv decoder, reconstructed frames.
  A caption and two info boxes at the bottom summarise the spatial and
  temporal compression factors.

  role="img" together with the title and desc elements gives assistive
  technology a single named graphic with a prose description, instead of
  a sequence of disconnected text fragments.
-->
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 750 300" width="750" height="300" font-family="Arial, sans-serif" role="img" aria-labelledby="vqvaeTitle" aria-describedby="vqvaeDesc">
  <title id="vqvaeTitle">3D VQ-VAE for Video Tokenisation</title>
  <desc id="vqvaeDesc">Pipeline diagram: a stack of video frames (T x H x W x 3) passes through a spatiotemporal 3D convolutional encoder into a 3D latent grid (T' by H' x W'), is quantised with a codebook, and is decoded by a spatiotemporal 3D convolutional decoder into reconstructed frames. Spatial compression is 4-16x per axis and temporal compression is 4-8x.</desc>

  <defs>
    <!-- Arrowhead for the main pipeline arrows (stroke #666). -->
    <marker id="vArr" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <path d="M0,0 L8,3 L0,6 Z" fill="#666"/>
    </marker>
    <!-- Lighter arrowhead so the #999 "time" arrow has a head matching its shaft. -->
    <marker id="vArrLight" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <path d="M0,0 L8,3 L0,6 Z" fill="#999"/>
    </marker>
  </defs>

  <!-- Visible heading -->
  <text x="375" y="22" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">3D VQ-VAE for Video Tokenisation</text>

  <!-- Video frames stack (left): five offset rectangles, drawn back to front
       with increasing opacity so the front frame reads as nearest. -->
  <g transform="translate(15, 65)">
    <!-- Frame 5 (back) -->
    <rect x="32" y="0" width="60" height="48" rx="3" fill="#3498db" opacity="0.2" stroke="#3498db" stroke-width="0.8"/>
    <!-- Frame 4 -->
    <rect x="24" y="10" width="60" height="48" rx="3" fill="#3498db" opacity="0.3" stroke="#3498db" stroke-width="0.8"/>
    <!-- Frame 3 -->
    <rect x="16" y="20" width="60" height="48" rx="3" fill="#3498db" opacity="0.4" stroke="#3498db" stroke-width="0.8"/>
    <!-- Frame 2 -->
    <rect x="8" y="30" width="60" height="48" rx="3" fill="#3498db" opacity="0.55" stroke="#3498db" stroke-width="0.8"/>
    <!-- Frame 1 (front) -->
    <rect x="0" y="40" width="60" height="48" rx="3" fill="#3498db" opacity="0.7" stroke="#3498db" stroke-width="1"/>
    <!-- Small content squares in front frame -->
    <rect x="5" y="45" width="18" height="15" rx="1" fill="#e74c3c" opacity="0.4"/>
    <rect x="28" y="52" width="24" height="25" rx="1" fill="#27ae60" opacity="0.4"/>
    <!-- Time arrow: points along the stacking direction, towards the back frames -->
    <line x1="42" y1="-8" x2="78" y2="-22" stroke="#999" stroke-width="1" marker-end="url(#vArrLight)"/>
    <text x="74" y="-26" font-size="8" fill="#999">time</text>
  </g>
  <text x="58" y="175" text-anchor="middle" font-size="10" fill="#333">Video Frames</text>
  <text x="58" y="187" text-anchor="middle" font-size="8" fill="#666">(T x H x W x 3)</text>

  <!-- Arrow: frames to encoder -->
  <line x1="110" y1="108" x2="148" y2="108" stroke="#666" stroke-width="1.5" marker-end="url(#vArr)"/>

  <!-- 3D Conv Encoder -->
  <rect x="156" y="68" width="100" height="80" rx="8" fill="#3498db" fill-opacity="0.12" stroke="#3498db" stroke-width="1.5"/>
  <text x="206" y="100" text-anchor="middle" font-size="11" font-weight="bold" fill="#3498db">3D Conv</text>
  <text x="206" y="114" text-anchor="middle" font-size="11" font-weight="bold" fill="#3498db">Encoder</text>
  <text x="206" y="132" text-anchor="middle" font-size="8" fill="#666">spatiotemporal</text>

  <!-- Arrow: encoder to latent grid -->
  <line x1="264" y1="108" x2="294" y2="108" stroke="#666" stroke-width="1.5" marker-end="url(#vArr)"/>

  <!-- 3D grid of latent tokens: three 56x56 layers, each split 4x4 by grid
       lines every 14 units, offset by (15, 20) per layer to suggest depth. -->
  <g transform="translate(302, 55)">
    <!-- Back layer (time depth) -->
    <rect x="30" y="0" width="56" height="56" rx="2" fill="#9b59b6" fill-opacity="0.08" stroke="#9b59b6" stroke-width="0.6"/>
    <!-- Grid lines back -->
    <line x1="30" y1="14" x2="86" y2="14" stroke="#9b59b6" stroke-width="0.3"/>
    <line x1="30" y1="28" x2="86" y2="28" stroke="#9b59b6" stroke-width="0.3"/>
    <line x1="30" y1="42" x2="86" y2="42" stroke="#9b59b6" stroke-width="0.3"/>
    <line x1="44" y1="0" x2="44" y2="56" stroke="#9b59b6" stroke-width="0.3"/>
    <line x1="58" y1="0" x2="58" y2="56" stroke="#9b59b6" stroke-width="0.3"/>
    <line x1="72" y1="0" x2="72" y2="56" stroke="#9b59b6" stroke-width="0.3"/>
    <!-- Middle layer -->
    <rect x="15" y="20" width="56" height="56" rx="2" fill="#9b59b6" fill-opacity="0.12" stroke="#9b59b6" stroke-width="0.8"/>
    <line x1="15" y1="34" x2="71" y2="34" stroke="#9b59b6" stroke-width="0.3"/>
    <line x1="15" y1="48" x2="71" y2="48" stroke="#9b59b6" stroke-width="0.3"/>
    <line x1="15" y1="62" x2="71" y2="62" stroke="#9b59b6" stroke-width="0.3"/>
    <line x1="29" y1="20" x2="29" y2="76" stroke="#9b59b6" stroke-width="0.3"/>
    <line x1="43" y1="20" x2="43" y2="76" stroke="#9b59b6" stroke-width="0.3"/>
    <line x1="57" y1="20" x2="57" y2="76" stroke="#9b59b6" stroke-width="0.3"/>
    <!-- Front layer -->
    <rect x="0" y="40" width="56" height="56" rx="2" fill="#9b59b6" fill-opacity="0.18" stroke="#9b59b6" stroke-width="1"/>
    <line x1="0" y1="54" x2="56" y2="54" stroke="#9b59b6" stroke-width="0.4"/>
    <line x1="0" y1="68" x2="56" y2="68" stroke="#9b59b6" stroke-width="0.4"/>
    <line x1="0" y1="82" x2="56" y2="82" stroke="#9b59b6" stroke-width="0.4"/>
    <line x1="14" y1="40" x2="14" y2="96" stroke="#9b59b6" stroke-width="0.4"/>
    <line x1="28" y1="40" x2="28" y2="96" stroke="#9b59b6" stroke-width="0.4"/>
    <line x1="42" y1="40" x2="42" y2="96" stroke="#9b59b6" stroke-width="0.4"/>
    <!-- Depth connectors: join matching corners of adjacent layers
         (front to middle first, then middle to back). -->
    <line x1="0" y1="40" x2="15" y2="20" stroke="#9b59b6" stroke-width="0.5"/>
    <line x1="56" y1="40" x2="71" y2="20" stroke="#9b59b6" stroke-width="0.5"/>
    <line x1="56" y1="96" x2="71" y2="76" stroke="#9b59b6" stroke-width="0.5"/>
    <line x1="15" y1="20" x2="30" y2="0" stroke="#9b59b6" stroke-width="0.4"/>
    <line x1="71" y1="20" x2="86" y2="0" stroke="#9b59b6" stroke-width="0.4"/>
    <line x1="71" y1="76" x2="86" y2="56" stroke="#9b59b6" stroke-width="0.4"/>
    <!-- Dimension labels -->
    <text x="28" y="110" text-anchor="middle" font-size="8" fill="#9b59b6">H' x W'</text>
    <text x="90" y="-5" font-size="8" fill="#9b59b6">T'</text>
  </g>
  <text x="344" y="175" text-anchor="middle" font-size="10" fill="#9b59b6">3D Latent Grid</text>

  <!-- Arrow: latent grid to quantiser -->
  <line x1="400" y1="108" x2="430" y2="108" stroke="#666" stroke-width="1.5" marker-end="url(#vArr)"/>

  <!-- 3D Quantise -->
  <rect x="438" y="72" width="82" height="72" rx="8" fill="#f39c12" fill-opacity="0.12" stroke="#f39c12" stroke-width="1.5"/>
  <text x="479" y="102" text-anchor="middle" font-size="11" font-weight="bold" fill="#f39c12">3D</text>
  <text x="479" y="118" text-anchor="middle" font-size="11" font-weight="bold" fill="#f39c12">Quantise</text>
  <!-- Mini codebook icon: a white box divided into three rows -->
  <rect x="458" y="126" width="42" height="12" rx="2" fill="white" stroke="#f39c12" stroke-width="0.6"/>
  <line x1="458" y1="130" x2="500" y2="130" stroke="#f39c12" stroke-width="0.3"/>
  <line x1="458" y1="134" x2="500" y2="134" stroke="#f39c12" stroke-width="0.3"/>

  <!-- Arrow: quantiser to decoder -->
  <line x1="528" y1="108" x2="558" y2="108" stroke="#666" stroke-width="1.5" marker-end="url(#vArr)"/>

  <!-- 3D Conv Decoder -->
  <rect x="566" y="68" width="100" height="80" rx="8" fill="#e74c3c" fill-opacity="0.12" stroke="#e74c3c" stroke-width="1.5"/>
  <text x="616" y="100" text-anchor="middle" font-size="11" font-weight="bold" fill="#e74c3c">3D Conv</text>
  <text x="616" y="114" text-anchor="middle" font-size="11" font-weight="bold" fill="#e74c3c">Decoder</text>
  <text x="616" y="132" text-anchor="middle" font-size="8" fill="#666">spatiotemporal</text>

  <!-- Arrow: decoder to reconstructed frames -->
  <line x1="674" y1="108" x2="696" y2="108" stroke="#666" stroke-width="1.5" marker-end="url(#vArr)"/>

  <!-- Reconstructed frame stack: smaller, dashed-outline version of the input stack -->
  <g transform="translate(700, 65)">
    <rect x="20" y="0" width="26" height="22" rx="2" fill="#3498db" opacity="0.15" stroke="#3498db" stroke-width="0.6" stroke-dasharray="2,1"/>
    <rect x="15" y="8" width="26" height="22" rx="2" fill="#3498db" opacity="0.2" stroke="#3498db" stroke-width="0.6" stroke-dasharray="2,1"/>
    <rect x="10" y="16" width="26" height="22" rx="2" fill="#3498db" opacity="0.3" stroke="#3498db" stroke-width="0.6" stroke-dasharray="2,1"/>
    <rect x="5" y="24" width="26" height="22" rx="2" fill="#3498db" opacity="0.4" stroke="#3498db" stroke-width="0.8" stroke-dasharray="2,1"/>
    <rect x="0" y="32" width="26" height="22" rx="2" fill="#3498db" opacity="0.5" stroke="#3498db" stroke-width="1" stroke-dasharray="2,1"/>
  </g>
  <text x="718" y="175" text-anchor="middle" font-size="9" fill="#333">Recon.</text>
  <text x="718" y="187" text-anchor="middle" font-size="9" fill="#333">Frames</text>

  <!-- Bottom annotation: compression factors -->
  <text x="375" y="218" text-anchor="middle" font-size="9" fill="#999">Compression: spatial (4-16x per axis) and temporal (4-8x) reduce video to compact token sequences</text>

  <!-- Compression ratios: spatial box -->
  <g transform="translate(180, 235)">
    <rect x="0" y="0" width="140" height="26" rx="5" fill="#3498db" fill-opacity="0.06" stroke="#3498db" stroke-width="0.8"/>
    <text x="70" y="11" text-anchor="middle" font-size="8" fill="#3498db" font-weight="bold">Spatial: H/f x W/f</text>
    <text x="70" y="22" text-anchor="middle" font-size="7" fill="#666">f = 8 or 16 typically</text>
  </g>
  <!-- Compression ratios: temporal box -->
  <g transform="translate(360, 235)">
    <rect x="0" y="0" width="140" height="26" rx="5" fill="#f39c12" fill-opacity="0.06" stroke="#f39c12" stroke-width="0.8"/>
    <text x="70" y="11" text-anchor="middle" font-size="8" fill="#f39c12" font-weight="bold">Temporal: T/f_t</text>
    <text x="70" y="22" text-anchor="middle" font-size="7" fill="#666">f_t = 4 or 8 typically</text>
  </g>
</svg>