Files
maths-cs-ai-compendium-zh/chapter 18: ML systems design/03. large scale infrastructure/index.html
T

5872 lines
113 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!doctype html>
<html lang="zh" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="description" content="一本开源的直觉优先教科书,从零开始覆盖数学、计算机科学和人工智能(中文翻译版)。">
<meta name="author" content="Henry Ndubuaku (flykhan 译)">
<link rel="canonical" href="https://flykhan.github.io/maths-cs-ai-compendium-zh/chapter%2018%3A%20ML%20systems%20design/03.%20large%20scale%20infrastructure/">
<link rel="prev" href="../02.%20cloud%20computing/">
<link rel="next" href="../04.%20ML%20systems%20design/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>大规模基础设施 - 数学、计算机科学与 AI 百科全书</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="stylesheet" href="../../assets/stylesheets/palette.ab4e12ef.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="slate" data-md-color-accent="indigo">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#_1" class="md-skip">
跳转至
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="页眉">
<a href="../.." title="数学、计算机科学与 AI 百科全书" class="md-header__button md-logo" aria-label="数学、计算机科学与 AI 百科全书" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
数学、计算机科学与 AI 百科全书
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
大规模基础设施
</span>
</div>
</div>
</div>
<form class="md-header__option" data-md-component="palette">
<input class="md-option" data-md-color-media="" data-md-color-scheme="default" data-md-color-primary="slate" data-md-color-accent="indigo" aria-label="切换到深色模式" type="radio" name="__palette" id="__palette_0">
<label class="md-header__button md-icon" title="切换到深色模式" for="__palette_1" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
<input class="md-option" data-md-color-media="" data-md-color-scheme="slate" data-md-color-primary="slate" data-md-color-accent="indigo" aria-label="切换到浅色模式" type="radio" name="__palette" id="__palette_1">
<label class="md-header__button md-icon" title="切换到浅色模式" for="__palette_0" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
</form>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
</label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" aria-label="搜索" placeholder="搜索" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</label>
<nav class="md-search__options" aria-label="查找">
<button type="reset" class="md-search__icon md-icon" title="清空当前内容" aria-label="清空当前内容" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
</button>
</nav>
<div class="md-search__suggest" data-md-component="search-suggest"></div>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
正在初始化搜索引擎
</div>
<ol class="md-search-result__list" role="presentation"></ol>
</div>
</div>
</div>
</div>
</div>
<div class="md-header__source">
<a href="https://github.com/flykhan/maths-cs-ai-compendium-zh" title="前往仓库" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
flykhan/maths-cs-ai-compendium-zh
</div>
</a>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<nav class="md-tabs" aria-label="标签" data-md-component="tabs">
<div class="md-grid">
<ul class="md-tabs__list">
<li class="md-tabs__item">
<a href="../.." class="md-tabs__link">
首页
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2001%3A%20vectors/01.%20vector%20spaces/" class="md-tabs__link">
向量
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2002%3A%20matrices/01.%20matrix%20properties/" class="md-tabs__link">
矩阵
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2003%3A%20calculus/01.%20differential%20calculus/" class="md-tabs__link">
微积分
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2004%3A%20statistics/01.%20fundamentals/" class="md-tabs__link">
统计学
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2005%3A%20probability/01.%20counting/" class="md-tabs__link">
概率论
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2006%3A%20machine%20learning/01.%20classical%20machine%20learning/" class="md-tabs__link">
机器学习
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2007%3A%20computational%20linguistics/01.%20linguistic%20foundations/" class="md-tabs__link">
计算语言学
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2008%3A%20computer%20vision/01.%20image%20fundamentals/" class="md-tabs__link">
计算机视觉
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2009%3A%20audio%20and%20speech/01.%20digital%20signal%20processing/" class="md-tabs__link">
音频与语音
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2010%3A%20multimodal%20learning/01.%20multimodal%20representations/" class="md-tabs__link">
多模态学习
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2011%3A%20autonomous%20systems/01.%20perception/" class="md-tabs__link">
自主系统
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2012%3A%20graph%20neural%20networks/01.%20geometric%20deep%20learning/" class="md-tabs__link">
图神经网络
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2013%3A%20computing%20and%20OS/01.%20discrete%20maths/" class="md-tabs__link">
计算机与操作系统
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/00.%20foundations/" class="md-tabs__link">
数据结构与算法
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2015%3A%20production%20software%20engineering/01.%20linux%20and%20CMD/" class="md-tabs__link">
生产级软件工程
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/00.%20why%20C%2B%2B%20and%20how%20ML%20frameworks%20work/" class="md-tabs__link">
SIMD 与 GPU 编程
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2017%3A%20AI%20inference/01.%20quantisation/" class="md-tabs__link">
AI 推理
</a>
</li>
<li class="md-tabs__item md-tabs__item--active">
<a href="../01.%20systems%20design%20fundamentals/" class="md-tabs__link">
ML 系统设计
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2019%3A%20applied%20AI/01.%20AI%20for%20finance/" class="md-tabs__link">
应用 AI
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/01.%20quantum%20machine%20learning/" class="md-tabs__link">
前沿 AI
</a>
</li>
</ul>
</div>
</nav>
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="导航栏" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="数学、计算机科学与 AI 百科全书" class="md-nav__button md-logo" aria-label="数学、计算机科学与 AI 百科全书" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
数学、计算机科学与 AI 百科全书
</label>
<div class="md-nav__source">
<a href="https://github.com/flykhan/maths-cs-ai-compendium-zh" title="前往仓库" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
flykhan/maths-cs-ai-compendium-zh
</div>
</a>
</div>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
首页
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
向量
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
向量
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2001%3A%20vectors/01.%20vector%20spaces/" class="md-nav__link">
<span class="md-ellipsis">
向量空间
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2001%3A%20vectors/02.%20vector%20properties/" class="md-nav__link">
<span class="md-ellipsis">
向量性质
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2001%3A%20vectors/03.%20norms%20and%20metrics/" class="md-nav__link">
<span class="md-ellipsis">
范数与度量
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2001%3A%20vectors/04.%20products/" class="md-nav__link">
<span class="md-ellipsis">
向量积
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2001%3A%20vectors/05.%20basis%20and%20duality/" class="md-nav__link">
<span class="md-ellipsis">
基与对偶性
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
矩阵
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
矩阵
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2002%3A%20matrices/01.%20matrix%20properties/" class="md-nav__link">
<span class="md-ellipsis">
矩阵性质
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2002%3A%20matrices/02.%20matrix%20types/" class="md-nav__link">
<span class="md-ellipsis">
矩阵类型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2002%3A%20matrices/03.%20operations/" class="md-nav__link">
<span class="md-ellipsis">
矩阵运算
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2002%3A%20matrices/04.%20linear%20transformations/" class="md-nav__link">
<span class="md-ellipsis">
线性变换
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2002%3A%20matrices/05.%20decompositions/" class="md-nav__link">
<span class="md-ellipsis">
矩阵分解
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
微积分
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
微积分
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2003%3A%20calculus/01.%20differential%20calculus/" class="md-nav__link">
<span class="md-ellipsis">
微分
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2003%3A%20calculus/02.%20integral%20calculus/" class="md-nav__link">
<span class="md-ellipsis">
积分
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2003%3A%20calculus/03.%20multivariate%20calculus/" class="md-nav__link">
<span class="md-ellipsis">
多元微积分
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2003%3A%20calculus/04.%20function%20approximation/" class="md-nav__link">
<span class="md-ellipsis">
函数逼近
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2003%3A%20calculus/05.%20optimisation/" class="md-nav__link">
<span class="md-ellipsis">
优化
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_5" >
<label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">
<span class="md-ellipsis">
统计学
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5">
<span class="md-nav__icon md-icon"></span>
统计学
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2004%3A%20statistics/01.%20fundamentals/" class="md-nav__link">
<span class="md-ellipsis">
基础
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2004%3A%20statistics/02.%20measures/" class="md-nav__link">
<span class="md-ellipsis">
统计量
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2004%3A%20statistics/03.%20sampling/" class="md-nav__link">
<span class="md-ellipsis">
抽样
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2004%3A%20statistics/04.%20hypothesis%20testing/" class="md-nav__link">
<span class="md-ellipsis">
假设检验
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2004%3A%20statistics/05.%20inference/" class="md-nav__link">
<span class="md-ellipsis">
推断
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_6" >
<label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">
<span class="md-ellipsis">
概率论
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_6">
<span class="md-nav__icon md-icon"></span>
概率论
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2005%3A%20probability/01.%20counting/" class="md-nav__link">
<span class="md-ellipsis">
计数
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2005%3A%20probability/02.%20probability%20concepts/" class="md-nav__link">
<span class="md-ellipsis">
概率概念
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2005%3A%20probability/03.%20distributions/" class="md-nav__link">
<span class="md-ellipsis">
分布
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2005%3A%20probability/04.%20bayesian/" class="md-nav__link">
<span class="md-ellipsis">
贝叶斯
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2005%3A%20probability/05.%20information%20theory/" class="md-nav__link">
<span class="md-ellipsis">
信息论
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_7" >
<label class="md-nav__link" for="__nav_7" id="__nav_7_label" tabindex="0">
<span class="md-ellipsis">
机器学习
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_7_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_7">
<span class="md-nav__icon md-icon"></span>
机器学习
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2006%3A%20machine%20learning/01.%20classical%20machine%20learning/" class="md-nav__link">
<span class="md-ellipsis">
经典机器学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2006%3A%20machine%20learning/02.%20gradient%20machine%20learning/" class="md-nav__link">
<span class="md-ellipsis">
梯度机器学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2006%3A%20machine%20learning/03.%20deep%20learning/" class="md-nav__link">
<span class="md-ellipsis">
深度学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2006%3A%20machine%20learning/04.%20reinforcement%20learning/" class="md-nav__link">
<span class="md-ellipsis">
强化学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2006%3A%20machine%20learning/05.%20distributed%20deep%20learning/" class="md-nav__link">
<span class="md-ellipsis">
分布式深度学习
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_8" >
<label class="md-nav__link" for="__nav_8" id="__nav_8_label" tabindex="0">
<span class="md-ellipsis">
计算语言学
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_8_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_8">
<span class="md-nav__icon md-icon"></span>
计算语言学
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2007%3A%20computational%20linguistics/01.%20linguistic%20foundations/" class="md-nav__link">
<span class="md-ellipsis">
语言学基础
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2007%3A%20computational%20linguistics/02.%20text%20processing%20and%20classic%20NLP/" class="md-nav__link">
<span class="md-ellipsis">
文本处理与经典 NLP
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2007%3A%20computational%20linguistics/03.%20embeddings%20and%20sequence%20models/" class="md-nav__link">
<span class="md-ellipsis">
嵌入与序列模型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2007%3A%20computational%20linguistics/04.%20transformers%20and%20language%20models/" class="md-nav__link">
<span class="md-ellipsis">
Transformer 与语言模型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2007%3A%20computational%20linguistics/05.%20advanced%20text%20generation/" class="md-nav__link">
<span class="md-ellipsis">
高级文本生成
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_9" >
<label class="md-nav__link" for="__nav_9" id="__nav_9_label" tabindex="0">
<span class="md-ellipsis">
计算机视觉
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_9_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_9">
<span class="md-nav__icon md-icon"></span>
计算机视觉
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2008%3A%20computer%20vision/01.%20image%20fundamentals/" class="md-nav__link">
<span class="md-ellipsis">
图像基础
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2008%3A%20computer%20vision/02.%20convolutional%20networks/" class="md-nav__link">
<span class="md-ellipsis">
卷积网络
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2008%3A%20computer%20vision/03.%20object%20detection%20and%20segmentation/" class="md-nav__link">
<span class="md-ellipsis">
目标检测与分割
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2008%3A%20computer%20vision/04.%20vision%20transformers%20and%20generation/" class="md-nav__link">
<span class="md-ellipsis">
ViT 与生成模型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2008%3A%20computer%20vision/05.%20video%20and%203D%20vision/" class="md-nav__link">
<span class="md-ellipsis">
视频与 3D 视觉
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_10" >
<label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
<span class="md-ellipsis">
音频与语音
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_10">
<span class="md-nav__icon md-icon"></span>
音频与语音
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2009%3A%20audio%20and%20speech/01.%20digital%20signal%20processing/" class="md-nav__link">
<span class="md-ellipsis">
数字信号处理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2009%3A%20audio%20and%20speech/02.%20automatic%20speech%20recognition/" class="md-nav__link">
<span class="md-ellipsis">
自动语音识别
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2009%3A%20audio%20and%20speech/03.%20text%20to%20speech%20and%20voice/" class="md-nav__link">
<span class="md-ellipsis">
语音合成
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2009%3A%20audio%20and%20speech/04.%20speaker%20and%20audio%20analysis/" class="md-nav__link">
<span class="md-ellipsis">
说话人与音频分析
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2009%3A%20audio%20and%20speech/05.%20source%20separation%20and%20noise/" class="md-nav__link">
<span class="md-ellipsis">
源分离与降噪
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_11" >
<label class="md-nav__link" for="__nav_11" id="__nav_11_label" tabindex="0">
<span class="md-ellipsis">
多模态学习
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_11_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_11">
<span class="md-nav__icon md-icon"></span>
多模态学习
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2010%3A%20multimodal%20learning/01.%20multimodal%20representations/" class="md-nav__link">
<span class="md-ellipsis">
多模态表征
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2010%3A%20multimodal%20learning/02.%20vision%20language%20models/" class="md-nav__link">
<span class="md-ellipsis">
视觉语言模型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2010%3A%20multimodal%20learning/03.%20image%20and%20video%20tokenisation/" class="md-nav__link">
<span class="md-ellipsis">
图像与视频 Token 化
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2010%3A%20multimodal%20learning/04.%20cross-modal%20generation/" class="md-nav__link">
<span class="md-ellipsis">
跨模态生成
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2010%3A%20multimodal%20learning/05.%20unified%20multimodal%20architectures/" class="md-nav__link">
<span class="md-ellipsis">
统一多模态架构
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_12" >
<label class="md-nav__link" for="__nav_12" id="__nav_12_label" tabindex="0">
<span class="md-ellipsis">
自主系统
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_12_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_12">
<span class="md-nav__icon md-icon"></span>
自主系统
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2011%3A%20autonomous%20systems/01.%20perception/" class="md-nav__link">
<span class="md-ellipsis">
感知
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2011%3A%20autonomous%20systems/02.%20robot%20learning/" class="md-nav__link">
<span class="md-ellipsis">
机器人学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2011%3A%20autonomous%20systems/03.%20vision-language-action%20models/" class="md-nav__link">
<span class="md-ellipsis">
视觉-语言-动作模型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2011%3A%20autonomous%20systems/04.%20self-driving/" class="md-nav__link">
<span class="md-ellipsis">
自动驾驶
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2011%3A%20autonomous%20systems/05.%20space%20and%20extreme%20robotics/" class="md-nav__link">
<span class="md-ellipsis">
太空与极端机器人
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_13" >
<label class="md-nav__link" for="__nav_13" id="__nav_13_label" tabindex="0">
<span class="md-ellipsis">
图神经网络
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_13_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_13">
<span class="md-nav__icon md-icon"></span>
图神经网络
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2012%3A%20graph%20neural%20networks/01.%20geometric%20deep%20learning/" class="md-nav__link">
<span class="md-ellipsis">
几何深度学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2012%3A%20graph%20neural%20networks/02.%20graph%20theory/" class="md-nav__link">
<span class="md-ellipsis">
图论
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2012%3A%20graph%20neural%20networks/03.%20graph%20neural%20networks/" class="md-nav__link">
<span class="md-ellipsis">
图神经网络
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2012%3A%20graph%20neural%20networks/04.%20graph%20attention%20networks/" class="md-nav__link">
<span class="md-ellipsis">
图注意力网络
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2012%3A%20graph%20neural%20networks/05.%203d%20graph%20networks/" class="md-nav__link">
<span class="md-ellipsis">
3D 图网络
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_14" >
<label class="md-nav__link" for="__nav_14" id="__nav_14_label" tabindex="0">
<span class="md-ellipsis">
计算机与操作系统
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_14_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_14">
<span class="md-nav__icon md-icon"></span>
计算机与操作系统
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2013%3A%20computing%20and%20OS/01.%20discrete%20maths/" class="md-nav__link">
<span class="md-ellipsis">
离散数学
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2013%3A%20computing%20and%20OS/02.%20computer%20architecture/" class="md-nav__link">
<span class="md-ellipsis">
计算机体系结构
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2013%3A%20computing%20and%20OS/03.%20operating%20systems/" class="md-nav__link">
<span class="md-ellipsis">
操作系统
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2013%3A%20computing%20and%20OS/04.%20concurrency%20and%20parallelism/" class="md-nav__link">
<span class="md-ellipsis">
并发与并行
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2013%3A%20computing%20and%20OS/05.%20programming%20languages/" class="md-nav__link">
<span class="md-ellipsis">
编程语言
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_15" >
<label class="md-nav__link" for="__nav_15" id="__nav_15_label" tabindex="0">
<span class="md-ellipsis">
数据结构与算法
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_15_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_15">
<span class="md-nav__icon md-icon"></span>
数据结构与算法
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/00.%20foundations/" class="md-nav__link">
<span class="md-ellipsis">
基础
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/01.%20arrays%20and%20hashing/" class="md-nav__link">
<span class="md-ellipsis">
数组与哈希
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/02.%20linked%20lists%2C%20stacks%2C%20and%20queues/" class="md-nav__link">
<span class="md-ellipsis">
链表、栈与队列
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/03.%20trees/" class="md-nav__link">
<span class="md-ellipsis">
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/04.%20graphs/" class="md-nav__link">
<span class="md-ellipsis">
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/05.%20sorting%20and%20search/" class="md-nav__link">
<span class="md-ellipsis">
排序与搜索
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_16" >
<label class="md-nav__link" for="__nav_16" id="__nav_16_label" tabindex="0">
<span class="md-ellipsis">
生产级软件工程
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_16_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_16">
<span class="md-nav__icon md-icon"></span>
生产级软件工程
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2015%3A%20production%20software%20engineering/01.%20linux%20and%20CMD/" class="md-nav__link">
<span class="md-ellipsis">
Linux 与命令行
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2015%3A%20production%20software%20engineering/02.%20git%20and%20repository%20management/" class="md-nav__link">
<span class="md-ellipsis">
Git 与仓库管理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2015%3A%20production%20software%20engineering/03.%20codebase%20design/" class="md-nav__link">
<span class="md-ellipsis">
代码设计
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2015%3A%20production%20software%20engineering/04.%20testing%20and%20quality%20assurance/" class="md-nav__link">
<span class="md-ellipsis">
测试与质量保障
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2015%3A%20production%20software%20engineering/05.%20deployment%20and%20devops/" class="md-nav__link">
<span class="md-ellipsis">
部署与 DevOps
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_17" >
<label class="md-nav__link" for="__nav_17" id="__nav_17_label" tabindex="0">
<span class="md-ellipsis">
SIMD 与 GPU 编程
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_17_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_17">
<span class="md-nav__icon md-icon"></span>
SIMD 与 GPU 编程
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/00.%20why%20C%2B%2B%20and%20how%20ML%20frameworks%20work/" class="md-nav__link">
<span class="md-ellipsis">
为什么是 C++ 及 ML 框架原理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/01.%20hardware%20fundamentals/" class="md-nav__link">
<span class="md-ellipsis">
硬件基础
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/02.%20ARM%20and%20NEON/" class="md-nav__link">
<span class="md-ellipsis">
ARM 与 NEON
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/03.%20x86%20and%20AVX/" class="md-nav__link">
<span class="md-ellipsis">
x86 与 AVX
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/04.%20GPU%20architecture%20and%20CUDA/" class="md-nav__link">
<span class="md-ellipsis">
GPU 架构与 CUDA
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/05.%20triton%2C%20TPUs%20and%20pallax/" class="md-nav__link">
<span class="md-ellipsis">
Triton、TPU 与 Pallas
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/06.%20RISC-V%20and%20embedded%20systems/" class="md-nav__link">
<span class="md-ellipsis">
RISC-V 与嵌入式系统
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/07.%20vulkan%20compute%20and%20cross-platform%20GPU/" class="md-nav__link">
<span class="md-ellipsis">
Vulkan Compute 与跨平台 GPU
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_18" >
<label class="md-nav__link" for="__nav_18" id="__nav_18_label" tabindex="0">
<span class="md-ellipsis">
AI 推理
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_18_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_18">
<span class="md-nav__icon md-icon"></span>
AI 推理
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2017%3A%20AI%20inference/01.%20quantisation/" class="md-nav__link">
<span class="md-ellipsis">
量化
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2017%3A%20AI%20inference/02.%20efficient%20architectures/" class="md-nav__link">
<span class="md-ellipsis">
高效架构
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2017%3A%20AI%20inference/03.%20serving%20and%20batching/" class="md-nav__link">
<span class="md-ellipsis">
服务与批处理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2017%3A%20AI%20inference/04.%20edge%20inference/" class="md-nav__link">
<span class="md-ellipsis">
边缘推理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2017%3A%20AI%20inference/05.%20scaling%20and%20deployment/" class="md-nav__link">
<span class="md-ellipsis">
扩缩与部署
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_19" checked>
<label class="md-nav__link" for="__nav_19" id="__nav_19_label" tabindex="">
<span class="md-ellipsis">
ML 系统设计
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_19_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_19">
<span class="md-nav__icon md-icon"></span>
ML 系统设计
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../01.%20systems%20design%20fundamentals/" class="md-nav__link">
<span class="md-ellipsis">
系统设计基础
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../02.%20cloud%20computing/" class="md-nav__link">
<span class="md-ellipsis">
云计算
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
大规模基础设施
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
大规模基础设施
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="目录">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
目录
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#_2" class="md-nav__link">
<span class="md-ellipsis">
可扩展性
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_3" class="md-nav__link">
<span class="md-ellipsis">
分布式系统
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_4" class="md-nav__link">
<span class="md-ellipsis">
微服务
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_5" class="md-nav__link">
<span class="md-ellipsis">
数据流水线
</span>
</a>
<nav class="md-nav" aria-label="数据流水线">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#_6" class="md-nav__link">
<span class="md-ellipsis">
批处理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_7" class="md-nav__link">
<span class="md-ellipsis">
流处理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#lambda" class="md-nav__link">
<span class="md-ellipsis">
Lambda架构
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#ml" class="md-nav__link">
<span class="md-ellipsis">
ML训练基础设施
</span>
</a>
<nav class="md-nav" aria-label="ML训练基础设施">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#gpu" class="md-nav__link">
<span class="md-ellipsis">
GPU集群
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_8" class="md-nav__link">
<span class="md-ellipsis">
网络拓扑
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_9" class="md-nav__link">
<span class="md-ellipsis">
训练存储
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_10" class="md-nav__link">
<span class="md-ellipsis">
作业调度
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_11" class="md-nav__link">
<span class="md-ellipsis">
容错
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_12" class="md-nav__link">
<span class="md-ellipsis">
成本和效率
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_13" class="md-nav__link">
<span class="md-ellipsis">
数据库扩展
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_14" class="md-nav__link">
<span class="md-ellipsis">
搜索和向量系统
</span>
</a>
<nav class="md-nav" aria-label="搜索和向量系统">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#_15" class="md-nav__link">
<span class="md-ellipsis">
文本搜索
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_16" class="md-nav__link">
<span class="md-ellipsis">
向量搜索
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_17" class="md-nav__link">
<span class="md-ellipsis">
可观测性
</span>
</a>
<nav class="md-nav" aria-label="可观测性">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#_18" class="md-nav__link">
<span class="md-ellipsis">
日志
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_19" class="md-nav__link">
<span class="md-ellipsis">
指标
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_20" class="md-nav__link">
<span class="md-ellipsis">
追踪
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_21" class="md-nav__link">
<span class="md-ellipsis">
可靠性
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#cicd" class="md-nav__link">
<span class="md-ellipsis">
CI/CD
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../04.%20ML%20systems%20design/" class="md-nav__link">
<span class="md-ellipsis">
ML 系统设计
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../05.%20ML%20design%20examples/" class="md-nav__link">
<span class="md-ellipsis">
ML 设计案例
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_20" >
<label class="md-nav__link" for="__nav_20" id="__nav_20_label" tabindex="0">
<span class="md-ellipsis">
应用 AI
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_20_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_20">
<span class="md-nav__icon md-icon"></span>
应用 AI
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2019%3A%20applied%20AI/01.%20AI%20for%20finance/" class="md-nav__link">
<span class="md-ellipsis">
AI 金融
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2019%3A%20applied%20AI/02.%20protein%20design/" class="md-nav__link">
<span class="md-ellipsis">
蛋白质设计
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2019%3A%20applied%20AI/03.%20drug%20discovery/" class="md-nav__link">
<span class="md-ellipsis">
药物发现
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2019%3A%20applied%20AI/04.%20agentic%20systems/" class="md-nav__link">
<span class="md-ellipsis">
智能体系统
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2019%3A%20applied%20AI/05.%20healthcare/" class="md-nav__link">
<span class="md-ellipsis">
医疗健康
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_21" >
<label class="md-nav__link" for="__nav_21" id="__nav_21_label" tabindex="0">
<span class="md-ellipsis">
前沿 AI
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_21_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_21">
<span class="md-nav__icon md-icon"></span>
前沿 AI
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/01.%20quantum%20machine%20learning/" class="md-nav__link">
<span class="md-ellipsis">
量子机器学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/02.%20neuromorphic%20computing/" class="md-nav__link">
<span class="md-ellipsis">
神经形态计算
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/03.%20datacentres%20in%20space/" class="md-nav__link">
<span class="md-ellipsis">
太空数据中心
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/04.%20decentralised%20AI/" class="md-nav__link">
<span class="md-ellipsis">
去中心化 AI
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/05.%20brain%20machine%20interfaces/" class="md-nav__link">
<span class="md-ellipsis">
脑机接口
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="目录">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
目录
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#_2" class="md-nav__link">
<span class="md-ellipsis">
可扩展性
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_3" class="md-nav__link">
<span class="md-ellipsis">
分布式系统
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_4" class="md-nav__link">
<span class="md-ellipsis">
微服务
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_5" class="md-nav__link">
<span class="md-ellipsis">
数据流水线
</span>
</a>
<nav class="md-nav" aria-label="数据流水线">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#_6" class="md-nav__link">
<span class="md-ellipsis">
批处理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_7" class="md-nav__link">
<span class="md-ellipsis">
流处理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#lambda" class="md-nav__link">
<span class="md-ellipsis">
Lambda架构
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#ml" class="md-nav__link">
<span class="md-ellipsis">
ML训练基础设施
</span>
</a>
<nav class="md-nav" aria-label="ML训练基础设施">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#gpu" class="md-nav__link">
<span class="md-ellipsis">
GPU集群
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_8" class="md-nav__link">
<span class="md-ellipsis">
网络拓扑
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_9" class="md-nav__link">
<span class="md-ellipsis">
训练存储
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_10" class="md-nav__link">
<span class="md-ellipsis">
作业调度
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_11" class="md-nav__link">
<span class="md-ellipsis">
容错
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_12" class="md-nav__link">
<span class="md-ellipsis">
成本和效率
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_13" class="md-nav__link">
<span class="md-ellipsis">
数据库扩展
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_14" class="md-nav__link">
<span class="md-ellipsis">
搜索和向量系统
</span>
</a>
<nav class="md-nav" aria-label="搜索和向量系统">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#_15" class="md-nav__link">
<span class="md-ellipsis">
文本搜索
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_16" class="md-nav__link">
<span class="md-ellipsis">
向量搜索
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_17" class="md-nav__link">
<span class="md-ellipsis">
可观测性
</span>
</a>
<nav class="md-nav" aria-label="可观测性">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#_18" class="md-nav__link">
<span class="md-ellipsis">
日志
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_19" class="md-nav__link">
<span class="md-ellipsis">
指标
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_20" class="md-nav__link">
<span class="md-ellipsis">
追踪
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_21" class="md-nav__link">
<span class="md-ellipsis">
可靠性
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#cicd" class="md-nav__link">
<span class="md-ellipsis">
CI/CD
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="_1">大规模基础设施<a class="headerlink" href="#_1" title="Permanent link">&para;</a></h1>
<p><em>构建服务数百万用户的系统需要的不只是单个服务器。本文件涵盖可扩展性模式、分布式系统基础、微服务、数据流水线、数据库扩展、搜索和向量系统、可观测性、可靠性工程以及CI/CD</em></p>
<ul>
<li>每秒服务1个请求的模型可以在笔记本电脑上运行。每秒服务100,000个请求且可用性达到99.9%需要分布式系统、自动故障转移和精心设计的数据流水线。本文件涵盖弥合这一差距的模式。</li>
</ul>
<h2 id="_2">可扩展性<a class="headerlink" href="#_2" title="Permanent link">&para;</a></h2>
<ul>
<li><strong>垂直扩展</strong>(向上扩展):换更大的机器。更多CPU、更多内存、更大的GPU。简单但有硬性限制(最大的可用机器)和单点故障。</li>
<li><strong>水平扩展</strong>(向外扩展):增加更多机器。每台处理一部分流量。没有单机限制,但需要:负载均衡(文件01)、数据分区和处理分布式状态。</li>
<li><strong>无状态服务</strong>默认是可水平扩展的。在负载均衡器后面添加更多实例即可。在启动时加载权重并独立处理请求的模型推理服务器是无状态的——任何实例都可以处理任何请求。</li>
<li><strong>有状态服务</strong>(数据库、KV缓存、特征存储)更难扩展。状态必须在多台机器间分区(分片,文件01)并复制以实现容错。</li>
<li><strong>可扩展性方程</strong>:对于一个有<span class="arithmatex">\(n\)</span>台服务器的系统:<ul>
<li><strong>理想情况</strong>:吞吐量线性扩展(<span class="arithmatex">\(n\)</span>台服务器→<span class="arithmatex">\(n\times\)</span>吞吐量)。</li>
<li><strong>实际情况</strong>:协调、负载均衡和数据传输的开销意味着吞吐量亚线性扩展。阿姆达尔定律(第13章)适用:串行部分(共享状态、协调)限制了加速比。</li>
</ul>
</li>
</ul>
<h2 id="_3">分布式系统<a class="headerlink" href="#_3" title="Permanent link">&para;</a></h2>
<ul>
<li><strong>分布式系统</strong>是一组协调提供服务器的机器。基本挑战:</li>
<li><strong>网络分区</strong>:机器不能总是通信。网线被切断、交换机故障、数据中心断电。系统必须处理部分故障。</li>
<li><strong>时钟偏差</strong>:机器有不同的时钟。"事件A发生在机器1的10:00:01"和"事件B发生在机器2的10:00:01"并不意味它们同时发生。<strong>逻辑时钟</strong>(Lamport时间戳、向量时钟)建立排序而不依赖物理时钟。</li>
<li><strong>共识</strong>:多台机器如何就某个值达成一致(例如,谁是领导者)?<strong>Raft</strong>是标准的共识算法。一组节点选举一个领导者。领导者处理所有写入。如果领导者失败,剩余节点选举新的领导者。需要多数(5个节点中的3个)才能运行,因此能容忍<span class="arithmatex">\(\lfloor(n-1)/2\rfloor\)</span>个故障。</li>
<li><strong>分布式锁</strong>:确保只有一台机器执行关键操作。<strong>Redlock</strong>(基于Redis)跨多个Redis实例获取锁。如果多数实例授予锁,则获取成功。用于:防止重复的模型部署,确保只有一个训练作业写入检查点。</li>
</ul>
<h2 id="_4">微服务<a class="headerlink" href="#_4" title="Permanent link">&para;</a></h2>
<p><img alt="微服务ML架构:API网关路由到特征服务、模型服务和日志服务,每个都有自己的数据库,通过消息队列连接" src="../../images/microservices_architecture.svg" /></p>
<ul>
<li><strong>微服务</strong>将系统分解为小型、独立可部署的服务。每个服务拥有一个领域:</li>
</ul>
<div class="highlight"><pre><span></span><code><a id="__codelineno-0-1" name="__codelineno-0-1" href="#__codelineno-0-1"></a>┌─────────────┐ ┌──────────────┐ ┌─────────────┐
<a id="__codelineno-0-2" name="__codelineno-0-2" href="#__codelineno-0-2"></a>│ API网关 │→ │ 特征服务 │→ │ 特征数据库 │
<a id="__codelineno-0-3" name="__codelineno-0-3" href="#__codelineno-0-3"></a>└─────────────┘ └──────────────┘ └─────────────┘
<a id="__codelineno-0-4" name="__codelineno-0-4" href="#__codelineno-0-4"></a>
<a id="__codelineno-0-5" name="__codelineno-0-5" href="#__codelineno-0-5"></a> ├────────→ ┌──────────────┐ ┌─────────────┐
<a id="__codelineno-0-6" name="__codelineno-0-6" href="#__codelineno-0-6"></a> │ │ 模型服务 │→ │ 模型存储 │
<a id="__codelineno-0-7" name="__codelineno-0-7" href="#__codelineno-0-7"></a> │ └──────────────┘ └─────────────┘
<a id="__codelineno-0-8" name="__codelineno-0-8" href="#__codelineno-0-8"></a>
<a id="__codelineno-0-9" name="__codelineno-0-9" href="#__codelineno-0-9"></a> └────────→ ┌──────────────┐ ┌─────────────┐
<a id="__codelineno-0-10" name="__codelineno-0-10" href="#__codelineno-0-10"></a> │ 日志服务 │→ │ 日志存储 │
<a id="__codelineno-0-11" name="__codelineno-0-11" href="#__codelineno-0-11"></a> └──────────────┘ └─────────────┘
</code></pre></div>
<ul>
<li><strong>优点</strong>:独立部署(更新模型服务而不影响特征服务)、独立缩放(根据请求负载缩放模型服务器,根据特征存储读取率缩放特征服务器)、技术自由(模型服务用Python,特征服务用Go)。</li>
<li><strong>缺点</strong>:网络开销(每次服务调用都是网络往返)、复杂性(调试跨越多个服务)、数据一致性(没有跨服务的事务)。</li>
<li><strong>服务发现</strong>:API网关如何找到模型服务?选项:基于DNS(每个服务注册一个DNS名)、K8s服务(内置)或服务注册表(Consul、Eureka)。</li>
<li><strong>Saga模式</strong>:对于跨多个服务的操作(创建用户+分配资源+发送欢迎邮件),使用saga:一系列本地事务,如果任何步骤失败则执行补偿操作。</li>
</ul>
<h2 id="_5">数据流水线<a class="headerlink" href="#_5" title="Permanent link">&para;</a></h2>
<ul>
<li>ML系统消耗大量数据。<strong>数据流水线</strong>移动、转换和服务这些数据:</li>
</ul>
<h3 id="_6">批处理<a class="headerlink" href="#_6" title="Permanent link">&para;</a></h3>
<ul>
<li>按固定间隔(每小时、每天)处理大量数据。</li>
<li><strong>MapReduce</strong>:原始的批处理范式。Map(独立转换每条记录)→ Shuffle(按键分组)→ Reduce(按组聚合)。概念上简单但实现繁琐。</li>
<li><strong>Apache Spark</strong>:现代批处理引擎。内存处理(对于迭代算法比MapReduce快100倍)。支持SQL、DataFrame和ML流水线。大规模特征工程的标准。</li>
<li><strong>示例</strong>:为推荐系统计算用户特征。输入:过去30天的10亿用户活动事件。输出:1亿用户特征向量。每天作为Spark作业运行,输出到特征存储。</li>
</ul>
<h3 id="_7">流处理<a class="headerlink" href="#_7" title="Permanent link">&para;</a></h3>
<ul>
<li>实时处理到达的数据(亚秒级延迟)。</li>
<li><strong>Apache Flink</strong>:领先的流处理引擎。精确一次处理、事件时间处理(按事件发生时间处理,而非到达时间)、窗口化(滚动、滑动、会话窗口)。</li>
<li><strong>Kafka Streams</strong>:内置于Kafka的轻量级流处理。适用于简单转换(过滤、聚合),无需部署单独的集群。</li>
<li><strong>示例</strong>:实时欺诈检测。每笔信用卡交易是一个Kafka事件。Flink作业计算运行统计(交易频率、位置变化)并在100ms内标记异常。</li>
</ul>
<h3 id="lambda">Lambda架构<a class="headerlink" href="#lambda" title="Permanent link">&para;</a></h3>
<ul>
<li>结合批处理和流处理。<strong>批处理层</strong>提供准确、全面的结果(但有延迟)。<strong>速度层</strong>提供近似、实时的结果。<strong>服务层</strong>合并两者。</li>
<li>实际上,许多团队现在使用<strong>Kappa架构</strong>:仅流处理,将流视为事实来源。流是可重播的(Kafka保留事件),因此可以通过重播流来模拟批处理。</li>
</ul>
<h2 id="ml">ML训练基础设施<a class="headerlink" href="#ml" title="Permanent link">&para;</a></h2>
<ul>
<li>训练前沿模型(100B+参数)是一个大规模基础设施问题:数千个GPU运行数月,消耗兆瓦级电力,生成PB级数据,花费数千万美元。基础设施决定了训练成功还是失败。</li>
</ul>
<h3 id="gpu">GPU集群<a class="headerlink" href="#gpu" title="Permanent link">&para;</a></h3>
<ul>
<li>训练集群是由高速网络连接的GPU服务器集合。关键组件:</li>
</ul>
<p><img alt="GPU集群:每个节点有8个通过NVLink连接的GPU,节点通过Infiniband以胖树拓扑连接,从64扩展到16,000+个GPU" src="../../images/gpu_cluster_topology.svg" /></p>
<ul>
<li><strong>GPU服务器(节点)</strong>:每台服务器有4-8个GPU。典型配置:8×H100 GPU、2×AMD EPYC CPU、2 TB RAM、30 TB NVMe SSD。节点内的GPU通过<strong>NVLink</strong>连接(H100上每个GPU 900 GB/s),比PCIe快30倍。</li>
<li><strong>集群规模</strong>:小型训练集群有64-256个GPU(8-32个节点)。前沿模型训练集群有4,000-32,000个GPU500-4000个节点)。Meta的Llama 3使用了16,384个H100 GPU。Google在拥有8,000+个芯片的TPU pod上训练。</li>
<li><strong>粗略估算</strong>:训练70B模型需要约<span class="arithmatex">\(200万。训练400B+前沿模型需要约\)</span>5000万-<span class="arithmatex">\(1亿。集群硬件本身在H100价格下约\)</span>5亿-<span class="arithmatex">\(10亿(\)</span>3万/GPU × 16,000 GPU = $4.8亿)。</li>
</ul>
<h3 id="_8">网络拓扑<a class="headerlink" href="#_8" title="Permanent link">&para;</a></h3>
<ul>
<li>GPU节点之间的网络是最关键的基础设施组件。如果GPU不能足够快地交换梯度,它们就会闲置等待通信完成。</li>
<li><strong>InfiniBand</strong>是GPU集群网络的标准。NVIDIA的<strong>Quantum-2 InfiniBand</strong>提供每个端口400 Gb/s。每个节点通常有8个InfiniBand端口(每个GPU一个),每个节点的总对分带宽为400 GB/s。</li>
<li><strong>RDMA</strong>(远程直接内存访问):InfiniBand支持RDMA,它直接在节点间的GPU内存之间传输数据,无需CPU参与。这将延迟从约100μs(TCP)降低到约1μs,对于高效的梯度全规约(第6章)至关重要。</li>
<li><strong>网络拓扑很重要</strong><strong>胖树</strong>(Clos网络)提供全对分带宽(任何GPU可以与其他任何GPU以全速通信)。更便宜的拓扑(<strong>轨道优化</strong><strong>3D环面</strong>)提供较少的带宽但成本更低。拓扑必须匹配并行策略:<ul>
<li><strong>数据并行</strong>:跨所有GPU的全规约→需要高对分带宽(胖树)。</li>
<li><strong>张量并行</strong>:节点内通信→NVLink处理此需求(不需要网络)。</li>
<li><strong>流水线并行</strong>:相邻流水线阶段之间的通信→只需要特定节点对之间的带宽(轨道优化即可)。</li>
</ul>
</li>
<li><strong>以太网替代方案</strong><strong>RoCE v2</strong>(融合以太网上的RDMA)在标准以太网基础设施上提供RDMA。比InfiniBand便宜,但延迟更高且更易拥塞。Google在某些TPU pod网络中使用RoCE。超以太网联盟正在开发用于AI工作负载的无损以太网。</li>
</ul>
<h3 id="_9">训练存储<a class="headerlink" href="#_9" title="Permanent link">&para;</a></h3>
<ul>
<li>训练需要三个存储层级:<ul>
<li><strong>数据集存储</strong>:训练语料(1-100 TB文本,或PB级多模态数据)。存储在分布式文件系统或对象存储中。必须支持高吞吐量顺序读取(数据加载器以大批量读取数据)。<strong>Lustre</strong><strong>GPFS</strong>是常见的HPC文件系统;云替代方案包括<strong>FSx for Lustre</strong>AWS)和<strong>Filestore</strong>GCP)。</li>
<li><strong>检查点存储</strong>:训练状态(模型权重+优化器状态+调度器状态)定期保存。对于使用Adam优化器的混合精度70B模型:每个检查点约560 GB(70B × 4字节 × 2用于优化器)。每小时保存一次,运行3个月=约2000个检查点=1.1 PB。实际上,只保留最新的N个检查点,旧的会被删除。必须足够快,使检查点不会显著拖慢训练。</li>
<li><strong>日志和指标</strong>:实验跟踪数据(损失曲线、学习率计划、梯度范数)。相对较小但必须实时写入。W&amp;B、MLflow或TensorBoard处理此需求。</li>
</ul>
</li>
<li><strong>存储瓶颈</strong>:一个16,000-GPU集群加载一个训练批次需要持续读取约100 GB/s的数据。如果文件系统无法维持此吞吐量,GPU将闲置等待数据。数据流水线优化(预取、缓存、使用WebDataset或Mosaic Streaming进行格式优化)至关重要。</li>
</ul>
<h3 id="_10">作业调度<a class="headerlink" href="#_10" title="Permanent link">&para;</a></h3>
<ul>
<li>GPU集群服务于多个团队和项目。<strong>作业调度器</strong>将GPU分配给训练作业:</li>
<li><strong>SLURM</strong>:标准的HPC作业调度器。用户提交作业,指定GPU数量、内存和时间限制。SLURM分配资源并管理队列。支持基于优先级的调度、抢占和团队间的公平份额分配。</li>
<li><strong>带GPU调度的Kubernetes</strong>(第18章文件02):云原生方法。K8s GPU设备插件将GPU暴露为可调度资源。<strong>Volcano</strong><strong>Run:ai</strong>增加了ML特定的调度功能:群体调度(一次为一个作业分配所有GPU,而不是逐个分配)、优先级队列和GPU时间共享。</li>
<li><strong>调度挑战</strong><ul>
<li><strong>碎片化</strong>:一个拥有1000个GPU的集群可能有200个空闲,但分布在50个节点上(每个节点4个空闲)。需要128个连续GPU的作业无法运行,即使有足够的总GPU数。<strong>去碎片化</strong>(迁移作业以合并空闲GPU)或<strong>拓扑感知调度</strong>(分配连接良好的GPU)可以解决此问题。</li>
<li><strong>优先级和抢占</strong>:紧急实验应抢占低优先级作业。但抢占一个已运行2天的训练作业会浪费计算资源。调度器必须在优先级和效率之间取得平衡。</li>
<li><strong>公平份额</strong>:团队应在一段时间内获得其分配的计算份额,即使一个团队提交的作业超过其份额。</li>
</ul>
</li>
</ul>
<h3 id="_11">容错<a class="headerlink" href="#_11" title="Permanent link">&para;</a></h3>
<ul>
<li>在数千个GPU运行数月的规模下,硬件故障不是异常——而是常态。16,000-GPU集群的平均故障间隔时间以小时计,而非月。</li>
<li><strong>常见故障</strong>:GPU内存错误(ECC可纠正和不可纠正)、NVLink故障(节点内GPU到GPU通信)、InfiniBand链路故障(节点到节点通信)、节点崩溃(内核恐慌、PSU故障)和存储故障(磁盘或控制器故障)。</li>
<li><strong>检查点</strong>是主要的防御手段。每N步保存完整的训练状态(模型、优化器、数据加载器位置)。故障时:识别故障节点,替换或移除它,从最近的检查点恢复训练。故障的代价是最后一次检查点和故障之间的计算量。</li>
<li><strong>检查点频率权衡</strong>:频繁检查点(每10分钟)在故障时浪费更少的计算,但会减慢训练(保存560 GB需要时间)。不频繁检查点(每2小时)更快,但故障时浪费多达2小时的计算。大多数团队每20-60分钟检查一次。</li>
<li><strong>弹性训练</strong>:现代框架(PyTorch Elastic、DeepSpeed)支持在不重启的情况下调整训练规模。如果500个节点中有2个节点故障,训练继续使用498个节点。故障节点被替换,训练在它们重新上线时自动纳入。</li>
<li><strong>健康监控</strong>:持续监控所有GPU(温度、内存错误、计算吞吐量)、网络链路(丢包、延迟)和存储(吞吐量、错误率)。异常时自动告警。一些集群运行定期GPU健康检查(一个简短的计算测试)以主动识别在故障前性能下降的硬件。</li>
<li><strong>大规模场景</strong>:训练Meta的Llama 316,384个H10054天)经历了约466次作业中断。有效训练时间仅为挂钟时间的约90%——10%损失于故障和恢复。实现90%(而非50%或70%)的基础设施是区分能训练前沿模型的组织和不能训练的组织的关键。</li>
</ul>
<h3 id="_12">成本和效率<a class="headerlink" href="#_12" title="Permanent link">&para;</a></h3>
<ul>
<li>训练基础设施成本由GPU小时主导:</li>
</ul>
<table>
<thead>
<tr>
<th>组件</th>
<th>占总成本百分比</th>
</tr>
</thead>
<tbody>
<tr>
<td>GPU计算</td>
<td>70-80%</td>
</tr>
<tr>
<td>网络(InfiniBand</td>
<td>10-15%</td>
</tr>
<tr>
<td>存储</td>
<td>5-10%</td>
</tr>
<tr>
<td>冷却和电源</td>
<td>5-10%</td>
</tr>
</tbody>
</table>
<ul>
<li><strong>GPU利用率</strong>(模型FLOPs利用率,MFU)衡量GPU理论峰值性能中有多少被用于实际有用计算。H100峰值为989 TFLOPSFP8)。达到40-50% MFU算良好;50-60%算优秀。差距来自:通信开销(全规约、流水线气泡)、内存带宽限制以及检查点和数据加载期间的闲置时间。</li>
<li><strong>提高MFU</strong>:重叠计算和通信(第6章)、使用高效注意力(Flash Attention,第16章)、优化数据加载(防止GPU饥饿)、减少检查点开销(异步检查点,先检查到快速NVMe,然后后台复制到持久存储)。</li>
<li><strong>自建vs租用</strong>:在小规模(&lt;256个GPU)下,云更便宜(无前期成本,按小时付费)。在大规模(&gt;1000个GPU,持续使用6+个月)下,拥有硬件更便宜(3年内TCO低约2-3倍)。大多数AI公司混合使用:自有集群用于持续训练,云用于突发容量和实验。</li>
</ul>
<h2 id="_13">数据库扩展<a class="headerlink" href="#_13" title="Permanent link">&para;</a></h2>
<ul>
<li><strong>只读副本</strong>:将读取查询路由到主数据库的副本。主库处理写入,副本处理读取。由于大多数工作负载是读取密集型的(95%+读取),这使读取吞吐量随副本数量线性扩展。</li>
<li><strong>分区</strong>(分片,来自文件01):将数据分割到多个数据库。每个分区是独立的,支持并行读取和写入。挑战是跨分区查询(连接来自不同分片的数据)。</li>
<li><strong>连接池</strong>:数据库有有限的连接容量。连接池(PostgreSQL的PgBouncer)在请求间复用连接,防止当数百个服务实例各自尝试连接时出现连接耗尽。</li>
</ul>
<h2 id="_14">搜索和向量系统<a class="headerlink" href="#_14" title="Permanent link">&para;</a></h2>
<h3 id="_15">文本搜索<a class="headerlink" href="#_15" title="Permanent link">&para;</a></h3>
<ul>
<li><strong>倒排索引</strong>:文本搜索的基础。对每个单词,存储包含该单词的文档列表。查询对每个查询词的列表求交集。<strong>Elasticsearch</strong>是标准:分布式、实时、支持全文搜索、聚合和地理空间查询。</li>
<li><strong>BM25</strong>:标准文本检索评分函数。根据词频、逆文档频率和文档长度归一化对文档评分。简单而有效——对于关键词密集型查询仍然能与神经方法竞争。</li>
</ul>
<h3 id="_16">向量搜索<a class="headerlink" href="#_16" title="Permanent link">&para;</a></h3>
<ul>
<li><strong>向量数据库</strong>存储嵌入(高维向量)并支持快速<strong>近似最近邻(ANN</strong>搜索。给定一个查询嵌入,找到<span class="arithmatex">\(k\)</span>个最相似的存储嵌入。</li>
<li><strong>FAISS</strong>Facebook AI相似性搜索):一个用于ANN搜索的库(而非数据库)。支持多种索引类型:<ul>
<li><strong>Flat</strong>:精确搜索,<span class="arithmatex">\(O(n)\)</span>。用于小数据集或作为基准。</li>
<li><strong>IVF</strong>(倒排文件):将向量分区到簇中,仅搜索最近的簇。每个查询<span class="arithmatex">\(O(n/k)\)</span></li>
<li><strong>HNSW</strong>(分层可导航小世界):基于图。构建分层图,从粗到细导航。极快且准确,是大多数应用的默认选择。</li>
<li><strong>乘积量化(PQ</strong>:将向量压缩为紧凑编码以实现内存高效搜索。用准确度换取内存。</li>
</ul>
</li>
<li><strong>托管向量数据库</strong>Pinecone、Weaviate、Milvus、Qdrant。它们处理FAISS不具备的扩展、复制和实时更新。</li>
<li><strong>对于RAG</strong>(检索增强生成):用户查询→用文本编码器嵌入→搜索向量数据库以找到相关文档→将检索到的文档前置到LLM提示中。检索质量直接决定LLM响应的质量。</li>
</ul>
<h2 id="_17">可观测性<a class="headerlink" href="#_17" title="Permanent link">&para;</a></h2>
<ul>
<li><strong>可观测性</strong>是从系统外部输出理解系统内部状态的能力。三大支柱:</li>
</ul>
<h3 id="_18">日志<a class="headerlink" href="#_18" title="Permanent link">&para;</a></h3>
<ul>
<li><strong>结构化日志</strong>(JSON)是可搜索和可解析的。非结构化日志("ERROR: something failed")则不是。始终记录:时间戳、服务名、请求ID(用于跨服务追踪)、严重级别和相关上下文。</li>
<li><strong>ELK栈</strong>Elasticsearch、Logstash、Kibana):标准日志流水线。Logstash收集和转换日志,Elasticsearch建立索引,Kibana可视化和搜索。</li>
</ul>
<h3 id="_19">指标<a class="headerlink" href="#_19" title="Permanent link">&para;</a></h3>
<ul>
<li><strong>指标</strong>是随时间变化的数值测量:请求率、错误率、延迟百分位数、GPU利用率、队列深度。<strong>Prometheus</strong>从服务抓取指标;<strong>Grafana</strong>在仪表盘中可视化并设置告警。</li>
<li><strong>服务的RED方法</strong><strong>R</strong>ate(请求/秒)、<strong>E</strong>rrors(错误率)、<strong>D</strong>uration(延迟)。为每个服务监控这些指标。</li>
<li><strong>资源的USE方法</strong><strong>U</strong>tilisation(使用百分比)、<strong>S</strong>aturation(队列深度)、<strong>E</strong>rrors。为每个资源(CPU、GPU、内存、磁盘、网络)监控这些指标。</li>
</ul>
<h3 id="_20">追踪<a class="headerlink" href="#_20" title="Permanent link">&para;</a></h3>
<ul>
<li><strong>分布式追踪</strong>跟踪单个请求跨多个服务的路径。用户请求命中API网关→特征服务→模型服务→后处理。一个<strong>追踪</strong>记录了每次跳转的时序,显示延迟花在哪里。</li>
<li><strong>OpenTelemetry</strong>:追踪、指标和日志的开放标准。一次代码埋点,导出到任何后端(Jaeger、Zipkin、Datadog)。</li>
</ul>
<h2 id="_21">可靠性<a class="headerlink" href="#_21" title="Permanent link">&para;</a></h2>
<ul>
<li><strong>SLO</strong>(服务等级目标):目标可靠性。"99.9%的请求在&lt;200ms内完成。"这给出了具体的错误预算:0.1%的请求(每月约43分钟)可以慢或失败。</li>
<li><strong>SLI</strong>(服务等级指标):测量指标。"过去5分钟的第99百分位延迟。"</li>
<li><strong>SLA</strong>(服务等级协议):有后果的合同承诺。"如果可用性低于99.95%,客户获得信用额度。"</li>
<li><strong>错误预算</strong>:如果你的SLO是99.9%,而你达到了99.99%,你就有进行风险变更(部署新模型、迁移数据库)的预算。如果你只有99.85%,冻结所有变更,专注于可靠性。错误预算将可靠性从抽象目标转化为可衡量的资源。</li>
<li><strong>混沌工程</strong>:故意注入故障(杀死服务器、添加网络延迟、破坏数据)以测试系统是否能正确处理。Netflix的Chaos Monkey随机终止生产实例。如果系统保持运行,它就是有弹性的。如果崩溃了,你在用户之前发现了一个bug。</li>
</ul>
<h2 id="cicd">CI/CD<a class="headerlink" href="#cicd" title="Permanent link">&para;</a></h2>
<ul>
<li><strong>持续集成</strong>:自动构建和测试每次代码变更。每次推送触发:lint、类型检查、单元测试、集成测试。任何失败,变更被拒绝。这能在bug到达生产之前捕获它们。</li>
<li><strong>持续部署</strong>:自动部署通过CI的变更。部署策略:<ul>
<li><strong>蓝绿部署</strong>:运行两个相同的环境(蓝色=当前,绿色=新版本)。将流量从蓝色瞬间切换到绿色。如果绿色失败,切换回蓝色(即时回滚)。</li>
<li><strong>金丝雀部署</strong>:将一小部分流量(1-5%)路由到新版本。监控错误。如果指标良好,逐步增加流量。这限制了不良部署的影响范围。</li>
<li><strong>功能标志</strong>:部署新代码但隐藏在标志后面。为部分用户启用该标志(内部测试人员,然后是beta用户,然后是所有用户)。将部署(代码上线)与发布(用户看到功能)解耦。</li>
</ul>
</li>
<li>对于ML:CI/CD包括模型特定的步骤。模型变更触发:单元测试(形状测试、梯度检查)、在保留集上评估(准确率不得下降)、影子部署(新旧模型并行运行,比较输出)和逐步推出(金丝雀从1%→100%)。</li>
</ul>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
<button type="button" class="md-top md-icon" data-md-component="top" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8z"/></svg>
回到页面顶部
</button>
</main>
<footer class="md-footer">
<nav class="md-footer__inner md-grid" aria-label="页脚" >
<a href="../02.%20cloud%20computing/" class="md-footer__link md-footer__link--prev" aria-label="上一页: 云计算">
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</div>
<div class="md-footer__title">
<span class="md-footer__direction">
上一页
</span>
<div class="md-ellipsis">
云计算
</div>
</div>
</a>
<a href="../04.%20ML%20systems%20design/" class="md-footer__link md-footer__link--next" aria-label="下一页: ML 系统设计">
<div class="md-footer__title">
<span class="md-footer__direction">
下一页
</span>
<div class="md-ellipsis">
ML 系统设计
</div>
</div>
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11z"/></svg>
</div>
</a>
</nav>
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
<div class="md-social">
<a href="https://github.com/flykhan/maths-cs-ai-compendium-zh" target="_blank" rel="noopener" title="github.com" class="md-social__link">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M173.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6m-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3m44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9M252.8 8C114.1 8 8 113.3 8 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C436.2 457.8 504 362.9 504 252 504 113.3 391.5 8 252.8 8M105.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1m-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7m32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1m-11.4-14.7c-1.6 1-1.6 3.6 0 5.9s4.3 3.3 5.6 2.3c1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2"/></svg>
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": ["navigation.tabs", "navigation.sections", "navigation.expand", "navigation.top", "navigation.footer", "search.suggest", "search.highlight", "content.code.copy", "toc.follow"], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "\u5df2\u590d\u5236", "clipboard.copy": "\u590d\u5236", "search.result.more.one": "\u5728\u8be5\u9875\u4e0a\u8fd8\u6709 1 \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.more.other": "\u5728\u8be5\u9875\u4e0a\u8fd8\u6709 # \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.none": "\u6ca1\u6709\u627e\u5230\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.one": "\u627e\u5230 1 \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.other": "# \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.placeholder": "\u952e\u5165\u4ee5\u5f00\u59cb\u641c\u7d22", "search.result.term.missing": "\u7f3a\u5c11", "select.version": "\u9009\u62e9\u5f53\u524d\u7248\u672c"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="../../javascripts/mathjax.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>