Files
maths-cs-ai-compendium-zh/chapter 17: AI inference/02. efficient architectures/index.html
T

5641 lines
123 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!doctype html>
<html lang="zh" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="description" content="一本开源的直觉优先教科书,从零开始覆盖数学、计算机科学和人工智能(中文翻译版)。">
<meta name="author" content="Henry Ndubuaku (flykhan 译)">
<link rel="canonical" href="https://flykhan.github.io/maths-cs-ai-compendium-zh/chapter%2017%3A%20AI%20inference/02.%20efficient%20architectures/">
<link rel="prev" href="../01.%20quantisation/">
<link rel="next" href="../03.%20serving%20and%20batching/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>高效架构 - 数学、计算机科学与 AI 百科全书</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="stylesheet" href="../../assets/stylesheets/palette.ab4e12ef.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="slate" data-md-color-accent="indigo">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#_1" class="md-skip">
跳转至
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="页眉">
<a href="../.." title="数学、计算机科学与 AI 百科全书" class="md-header__button md-logo" aria-label="数学、计算机科学与 AI 百科全书" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
数学、计算机科学与 AI 百科全书
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
高效架构
</span>
</div>
</div>
</div>
<form class="md-header__option" data-md-component="palette">
<input class="md-option" data-md-color-media="" data-md-color-scheme="default" data-md-color-primary="slate" data-md-color-accent="indigo" aria-label="切换到深色模式" type="radio" name="__palette" id="__palette_0">
<label class="md-header__button md-icon" title="切换到深色模式" for="__palette_1" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
<input class="md-option" data-md-color-media="" data-md-color-scheme="slate" data-md-color-primary="slate" data-md-color-accent="indigo" aria-label="切换到浅色模式" type="radio" name="__palette" id="__palette_1">
<label class="md-header__button md-icon" title="切换到浅色模式" for="__palette_0" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
</form>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
</label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" aria-label="搜索" placeholder="搜索" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</label>
<nav class="md-search__options" aria-label="查找">
<button type="reset" class="md-search__icon md-icon" title="清空当前内容" aria-label="清空当前内容" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
</button>
</nav>
<div class="md-search__suggest" data-md-component="search-suggest"></div>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
正在初始化搜索引擎
</div>
<ol class="md-search-result__list" role="presentation"></ol>
</div>
</div>
</div>
</div>
</div>
<div class="md-header__source">
<a href="https://github.com/flykhan/maths-cs-ai-compendium-zh" title="前往仓库" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
flykhan/maths-cs-ai-compendium-zh
</div>
</a>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<nav class="md-tabs" aria-label="标签" data-md-component="tabs">
<div class="md-grid">
<ul class="md-tabs__list">
<li class="md-tabs__item">
<a href="../.." class="md-tabs__link">
首页
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2001%3A%20vectors/01.%20vector%20spaces/" class="md-tabs__link">
向量
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2002%3A%20matrices/01.%20matrix%20properties/" class="md-tabs__link">
矩阵
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2003%3A%20calculus/01.%20differential%20calculus/" class="md-tabs__link">
微积分
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2004%3A%20statistics/01.%20fundamentals/" class="md-tabs__link">
统计学
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2005%3A%20probability/01.%20counting/" class="md-tabs__link">
概率论
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2006%3A%20machine%20learning/01.%20classical%20machine%20learning/" class="md-tabs__link">
机器学习
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2007%3A%20computational%20linguistics/01.%20linguistic%20foundations/" class="md-tabs__link">
计算语言学
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2008%3A%20computer%20vision/01.%20image%20fundamentals/" class="md-tabs__link">
计算机视觉
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2009%3A%20audio%20and%20speech/01.%20digital%20signal%20processing/" class="md-tabs__link">
音频与语音
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2010%3A%20multimodal%20learning/01.%20multimodal%20representations/" class="md-tabs__link">
多模态学习
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2011%3A%20autonomous%20systems/01.%20perception/" class="md-tabs__link">
自主系统
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2012%3A%20graph%20neural%20networks/01.%20geometric%20deep%20learning/" class="md-tabs__link">
图神经网络
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2013%3A%20computing%20and%20OS/01.%20discrete%20maths/" class="md-tabs__link">
计算机与操作系统
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/00.%20foundations/" class="md-tabs__link">
数据结构与算法
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2015%3A%20production%20software%20engineering/01.%20linux%20and%20CMD/" class="md-tabs__link">
生产级软件工程
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/00.%20why%20C%2B%2B%20and%20how%20ML%20frameworks%20work/" class="md-tabs__link">
SIMD 与 GPU 编程
</a>
</li>
<li class="md-tabs__item md-tabs__item--active">
<a href="../01.%20quantisation/" class="md-tabs__link">
AI 推理
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2018%3A%20ML%20systems%20design/01.%20systems%20design%20fundamentals/" class="md-tabs__link">
ML 系统设计
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2019%3A%20applied%20AI/01.%20AI%20for%20finance/" class="md-tabs__link">
应用 AI
</a>
</li>
<li class="md-tabs__item">
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/01.%20quantum%20machine%20learning/" class="md-tabs__link">
前沿 AI
</a>
</li>
</ul>
</div>
</nav>
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="导航栏" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="数学、计算机科学与 AI 百科全书" class="md-nav__button md-logo" aria-label="数学、计算机科学与 AI 百科全书" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
数学、计算机科学与 AI 百科全书
</label>
<div class="md-nav__source">
<a href="https://github.com/flykhan/maths-cs-ai-compendium-zh" title="前往仓库" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
flykhan/maths-cs-ai-compendium-zh
</div>
</a>
</div>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
首页
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
向量
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
向量
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2001%3A%20vectors/01.%20vector%20spaces/" class="md-nav__link">
<span class="md-ellipsis">
向量空间
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2001%3A%20vectors/02.%20vector%20properties/" class="md-nav__link">
<span class="md-ellipsis">
向量性质
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2001%3A%20vectors/03.%20norms%20and%20metrics/" class="md-nav__link">
<span class="md-ellipsis">
范数与度量
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2001%3A%20vectors/04.%20products/" class="md-nav__link">
<span class="md-ellipsis">
向量积
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2001%3A%20vectors/05.%20basis%20and%20duality/" class="md-nav__link">
<span class="md-ellipsis">
基与对偶性
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
矩阵
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
矩阵
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2002%3A%20matrices/01.%20matrix%20properties/" class="md-nav__link">
<span class="md-ellipsis">
矩阵性质
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2002%3A%20matrices/02.%20matrix%20types/" class="md-nav__link">
<span class="md-ellipsis">
矩阵类型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2002%3A%20matrices/03.%20operations/" class="md-nav__link">
<span class="md-ellipsis">
矩阵运算
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2002%3A%20matrices/04.%20linear%20transformations/" class="md-nav__link">
<span class="md-ellipsis">
线性变换
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2002%3A%20matrices/05.%20decompositions/" class="md-nav__link">
<span class="md-ellipsis">
矩阵分解
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
微积分
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
微积分
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2003%3A%20calculus/01.%20differential%20calculus/" class="md-nav__link">
<span class="md-ellipsis">
微分
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2003%3A%20calculus/02.%20integral%20calculus/" class="md-nav__link">
<span class="md-ellipsis">
积分
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2003%3A%20calculus/03.%20multivariate%20calculus/" class="md-nav__link">
<span class="md-ellipsis">
多元微积分
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2003%3A%20calculus/04.%20function%20approximation/" class="md-nav__link">
<span class="md-ellipsis">
函数逼近
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2003%3A%20calculus/05.%20optimisation/" class="md-nav__link">
<span class="md-ellipsis">
优化
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_5" >
<label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">
<span class="md-ellipsis">
统计学
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5">
<span class="md-nav__icon md-icon"></span>
统计学
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2004%3A%20statistics/01.%20fundamentals/" class="md-nav__link">
<span class="md-ellipsis">
基础
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2004%3A%20statistics/02.%20measures/" class="md-nav__link">
<span class="md-ellipsis">
统计量
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2004%3A%20statistics/03.%20sampling/" class="md-nav__link">
<span class="md-ellipsis">
抽样
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2004%3A%20statistics/04.%20hypothesis%20testing/" class="md-nav__link">
<span class="md-ellipsis">
假设检验
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2004%3A%20statistics/05.%20inference/" class="md-nav__link">
<span class="md-ellipsis">
推断
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_6" >
<label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">
<span class="md-ellipsis">
概率论
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_6">
<span class="md-nav__icon md-icon"></span>
概率论
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2005%3A%20probability/01.%20counting/" class="md-nav__link">
<span class="md-ellipsis">
计数
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2005%3A%20probability/02.%20probability%20concepts/" class="md-nav__link">
<span class="md-ellipsis">
概率概念
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2005%3A%20probability/03.%20distributions/" class="md-nav__link">
<span class="md-ellipsis">
分布
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2005%3A%20probability/04.%20bayesian/" class="md-nav__link">
<span class="md-ellipsis">
贝叶斯
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2005%3A%20probability/05.%20information%20theory/" class="md-nav__link">
<span class="md-ellipsis">
信息论
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_7" >
<label class="md-nav__link" for="__nav_7" id="__nav_7_label" tabindex="0">
<span class="md-ellipsis">
机器学习
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_7_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_7">
<span class="md-nav__icon md-icon"></span>
机器学习
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2006%3A%20machine%20learning/01.%20classical%20machine%20learning/" class="md-nav__link">
<span class="md-ellipsis">
经典机器学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2006%3A%20machine%20learning/02.%20gradient%20machine%20learning/" class="md-nav__link">
<span class="md-ellipsis">
梯度机器学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2006%3A%20machine%20learning/03.%20deep%20learning/" class="md-nav__link">
<span class="md-ellipsis">
深度学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2006%3A%20machine%20learning/04.%20reinforcement%20learning/" class="md-nav__link">
<span class="md-ellipsis">
强化学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2006%3A%20machine%20learning/05.%20distributed%20deep%20learning/" class="md-nav__link">
<span class="md-ellipsis">
分布式深度学习
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_8" >
<label class="md-nav__link" for="__nav_8" id="__nav_8_label" tabindex="0">
<span class="md-ellipsis">
计算语言学
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_8_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_8">
<span class="md-nav__icon md-icon"></span>
计算语言学
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2007%3A%20computational%20linguistics/01.%20linguistic%20foundations/" class="md-nav__link">
<span class="md-ellipsis">
语言学基础
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2007%3A%20computational%20linguistics/02.%20text%20processing%20and%20classic%20NLP/" class="md-nav__link">
<span class="md-ellipsis">
文本处理与经典 NLP
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2007%3A%20computational%20linguistics/03.%20embeddings%20and%20sequence%20models/" class="md-nav__link">
<span class="md-ellipsis">
嵌入与序列模型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2007%3A%20computational%20linguistics/04.%20transformers%20and%20language%20models/" class="md-nav__link">
<span class="md-ellipsis">
Transformer 与语言模型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2007%3A%20computational%20linguistics/05.%20advanced%20text%20generation/" class="md-nav__link">
<span class="md-ellipsis">
高级文本生成
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_9" >
<label class="md-nav__link" for="__nav_9" id="__nav_9_label" tabindex="0">
<span class="md-ellipsis">
计算机视觉
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_9_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_9">
<span class="md-nav__icon md-icon"></span>
计算机视觉
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2008%3A%20computer%20vision/01.%20image%20fundamentals/" class="md-nav__link">
<span class="md-ellipsis">
图像基础
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2008%3A%20computer%20vision/02.%20convolutional%20networks/" class="md-nav__link">
<span class="md-ellipsis">
卷积网络
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2008%3A%20computer%20vision/03.%20object%20detection%20and%20segmentation/" class="md-nav__link">
<span class="md-ellipsis">
目标检测与分割
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2008%3A%20computer%20vision/04.%20vision%20transformers%20and%20generation/" class="md-nav__link">
<span class="md-ellipsis">
ViT 与生成模型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2008%3A%20computer%20vision/05.%20video%20and%203D%20vision/" class="md-nav__link">
<span class="md-ellipsis">
视频与 3D 视觉
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_10" >
<label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
<span class="md-ellipsis">
音频与语音
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_10">
<span class="md-nav__icon md-icon"></span>
音频与语音
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2009%3A%20audio%20and%20speech/01.%20digital%20signal%20processing/" class="md-nav__link">
<span class="md-ellipsis">
数字信号处理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2009%3A%20audio%20and%20speech/02.%20automatic%20speech%20recognition/" class="md-nav__link">
<span class="md-ellipsis">
自动语音识别
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2009%3A%20audio%20and%20speech/03.%20text%20to%20speech%20and%20voice/" class="md-nav__link">
<span class="md-ellipsis">
语音合成
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2009%3A%20audio%20and%20speech/04.%20speaker%20and%20audio%20analysis/" class="md-nav__link">
<span class="md-ellipsis">
说话人与音频分析
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2009%3A%20audio%20and%20speech/05.%20source%20separation%20and%20noise/" class="md-nav__link">
<span class="md-ellipsis">
源分离与降噪
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_11" >
<label class="md-nav__link" for="__nav_11" id="__nav_11_label" tabindex="0">
<span class="md-ellipsis">
多模态学习
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_11_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_11">
<span class="md-nav__icon md-icon"></span>
多模态学习
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2010%3A%20multimodal%20learning/01.%20multimodal%20representations/" class="md-nav__link">
<span class="md-ellipsis">
多模态表征
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2010%3A%20multimodal%20learning/02.%20vision%20language%20models/" class="md-nav__link">
<span class="md-ellipsis">
视觉语言模型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2010%3A%20multimodal%20learning/03.%20image%20and%20video%20tokenisation/" class="md-nav__link">
<span class="md-ellipsis">
图像与视频 Token 化
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2010%3A%20multimodal%20learning/04.%20cross-modal%20generation/" class="md-nav__link">
<span class="md-ellipsis">
跨模态生成
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2010%3A%20multimodal%20learning/05.%20unified%20multimodal%20architectures/" class="md-nav__link">
<span class="md-ellipsis">
统一多模态架构
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_12" >
<label class="md-nav__link" for="__nav_12" id="__nav_12_label" tabindex="0">
<span class="md-ellipsis">
自主系统
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_12_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_12">
<span class="md-nav__icon md-icon"></span>
自主系统
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2011%3A%20autonomous%20systems/01.%20perception/" class="md-nav__link">
<span class="md-ellipsis">
感知
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2011%3A%20autonomous%20systems/02.%20robot%20learning/" class="md-nav__link">
<span class="md-ellipsis">
机器人学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2011%3A%20autonomous%20systems/03.%20vision-language-action%20models/" class="md-nav__link">
<span class="md-ellipsis">
视觉-语言-动作模型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2011%3A%20autonomous%20systems/04.%20self-driving/" class="md-nav__link">
<span class="md-ellipsis">
自动驾驶
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2011%3A%20autonomous%20systems/05.%20space%20and%20extreme%20robotics/" class="md-nav__link">
<span class="md-ellipsis">
太空与极端机器人
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_13" >
<label class="md-nav__link" for="__nav_13" id="__nav_13_label" tabindex="0">
<span class="md-ellipsis">
图神经网络
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_13_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_13">
<span class="md-nav__icon md-icon"></span>
图神经网络
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2012%3A%20graph%20neural%20networks/01.%20geometric%20deep%20learning/" class="md-nav__link">
<span class="md-ellipsis">
几何深度学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2012%3A%20graph%20neural%20networks/02.%20graph%20theory/" class="md-nav__link">
<span class="md-ellipsis">
图论
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2012%3A%20graph%20neural%20networks/03.%20graph%20neural%20networks/" class="md-nav__link">
<span class="md-ellipsis">
图神经网络
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2012%3A%20graph%20neural%20networks/04.%20graph%20attention%20networks/" class="md-nav__link">
<span class="md-ellipsis">
图注意力网络
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2012%3A%20graph%20neural%20networks/05.%203d%20graph%20networks/" class="md-nav__link">
<span class="md-ellipsis">
3D 图网络
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_14" >
<label class="md-nav__link" for="__nav_14" id="__nav_14_label" tabindex="0">
<span class="md-ellipsis">
计算机与操作系统
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_14_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_14">
<span class="md-nav__icon md-icon"></span>
计算机与操作系统
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2013%3A%20computing%20and%20OS/01.%20discrete%20maths/" class="md-nav__link">
<span class="md-ellipsis">
离散数学
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2013%3A%20computing%20and%20OS/02.%20computer%20architecture/" class="md-nav__link">
<span class="md-ellipsis">
计算机体系结构
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2013%3A%20computing%20and%20OS/03.%20operating%20systems/" class="md-nav__link">
<span class="md-ellipsis">
操作系统
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2013%3A%20computing%20and%20OS/04.%20concurrency%20and%20parallelism/" class="md-nav__link">
<span class="md-ellipsis">
并发与并行
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2013%3A%20computing%20and%20OS/05.%20programming%20languages/" class="md-nav__link">
<span class="md-ellipsis">
编程语言
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_15" >
<label class="md-nav__link" for="__nav_15" id="__nav_15_label" tabindex="0">
<span class="md-ellipsis">
数据结构与算法
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_15_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_15">
<span class="md-nav__icon md-icon"></span>
数据结构与算法
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/00.%20foundations/" class="md-nav__link">
<span class="md-ellipsis">
基础
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/01.%20arrays%20and%20hashing/" class="md-nav__link">
<span class="md-ellipsis">
数组与哈希
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/02.%20linked%20lists%2C%20stacks%2C%20and%20queues/" class="md-nav__link">
<span class="md-ellipsis">
链表、栈与队列
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/03.%20trees/" class="md-nav__link">
<span class="md-ellipsis">
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/04.%20graphs/" class="md-nav__link">
<span class="md-ellipsis">
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/05.%20sorting%20and%20search/" class="md-nav__link">
<span class="md-ellipsis">
排序与搜索
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_16" >
<label class="md-nav__link" for="__nav_16" id="__nav_16_label" tabindex="0">
<span class="md-ellipsis">
生产级软件工程
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_16_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_16">
<span class="md-nav__icon md-icon"></span>
生产级软件工程
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2015%3A%20production%20software%20engineering/01.%20linux%20and%20CMD/" class="md-nav__link">
<span class="md-ellipsis">
Linux 与命令行
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2015%3A%20production%20software%20engineering/02.%20git%20and%20repository%20management/" class="md-nav__link">
<span class="md-ellipsis">
Git 与仓库管理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2015%3A%20production%20software%20engineering/03.%20codebase%20design/" class="md-nav__link">
<span class="md-ellipsis">
代码设计
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2015%3A%20production%20software%20engineering/04.%20testing%20and%20quality%20assurance/" class="md-nav__link">
<span class="md-ellipsis">
测试与质量保障
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2015%3A%20production%20software%20engineering/05.%20deployment%20and%20devops/" class="md-nav__link">
<span class="md-ellipsis">
部署与 DevOps
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_17" >
<label class="md-nav__link" for="__nav_17" id="__nav_17_label" tabindex="0">
<span class="md-ellipsis">
SIMD 与 GPU 编程
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_17_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_17">
<span class="md-nav__icon md-icon"></span>
SIMD 与 GPU 编程
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/00.%20why%20C%2B%2B%20and%20how%20ML%20frameworks%20work/" class="md-nav__link">
<span class="md-ellipsis">
为什么是 C++ 及 ML 框架原理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/01.%20hardware%20fundamentals/" class="md-nav__link">
<span class="md-ellipsis">
硬件基础
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/02.%20ARM%20and%20NEON/" class="md-nav__link">
<span class="md-ellipsis">
ARM 与 NEON
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/03.%20x86%20and%20AVX/" class="md-nav__link">
<span class="md-ellipsis">
x86 与 AVX
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/04.%20GPU%20architecture%20and%20CUDA/" class="md-nav__link">
<span class="md-ellipsis">
GPU 架构与 CUDA
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/05.%20triton%2C%20TPUs%20and%20pallax/" class="md-nav__link">
<span class="md-ellipsis">
Triton、TPU 与 Pallas
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/06.%20RISC-V%20and%20embedded%20systems/" class="md-nav__link">
<span class="md-ellipsis">
RISC-V 与嵌入式系统
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/07.%20vulkan%20compute%20and%20cross-platform%20GPU/" class="md-nav__link">
<span class="md-ellipsis">
Vulkan Compute 与跨平台 GPU
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_18" checked>
<label class="md-nav__link" for="__nav_18" id="__nav_18_label" tabindex="">
<span class="md-ellipsis">
AI 推理
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_18_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_18">
<span class="md-nav__icon md-icon"></span>
AI 推理
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../01.%20quantisation/" class="md-nav__link">
<span class="md-ellipsis">
量化
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
高效架构
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
高效架构
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="目录">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
目录
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#streamingllm" class="md-nav__link">
<span class="md-ellipsis">
StreamingLLM:无限长度生成
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_2" class="md-nav__link">
<span class="md-ellipsis">
稀疏注意力
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_3" class="md-nav__link">
<span class="md-ellipsis">
线性注意力和状态空间模型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_4" class="md-nav__link">
<span class="md-ellipsis">
多查询和分组查询注意力
</span>
</a>
<nav class="md-nav" aria-label="多查询和分组查询注意力">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#mla" class="md-nav__link">
<span class="md-ellipsis">
多头潜在注意力(MLA)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#flash-attention" class="md-nav__link">
<span class="md-ellipsis">
Flash Attention
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#ring-attention" class="md-nav__link">
<span class="md-ellipsis">
Ring Attention
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_5" class="md-nav__link">
<span class="md-ellipsis">
推理时的混合专家
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_6" class="md-nav__link">
<span class="md-ellipsis">
知识蒸馏
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_7" class="md-nav__link">
<span class="md-ellipsis">
剪枝
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#nas" class="md-nav__link">
<span class="md-ellipsis">
神经架构搜索(NAS)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#colabnotebook" class="md-nav__link">
<span class="md-ellipsis">
编程任务(使用CoLab或notebook
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../03.%20serving%20and%20batching/" class="md-nav__link">
<span class="md-ellipsis">
服务与批处理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../04.%20edge%20inference/" class="md-nav__link">
<span class="md-ellipsis">
边缘推理
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../05.%20scaling%20and%20deployment/" class="md-nav__link">
<span class="md-ellipsis">
扩缩与部署
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_19" >
<label class="md-nav__link" for="__nav_19" id="__nav_19_label" tabindex="0">
<span class="md-ellipsis">
ML 系统设计
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_19_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_19">
<span class="md-nav__icon md-icon"></span>
ML 系统设计
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2018%3A%20ML%20systems%20design/01.%20systems%20design%20fundamentals/" class="md-nav__link">
<span class="md-ellipsis">
系统设计基础
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2018%3A%20ML%20systems%20design/02.%20cloud%20computing/" class="md-nav__link">
<span class="md-ellipsis">
云计算
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2018%3A%20ML%20systems%20design/03.%20large%20scale%20infrastructure/" class="md-nav__link">
<span class="md-ellipsis">
大规模基础设施
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2018%3A%20ML%20systems%20design/04.%20ML%20systems%20design/" class="md-nav__link">
<span class="md-ellipsis">
ML 系统设计
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2018%3A%20ML%20systems%20design/05.%20ML%20design%20examples/" class="md-nav__link">
<span class="md-ellipsis">
ML 设计案例
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_20" >
<label class="md-nav__link" for="__nav_20" id="__nav_20_label" tabindex="0">
<span class="md-ellipsis">
应用 AI
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_20_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_20">
<span class="md-nav__icon md-icon"></span>
应用 AI
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2019%3A%20applied%20AI/01.%20AI%20for%20finance/" class="md-nav__link">
<span class="md-ellipsis">
AI 金融
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2019%3A%20applied%20AI/02.%20protein%20design/" class="md-nav__link">
<span class="md-ellipsis">
蛋白质设计
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2019%3A%20applied%20AI/03.%20drug%20discovery/" class="md-nav__link">
<span class="md-ellipsis">
药物发现
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2019%3A%20applied%20AI/04.%20agentic%20systems/" class="md-nav__link">
<span class="md-ellipsis">
智能体系统
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2019%3A%20applied%20AI/05.%20healthcare/" class="md-nav__link">
<span class="md-ellipsis">
医疗健康
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_21" >
<label class="md-nav__link" for="__nav_21" id="__nav_21_label" tabindex="0">
<span class="md-ellipsis">
前沿 AI
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_21_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_21">
<span class="md-nav__icon md-icon"></span>
前沿 AI
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/01.%20quantum%20machine%20learning/" class="md-nav__link">
<span class="md-ellipsis">
量子机器学习
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/02.%20neuromorphic%20computing/" class="md-nav__link">
<span class="md-ellipsis">
神经形态计算
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/03.%20datacentres%20in%20space/" class="md-nav__link">
<span class="md-ellipsis">
太空数据中心
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/04.%20decentralised%20AI/" class="md-nav__link">
<span class="md-ellipsis">
去中心化 AI
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/05.%20brain%20machine%20interfaces/" class="md-nav__link">
<span class="md-ellipsis">
脑机接口
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="目录">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
目录
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#streamingllm" class="md-nav__link">
<span class="md-ellipsis">
StreamingLLM:无限长度生成
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_2" class="md-nav__link">
<span class="md-ellipsis">
稀疏注意力
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_3" class="md-nav__link">
<span class="md-ellipsis">
线性注意力和状态空间模型
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_4" class="md-nav__link">
<span class="md-ellipsis">
多查询和分组查询注意力
</span>
</a>
<nav class="md-nav" aria-label="多查询和分组查询注意力">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#mla" class="md-nav__link">
<span class="md-ellipsis">
多头潜在注意力(MLA)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#flash-attention" class="md-nav__link">
<span class="md-ellipsis">
Flash Attention
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#ring-attention" class="md-nav__link">
<span class="md-ellipsis">
Ring Attention
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_5" class="md-nav__link">
<span class="md-ellipsis">
推理时的混合专家
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_6" class="md-nav__link">
<span class="md-ellipsis">
知识蒸馏
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_7" class="md-nav__link">
<span class="md-ellipsis">
剪枝
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#nas" class="md-nav__link">
<span class="md-ellipsis">
神经架构搜索(NAS)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#colabnotebook" class="md-nav__link">
<span class="md-ellipsis">
编程任务(使用CoLab或notebook
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="_1">高效架构<a class="headerlink" href="#_1" title="Permanent link">&para;</a></h1>
<p><em>让模型更快不仅仅是降低精度,还在于设计更智能的架构,使每个token的计算量更少。本文涵盖StreamingLLM、稀疏和线性注意力、多查询和分组查询注意力、推理时的混合专家、知识蒸馏、剪枝和神经架构搜索</em></p>
<ul>
<li>量化(文件1)使每个操作更廉价。本文从源头上减少操作数量。两者互补:一个架构高效且量化的模型可以比原始模型快10-100倍。</li>
</ul>
<h2 id="streamingllm">StreamingLLM:无限长度生成<a class="headerlink" href="#streamingllm" title="Permanent link">&para;</a></h2>
<ul>
<li>
<p>标准Transformer将所有先前的token存储在KV缓存中,KV缓存随序列长度线性增长。在某一点上,缓存超过GPU内存,生成失败。<strong>StreamingLLM</strong>(Xiao等人,2023)使用固定大小的<strong>滚动KV缓存</strong>解决了这个问题。</p>
</li>
<li>
<p>关键观察:序列中的前几个token,无论其内容如何,都获得不成比例的高注意力分数。这些被称为<strong>注意力汇聚点</strong>。如果将它们从缓存中逐出,注意力分布会崩溃,生成质量灾难性下降。</p>
</li>
<li>
<p>StreamingLLM的解决方案:在缓存中永久保留少量<strong>汇聚token</strong>(前1-4个token),加上最近<span class="arithmatex">\(w\)</span>个token的<strong>滚动窗口</strong>。总缓存大小为<span class="arithmatex">\(\text{sink} + w\)</span>,无论生成了多少token都是固定的。</p>
</li>
</ul>
<div class="arithmatex">\[\text{缓存} = [\text{token}_0, \text{token}_1, \text{token}_{t-w+1}, \ldots, \text{token}_t]\]</div>
<ul>
<li>
<p>注意力汇聚点锚定softmax分布,滚动窗口提供最近的上下文。这实现了<strong>无限长度生成</strong>,内存恒定,代价是失去了访问序列中间上下文的能力。</p>
</li>
<li>
<p>StreamingLLM无需重新训练即可用于自然形成注意力汇聚点的模型(大多数预训练LLM都会)。对于不形成汇聚点的模型,在训练期间添加一个可学习的汇聚token即可解决。</p>
</li>
</ul>
<h2 id="_2">稀疏注意力<a class="headerlink" href="#_2" title="Permanent link">&para;</a></h2>
<ul>
<li>全自注意力在序列长度<span class="arithmatex">\(n\)</span>上是<span class="arithmatex">\(O(n^2)\)</span>,因为每个token关注所有其他token。对于<span class="arithmatex">\(n = 128K\)</span>,注意力矩阵有<span class="arithmatex">\(128K^2 = 160\)</span>亿个条目。<strong>稀疏注意力</strong>模式通过限制哪些token关注哪些token来减少这个数量。</li>
</ul>
<p><img alt="注意力稀疏模式:全注意力是O(n²),滑动窗口是O(n·w),局部+全局添加长距离token" src="../../images/attention_sparsity_patterns.svg" /></p>
<ul>
<li>
<p><strong>滑动窗口注意力</strong>Mistral、Gemma):每个token只关注之前<span class="arithmatex">\(w\)</span>个token(例如<span class="arithmatex">\(w = 4096\)</span>)。注意力是<span class="arithmatex">\(O(n \cdot w)\)</span>而不是<span class="arithmatex">\(O(n^2)\)</span>。信息通过多层在窗口之外传播:经过<span class="arithmatex">\(L\)</span>层后,有效上下文为<span class="arithmatex">\(L \times w\)</span></p>
</li>
<li>
<p><strong>局部+全局注意力</strong>Longformer、BigBird):大多数token使用滑动窗口注意力(局部),但少数指定token(例如[CLS],每512个token)关注所有token(全局)。这同时捕获了局部模式和长距离依赖。</p>
</li>
<li>
<p><strong>膨胀注意力</strong>:关注窗口内每第<span class="arithmatex">\(k\)</span>个token,创建一个覆盖更大范围但注意力分数数量相同的稀疏模式。跨层增加膨胀度创建类似于膨胀卷积的层次结构(第8章)。</p>
</li>
<li>
<p>现代LLM的实际胜者是<strong>滑动窗口+全注意力交错</strong>:某些层使用滑动窗口(廉价,处理局部上下文),某些层使用全注意力(昂贵,捕获长距离)。Mistral/Mixtral使用这种模式。</p>
</li>
</ul>
<h2 id="_3">线性注意力和状态空间模型<a class="headerlink" href="#_3" title="Permanent link">&para;</a></h2>
<ul>
<li>
<p>我们能完全替换<span class="arithmatex">\(O(n^2)\)</span>的注意力吗?<strong>线性注意力</strong><strong>状态空间模型(SSM</strong>通过避免显式注意力矩阵,以<span class="arithmatex">\(O(n)\)</span>时间处理序列。</p>
</li>
<li>
<p><strong>线性注意力</strong>用核近似替换softmax注意力:</p>
</li>
</ul>
<div class="arithmatex">\[\text{标准: } O = \text{softmax}(QK^T / \sqrt{d}) V$$
$$\text{线性: } O = \phi(Q) (\phi(K)^T V)\]</div>
<ul>
<li>
<p>通过先关联<span class="arithmatex">\(K^T V\)</span>乘积(这是<span class="arithmatex">\(d \times d\)</span>,与序列长度无关),计算变成<span class="arithmatex">\(O(n \cdot d^2)\)</span>而不是<span class="arithmatex">\(O(n^2 \cdot d)\)</span>。对于<span class="arithmatex">\(n \gg d\)</span>的长序列,这是巨大的节省。</p>
</li>
<li>
<p><strong>RWKV</strong>结合了RNN和Transformer的思想。它使用循环公式顺序处理token(像RNN),但可以在训练时并行化(像Transformer)。推理是每个token <span class="arithmatex">\(O(1)\)</span>(常量内存,KV缓存不增长)。</p>
</li>
<li>
<p><strong>Mamba</strong>Gu &amp; Dao,2023)是一种选择性状态空间模型。它通过学习到的状态转换处理序列:</p>
</li>
</ul>
<div class="arithmatex">\[h_t = \bar{A} h_{t-1} + \bar{B} x_t, \quad y_t = C h_t\]</div>
<ul>
<li>
<p>其中<span class="arithmatex">\(\bar{A}\)</span><span class="arithmatex">\(\bar{B}\)</span>是依赖于输入的(选择性),允许Mamba动态关注或忽略输入的部分。与固定SSM不同,选择性使Mamba在语言任务上与Transformer具有竞争力,同时保持<span class="arithmatex">\(O(n)\)</span>的扩展性。</p>
</li>
<li>
<p><strong>权衡</strong>:线性注意力和SSM对长序列更快,但对于需要精确长距离检索的任务,通常不如全注意力。混合架构(一些Transformer层+一些Mamba层)通常能提供两全其美的效果。</p>
</li>
</ul>
<h2 id="_4">多查询和分组查询注意力<a class="headerlink" href="#_4" title="Permanent link">&para;</a></h2>
<ul>
<li>
<p>标准多头注意力(MHA,第7章)为每个头使用独立的<span class="arithmatex">\(K\)</span><span class="arithmatex">\(V\)</span>投影。对于<span class="arithmatex">\(h\)</span>个headKV缓存中有<span class="arithmatex">\(h\)</span>个独立的键和值张量。<strong>多查询注意力(MQA</strong><strong>分组查询注意力(GQA</strong>减少了这个数量。</p>
</li>
<li>
<p><strong>MQA</strong>Shazeer,2019):所有头共享单组<span class="arithmatex">\(K, V\)</span>投影。每个头仍然有自己的<span class="arithmatex">\(Q\)</span>投影。KV缓存缩小了<span class="arithmatex">\(h\)</span>倍(例如,32个头则缩小32倍)。</p>
</li>
<li>
<p><strong>GQA</strong>Ainslie等人,2023):一个中间方案。头被分组,每组共享一组<span class="arithmatex">\(K, V\)</span>投影。有<span class="arithmatex">\(h = 32\)</span>个头和<span class="arithmatex">\(g = 8\)</span>个组,每组4个头共享K/V。KV缓存缩小了<span class="arithmatex">\(h/g = 4\)</span>倍。</p>
</li>
</ul>
<div class="arithmatex">\[\text{MHA: } h \text{ 个头, } h \text{ 个 K/V 集} \quad \to \quad \text{GQA: } h \text{ 个头, } g \text{ 个 K/V 集} \quad \to \quad \text{MQA: } h \text{ 个头, } 1 \text{ 个 K/V 集}\]</div>
<p><img alt="MHA vs GQA vs MQAMHA给每个头自己的KV,GQA跨组共享KV,MQA为所有头使用单个KV——大幅减少KV缓存大小" src="../../images/mha_gqa_mqa.svg" /></p>
<ul>
<li>大多数现代LLM使用GQALlama 2/3、Gemma、Mistral)。它减少了KV缓存内存和推理延迟,与MHA相比质量损失可以忽略不计。</li>
</ul>
<h3 id="mla">多头潜在注意力(MLA<a class="headerlink" href="#mla" title="Permanent link">&para;</a></h3>
<ul>
<li><strong>MLA</strong>DeepSeek-V22024)通过将KV缓存压缩为<strong>低秩潜在空间</strong>,比GQA更进一步。MLA不是缓存完整的键和值向量,而是每个token缓存一个压缩后的潜在向量<span class="arithmatex">\(\mathbf{c}_t\)</span>,并在注意力期间动态重构K/V</li>
</ul>
<div class="arithmatex">\[\mathbf{c}_t = W_{\text{compress}} \cdot [\mathbf{k}_t; \mathbf{v}_t], \quad \mathbf{k}_t = W_K^{\text{up}} \cdot \mathbf{c}_t, \quad \mathbf{v}_t = W_V^{\text{up}} \cdot \mathbf{c}_t\]</div>
<ul>
<li>
<p>压缩向量<span class="arithmatex">\(\mathbf{c}_t\)</span>比原始K和V的组合小得多。DeepSeek-V2实现了与MHA相比<strong>93.3%的KV缓存大小减少</strong>,甚至优于MQA,同时保持MHA级别的质量。</p>
</li>
<li>
<p>权衡:从潜在向量重构K/V在每个注意力操作中增加了少量计算成本。但由于LLM解码是内存带宽受限的(而非计算受限),这总体上是个净收益:更少的内存加载 &gt; 每token稍多计算。</p>
</li>
</ul>
<h3 id="flash-attention">Flash Attention<a class="headerlink" href="#flash-attention" title="Permanent link">&para;</a></h3>
<ul>
<li>
<p><strong>Flash Attention</strong>Dao等人,2022,第16章文件05有详细论述)不是架构变化,而是一种实现优化,在任何高效注意力讨论中都不可或缺。它计算精确的标准注意力,具有以下特点:</p>
<ul>
<li><strong>O(n)内存</strong>而不是O(n²)(注意力矩阵从未在HBM中具体化)。</li>
<li><strong>比标准注意力快2-4倍</strong>(通过分块和在线softmax将数据保留在SRAM中)。</li>
<li><strong>无质量损失</strong>——输出在数学上与标准注意力完全相同。</li>
</ul>
</li>
<li>
<p>Flash Attention现在是PyTorch<code>torch.nn.functional.scaled_dot_product_attention</code>)、JAX和所有主要推理框架中默认的注意力实现。如果你在2024+年运行注意力,你几乎肯定在使用Flash Attention。</p>
</li>
</ul>
<h3 id="ring-attention">Ring Attention<a class="headerlink" href="#ring-attention" title="Permanent link">&para;</a></h3>
<ul>
<li>
<p><strong>Ring Attention</strong>(Liu等人,2023)将注意力计算分布到多个设备上,用于即使使用Flash Attention也无法装入单GPU内存的长序列。</p>
</li>
<li>
<p>思路:将序列分区到<span class="arithmatex">\(N\)</span>个设备上。每个设备持有<span class="arithmatex">\(n/N\)</span>个token的Q、K、V。设备排列成环形。每一步:</p>
<ol>
<li>每个设备计算局部注意力(其Q对其局部K/V)。</li>
<li>每个设备将其K/V块发送到环中的下一个设备。</li>
<li>每个设备从上一个设备接收K/V,并针对这些K/V计算注意力。</li>
<li>经过<span class="arithmatex">\(N\)</span>步后,每个设备已经关注过每个K/V块。</li>
</ol>
</li>
<li>
<p>通信与计算<strong>重叠</strong>:在当前K/V块上计算注意力的同时,下一个块正在传输中。这几乎完全隐藏了通信延迟。</p>
</li>
<li>
<p>Ring Attention通过将KV缓存分布在一圈GPU上,实现了<strong>百万token上下文窗口</strong>。每台设备的内存为O(n/N),使得任意长序列都可行(仅受设备数量限制)。</p>
</li>
</ul>
<h2 id="_5">推理时的混合专家<a class="headerlink" href="#_5" title="Permanent link">&para;</a></h2>
<ul>
<li>
<p>MoE模型(第7章)每个token只激活其参数的一小部分(通常8个专家中的2个)。在推理时,独特的挑战是<strong>专家缓存</strong>:所有专家都必须在内存中(因为任何token可能路由到任何专家),但每个token只有2个活跃。</p>
</li>
<li>
<p>对于Mixtral 8x7B模型:总参数 = 47B(8 × 7B专家,但有共享组件)。每个token的活跃参数 ≈ 13B(2个专家 + 共享层)。该模型具有LLM-70B级别的质量,但推理成本为LLM-13B级别,不过需要在内存中保留47B参数。</p>
</li>
<li>
<p><strong>专家卸载</strong>:对于GPU内存受限的部署,将非活跃专家保留在CPU或SSD上,按需加载。这之所以有效,是因为token路由足够可预测,可以预取可能的专家。</p>
</li>
<li>
<p><strong>专家缓存</strong>:在GPU内存中维护最近使用的专家的LRU缓存。如果相同的专家被重复激活(在领域内数据中常见),缓存命中率很高。</p>
</li>
</ul>
<h2 id="_6">知识蒸馏<a class="headerlink" href="#_6" title="Permanent link">&para;</a></h2>
<ul>
<li><strong>蒸馏</strong>(第6章)训练一个小的"学生"模型来模仿一个大的"教师"。学生从教师的软预测(类上的概率分布)中学习,这比单独的硬标签包含更多信息。</li>
</ul>
<div class="arithmatex">\[\mathcal{L} = \alpha \cdot \text{KL}(p_{\text{teacher}}^{T} \| p_{\text{student}}^{T}) + (1 - \alpha) \cdot \mathcal{L}_{\text{CE}}(y, p_{\text{student}})\]</div>
<ul>
<li>
<p>其中<span class="arithmatex">\(T\)</span>是温度(更高的<span class="arithmatex">\(T\)</span>使分布变软,揭示教师的不确定性),<span class="arithmatex">\(\alpha\)</span>平衡蒸馏损失与标准交叉熵损失。</p>
</li>
<li>
<p><strong>对于LLM</strong>:蒸馏用于从大型、能力强的模型创建小型、快速的模型。GPT-4 → 一个7B学生模型,在特定任务上捕获GPT-4的大部分行为。学生模型的推理成本可以低10-100倍。</p>
</li>
<li>
<p><strong>任务特定蒸馏</strong>:仅在与部署任务相关的数据上进行蒸馏。从70B教师模型在医疗问答上蒸馏出的7B模型,在该特定任务上可以超越70B模型(因为学生有限的容量完全集中在目标领域上)。</p>
</li>
</ul>
<h2 id="_7">剪枝<a class="headerlink" href="#_7" title="Permanent link">&para;</a></h2>
<ul>
<li>
<p><strong>剪枝</strong>移除不必要的权重(将其设为零),减少模型大小和计算量。</p>
</li>
<li>
<p><strong>非结构化剪枝</strong>(基于幅值):移除绝对值最小的单个权重。这创建了稀疏权重矩阵。简单有效用于压缩,但当前硬件(GPU)除非稀疏性遵循特定模式,否则无法高效加速稀疏操作。</p>
</li>
<li>
<p><strong>结构化剪枝</strong>:移除整个单元——注意力头、MLP神经元或层。这产生一个更小的稠密模型,可以在标准硬件上直接加速。权衡是粒度更粗(移除一个完整的头可能同时移除了有用和无用的权重)。</p>
</li>
<li>
<p><strong>2:4稀疏性</strong>NVIDIA Ampere+):一种硬件支持的稀疏模式,每4个权重中有2个为零。GPU的稀疏Tensor Core跳过零乘法,实现约2倍加速。这是目前唯一具有实际硬件加速的稀疏模式。</p>
</li>
<li>
<p><strong>彩票假说</strong>Frankle &amp; Carlin,2019):在随机初始化的网络中,存在一个子网络("中奖彩票"),可以单独训练以匹配完整网络的性能。找到这些子网络(通过训练、剪枝和重置)成本高昂,但这个洞察激励了剪枝研究。</p>
</li>
</ul>
<h2 id="nas">神经架构搜索(NAS<a class="headerlink" href="#nas" title="Permanent link">&para;</a></h2>
<ul>
<li>
<p><strong>NAS</strong>通过搜索可能的架构空间来自动化架构设计,找到在硬件约束(延迟、内存、功耗)下最大化精度的架构。</p>
</li>
<li>
<p><strong>EfficientNet</strong>(第8章)就是通过NAS找到的:复合缩放规则(平衡深度、宽度、分辨率)是从搜索中涌现的,而非人类直觉。</p>
</li>
<li>
<p>对于推理效率,NAS可以找到针对特定硬件目标优化的架构:"找到一个在iPhone神经引擎上延迟&lt;5ms且在ImageNet上精度&gt;80%的模型。"搜索空间包括层类型、宽度、激活函数和注意力模式。</p>
</li>
<li>
<p><strong>一次性网络</strong>训练一个单个过参数化网络,为不同的部署目标提取子网络。一次训练运行产生针对云GPU、移动GPU和CPU优化的模型,每个都针对其目标进行了优化。</p>
</li>
</ul>
<h2 id="colabnotebook">编程任务(使用CoLab或notebook<a class="headerlink" href="#colabnotebook" title="Permanent link">&para;</a></h2>
<ol>
<li>
<p>实现滑动窗口注意力,并与全注意力比较内存使用。
<div class="highlight"><pre><span></span><code><a id="__codelineno-0-1" name="__codelineno-0-1" href="#__codelineno-0-1"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax</span>
<a id="__codelineno-0-2" name="__codelineno-0-2" href="#__codelineno-0-2"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax.numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">jnp</span>
<a id="__codelineno-0-3" name="__codelineno-0-3" href="#__codelineno-0-3"></a>
<a id="__codelineno-0-4" name="__codelineno-0-4" href="#__codelineno-0-4"></a><span class="k">def</span><span class="w"> </span><span class="nf">full_attention</span><span class="p">(</span><span class="n">Q</span><span class="p">,</span> <span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">):</span>
<a id="__codelineno-0-5" name="__codelineno-0-5" href="#__codelineno-0-5"></a><span class="w"> </span><span class="sd">&quot;&quot;&quot;标准O(n²)注意力。&quot;&quot;&quot;</span>
<a id="__codelineno-0-6" name="__codelineno-0-6" href="#__codelineno-0-6"></a> <span class="n">scores</span> <span class="o">=</span> <span class="n">Q</span> <span class="o">@</span> <span class="n">K</span><span class="o">.</span><span class="n">T</span> <span class="o">/</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">Q</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">])</span>
<a id="__codelineno-0-7" name="__codelineno-0-7" href="#__codelineno-0-7"></a> <span class="n">weights</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">softmax</span><span class="p">(</span><span class="n">scores</span><span class="p">,</span> <span class="n">axis</span><span class="o">=-</span><span class="mi">1</span><span class="p">)</span>
<a id="__codelineno-0-8" name="__codelineno-0-8" href="#__codelineno-0-8"></a> <span class="k">return</span> <span class="n">weights</span> <span class="o">@</span> <span class="n">V</span>
<a id="__codelineno-0-9" name="__codelineno-0-9" href="#__codelineno-0-9"></a>
<a id="__codelineno-0-10" name="__codelineno-0-10" href="#__codelineno-0-10"></a><span class="k">def</span><span class="w"> </span><span class="nf">sliding_window_attention</span><span class="p">(</span><span class="n">Q</span><span class="p">,</span> <span class="n">K</span><span class="p">,</span> <span class="n">V</span><span class="p">,</span> <span class="n">window_size</span><span class="o">=</span><span class="mi">128</span><span class="p">):</span>
<a id="__codelineno-0-11" name="__codelineno-0-11" href="#__codelineno-0-11"></a><span class="w"> </span><span class="sd">&quot;&quot;&quot;滑动窗口注意力:每个token关注前window_size个token。&quot;&quot;&quot;</span>
<a id="__codelineno-0-12" name="__codelineno-0-12" href="#__codelineno-0-12"></a> <span class="n">n</span> <span class="o">=</span> <span class="n">Q</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<a id="__codelineno-0-13" name="__codelineno-0-13" href="#__codelineno-0-13"></a> <span class="n">d</span> <span class="o">=</span> <span class="n">Q</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
<a id="__codelineno-0-14" name="__codelineno-0-14" href="#__codelineno-0-14"></a> <span class="n">output</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros_like</span><span class="p">(</span><span class="n">Q</span><span class="p">)</span>
<a id="__codelineno-0-15" name="__codelineno-0-15" href="#__codelineno-0-15"></a>
<a id="__codelineno-0-16" name="__codelineno-0-16" href="#__codelineno-0-16"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n</span><span class="p">):</span>
<a id="__codelineno-0-17" name="__codelineno-0-17" href="#__codelineno-0-17"></a> <span class="n">start</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">i</span> <span class="o">-</span> <span class="n">window_size</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span>
<a id="__codelineno-0-18" name="__codelineno-0-18" href="#__codelineno-0-18"></a> <span class="n">k_window</span> <span class="o">=</span> <span class="n">K</span><span class="p">[</span><span class="n">start</span><span class="p">:</span><span class="n">i</span><span class="o">+</span><span class="mi">1</span><span class="p">]</span>
<a id="__codelineno-0-19" name="__codelineno-0-19" href="#__codelineno-0-19"></a> <span class="n">v_window</span> <span class="o">=</span> <span class="n">V</span><span class="p">[</span><span class="n">start</span><span class="p">:</span><span class="n">i</span><span class="o">+</span><span class="mi">1</span><span class="p">]</span>
<a id="__codelineno-0-20" name="__codelineno-0-20" href="#__codelineno-0-20"></a> <span class="n">scores</span> <span class="o">=</span> <span class="n">Q</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">@</span> <span class="n">k_window</span><span class="o">.</span><span class="n">T</span> <span class="o">/</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">d</span><span class="p">)</span>
<a id="__codelineno-0-21" name="__codelineno-0-21" href="#__codelineno-0-21"></a> <span class="n">weights</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">softmax</span><span class="p">(</span><span class="n">scores</span><span class="p">)</span>
<a id="__codelineno-0-22" name="__codelineno-0-22" href="#__codelineno-0-22"></a> <span class="n">output</span> <span class="o">=</span> <span class="n">output</span><span class="o">.</span><span class="n">at</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="n">weights</span> <span class="o">@</span> <span class="n">v_window</span><span class="p">)</span>
<a id="__codelineno-0-23" name="__codelineno-0-23" href="#__codelineno-0-23"></a>
<a id="__codelineno-0-24" name="__codelineno-0-24" href="#__codelineno-0-24"></a> <span class="k">return</span> <span class="n">output</span>
<a id="__codelineno-0-25" name="__codelineno-0-25" href="#__codelineno-0-25"></a>
<a id="__codelineno-0-26" name="__codelineno-0-26" href="#__codelineno-0-26"></a><span class="n">n</span><span class="p">,</span> <span class="n">d</span> <span class="o">=</span> <span class="mi">512</span><span class="p">,</span> <span class="mi">64</span>
<a id="__codelineno-0-27" name="__codelineno-0-27" href="#__codelineno-0-27"></a><span class="n">key</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">PRNGKey</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<a id="__codelineno-0-28" name="__codelineno-0-28" href="#__codelineno-0-28"></a><span class="n">Q</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">d</span><span class="p">))</span>
<a id="__codelineno-0-29" name="__codelineno-0-29" href="#__codelineno-0-29"></a><span class="n">K</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">jax</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">PRNGKey</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span> <span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">d</span><span class="p">))</span>
<a id="__codelineno-0-30" name="__codelineno-0-30" href="#__codelineno-0-30"></a><span class="n">V</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">jax</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">PRNGKey</span><span class="p">(</span><span class="mi">2</span><span class="p">),</span> <span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">d</span><span class="p">))</span>
<a id="__codelineno-0-31" name="__codelineno-0-31" href="#__codelineno-0-31"></a>
<a id="__codelineno-0-32" name="__codelineno-0-32" href="#__codelineno-0-32"></a><span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;全注意力内存: O(n²) = </span><span class="si">{</span><span class="n">n</span><span class="o">*</span><span class="n">n</span><span class="si">}</span><span class="s2"> 个条目&quot;</span><span class="p">)</span>
<a id="__codelineno-0-33" name="__codelineno-0-33" href="#__codelineno-0-33"></a><span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;窗口 (w=128) 内存: O(n*w) = </span><span class="si">{</span><span class="n">n</span><span class="o">*</span><span class="mi">128</span><span class="si">}</span><span class="s2"> 个条目&quot;</span><span class="p">)</span>
<a id="__codelineno-0-34" name="__codelineno-0-34" href="#__codelineno-0-34"></a><span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;减少: </span><span class="si">{</span><span class="n">n</span><span class="o">*</span><span class="n">n</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="p">(</span><span class="n">n</span><span class="o">*</span><span class="mi">128</span><span class="p">)</span><span class="si">:</span><span class="s2">.1f</span><span class="si">}</span><span class="s2">x&quot;</span><span class="p">)</span>
</code></pre></div></p>
</li>
<li>
<p>比较MHA、GQA和MQA的KV缓存大小。展示为什么GQA是实际的最佳选择。
<div class="highlight"><pre><span></span><code><a id="__codelineno-1-1" name="__codelineno-1-1" href="#__codelineno-1-1"></a><span class="k">def</span><span class="w"> </span><span class="nf">kv_cache_size</span><span class="p">(</span><span class="n">n_heads</span><span class="p">,</span> <span class="n">n_kv_heads</span><span class="p">,</span> <span class="n">d_head</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="nb">bytes</span><span class="o">=</span><span class="mi">2</span><span class="p">):</span>
<a id="__codelineno-1-2" name="__codelineno-1-2" href="#__codelineno-1-2"></a><span class="w"> </span><span class="sd">&quot;&quot;&quot;KV缓存大小(MB)。&quot;&quot;&quot;</span>
<a id="__codelineno-1-3" name="__codelineno-1-3" href="#__codelineno-1-3"></a> <span class="k">return</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">n_kv_heads</span> <span class="o">*</span> <span class="n">d_head</span> <span class="o">*</span> <span class="n">seq_len</span> <span class="o">*</span> <span class="nb">bytes</span> <span class="o">/</span> <span class="mf">1e6</span>
<a id="__codelineno-1-4" name="__codelineno-1-4" href="#__codelineno-1-4"></a>
<a id="__codelineno-1-5" name="__codelineno-1-5" href="#__codelineno-1-5"></a><span class="n">n_heads</span> <span class="o">=</span> <span class="mi">32</span>
<a id="__codelineno-1-6" name="__codelineno-1-6" href="#__codelineno-1-6"></a><span class="n">d_head</span> <span class="o">=</span> <span class="mi">128</span>
<a id="__codelineno-1-7" name="__codelineno-1-7" href="#__codelineno-1-7"></a><span class="n">seq_len</span> <span class="o">=</span> <span class="mi">32768</span>
<a id="__codelineno-1-8" name="__codelineno-1-8" href="#__codelineno-1-8"></a>
<a id="__codelineno-1-9" name="__codelineno-1-9" href="#__codelineno-1-9"></a><span class="n">mha</span> <span class="o">=</span> <span class="n">kv_cache_size</span><span class="p">(</span><span class="n">n_heads</span><span class="p">,</span> <span class="n">n_heads</span><span class="p">,</span> <span class="n">d_head</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">)</span> <span class="c1"># 32个KV头</span>
<a id="__codelineno-1-10" name="__codelineno-1-10" href="#__codelineno-1-10"></a><span class="n">gqa</span> <span class="o">=</span> <span class="n">kv_cache_size</span><span class="p">(</span><span class="n">n_heads</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="n">d_head</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">)</span> <span class="c1"># 8个KV头</span>
<a id="__codelineno-1-11" name="__codelineno-1-11" href="#__codelineno-1-11"></a><span class="n">mqa</span> <span class="o">=</span> <span class="n">kv_cache_size</span><span class="p">(</span><span class="n">n_heads</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">d_head</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">)</span> <span class="c1"># 1个KV头</span>
<a id="__codelineno-1-12" name="__codelineno-1-12" href="#__codelineno-1-12"></a>
<a id="__codelineno-1-13" name="__codelineno-1-13" href="#__codelineno-1-13"></a><span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;MHA (32个KV头): </span><span class="si">{</span><span class="n">mha</span><span class="si">:</span><span class="s2">.0f</span><span class="si">}</span><span class="s2"> MB 每层&quot;</span><span class="p">)</span>
<a id="__codelineno-1-14" name="__codelineno-1-14" href="#__codelineno-1-14"></a><span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;GQA (8个KV头): </span><span class="si">{</span><span class="n">gqa</span><span class="si">:</span><span class="s2">.0f</span><span class="si">}</span><span class="s2"> MB 每层 (</span><span class="si">{</span><span class="n">mha</span><span class="o">/</span><span class="n">gqa</span><span class="si">:</span><span class="s2">.0f</span><span class="si">}</span><span class="s2">x 更小)&quot;</span><span class="p">)</span>
<a id="__codelineno-1-15" name="__codelineno-1-15" href="#__codelineno-1-15"></a><span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;MQA (1个KV头): </span><span class="si">{</span><span class="n">mqa</span><span class="si">:</span><span class="s2">.0f</span><span class="si">}</span><span class="s2"> MB 每层 (</span><span class="si">{</span><span class="n">mha</span><span class="o">/</span><span class="n">mqa</span><span class="si">:</span><span class="s2">.0f</span><span class="si">}</span><span class="s2">x 更小)&quot;</span><span class="p">)</span>
</code></pre></div></p>
</li>
<li>
<p>通过从随机注意力层中移除最不重要的注意力头并测量输出变化来模拟结构化剪枝。
<div class="highlight"><pre><span></span><code><a id="__codelineno-2-1" name="__codelineno-2-1" href="#__codelineno-2-1"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax</span>
<a id="__codelineno-2-2" name="__codelineno-2-2" href="#__codelineno-2-2"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax.numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">jnp</span>
<a id="__codelineno-2-3" name="__codelineno-2-3" href="#__codelineno-2-3"></a>
<a id="__codelineno-2-4" name="__codelineno-2-4" href="#__codelineno-2-4"></a><span class="n">key</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">PRNGKey</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<a id="__codelineno-2-5" name="__codelineno-2-5" href="#__codelineno-2-5"></a><span class="n">n_heads</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="n">d_head</span> <span class="o">=</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">64</span><span class="p">,</span> <span class="mi">32</span>
<a id="__codelineno-2-6" name="__codelineno-2-6" href="#__codelineno-2-6"></a>
<a id="__codelineno-2-7" name="__codelineno-2-7" href="#__codelineno-2-7"></a><span class="c1"># 随机多头注意力输出(每个头一个)</span>
<a id="__codelineno-2-8" name="__codelineno-2-8" href="#__codelineno-2-8"></a><span class="n">head_outputs</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="p">(</span><span class="n">n_heads</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="n">d_head</span><span class="p">))</span>
<a id="__codelineno-2-9" name="__codelineno-2-9" href="#__codelineno-2-9"></a>
<a id="__codelineno-2-10" name="__codelineno-2-10" href="#__codelineno-2-10"></a><span class="c1"># 完整输出:连接所有头</span>
<a id="__codelineno-2-11" name="__codelineno-2-11" href="#__codelineno-2-11"></a><span class="n">full_output</span> <span class="o">=</span> <span class="n">head_outputs</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="n">seq_len</span><span class="p">,</span> <span class="n">n_heads</span> <span class="o">*</span> <span class="n">d_head</span><span class="p">)</span>
<a id="__codelineno-2-12" name="__codelineno-2-12" href="#__codelineno-2-12"></a>
<a id="__codelineno-2-13" name="__codelineno-2-13" href="#__codelineno-2-13"></a><span class="c1"># 重要性:通过范数度量每个头的贡献</span>
<a id="__codelineno-2-14" name="__codelineno-2-14" href="#__codelineno-2-14"></a><span class="n">head_norms</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="n">head_outputs</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
<a id="__codelineno-2-15" name="__codelineno-2-15" href="#__codelineno-2-15"></a><span class="nb">print</span><span class="p">(</span><span class="s2">&quot;头重要性(按范数):&quot;</span><span class="p">,</span> <span class="n">jnp</span><span class="o">.</span><span class="n">round</span><span class="p">(</span><span class="n">head_norms</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
<a id="__codelineno-2-16" name="__codelineno-2-16" href="#__codelineno-2-16"></a>
<a id="__codelineno-2-17" name="__codelineno-2-17" href="#__codelineno-2-17"></a><span class="c1"># 剪枝最不重要的头</span>
<a id="__codelineno-2-18" name="__codelineno-2-18" href="#__codelineno-2-18"></a><span class="k">for</span> <span class="n">n_keep</span> <span class="ow">in</span> <span class="p">[</span><span class="mi">8</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">2</span><span class="p">]:</span>
<a id="__codelineno-2-19" name="__codelineno-2-19" href="#__codelineno-2-19"></a> <span class="n">top_heads</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">argsort</span><span class="p">(</span><span class="n">head_norms</span><span class="p">)[</span><span class="o">-</span><span class="n">n_keep</span><span class="p">:]</span>
<a id="__codelineno-2-20" name="__codelineno-2-20" href="#__codelineno-2-20"></a> <span class="n">pruned</span> <span class="o">=</span> <span class="n">head_outputs</span><span class="p">[</span><span class="n">top_heads</span><span class="p">]</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="n">seq_len</span><span class="p">,</span> <span class="n">n_keep</span> <span class="o">*</span> <span class="n">d_head</span><span class="p">)</span>
<a id="__codelineno-2-21" name="__codelineno-2-21" href="#__codelineno-2-21"></a>
<a id="__codelineno-2-22" name="__codelineno-2-22" href="#__codelineno-2-22"></a> <span class="c1"># 填充到原始大小用于比较(将剪掉的头设为零)</span>
<a id="__codelineno-2-23" name="__codelineno-2-23" href="#__codelineno-2-23"></a> <span class="n">full_pruned</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros_like</span><span class="p">(</span><span class="n">head_outputs</span><span class="p">)</span>
<a id="__codelineno-2-24" name="__codelineno-2-24" href="#__codelineno-2-24"></a> <span class="n">full_pruned</span> <span class="o">=</span> <span class="n">full_pruned</span><span class="o">.</span><span class="n">at</span><span class="p">[</span><span class="n">top_heads</span><span class="p">]</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="n">head_outputs</span><span class="p">[</span><span class="n">top_heads</span><span class="p">])</span>
<a id="__codelineno-2-25" name="__codelineno-2-25" href="#__codelineno-2-25"></a> <span class="n">full_pruned</span> <span class="o">=</span> <span class="n">full_pruned</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="n">seq_len</span><span class="p">,</span> <span class="n">n_heads</span> <span class="o">*</span> <span class="n">d_head</span><span class="p">)</span>
<a id="__codelineno-2-26" name="__codelineno-2-26" href="#__codelineno-2-26"></a>
<a id="__codelineno-2-27" name="__codelineno-2-27" href="#__codelineno-2-27"></a> <span class="n">error</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="n">full_output</span> <span class="o">-</span> <span class="n">full_pruned</span><span class="p">)</span> <span class="o">/</span> <span class="n">jnp</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="n">full_output</span><span class="p">)</span>
<a id="__codelineno-2-28" name="__codelineno-2-28" href="#__codelineno-2-28"></a> <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;保留 </span><span class="si">{</span><span class="n">n_keep</span><span class="si">}</span><span class="s2">/</span><span class="si">{</span><span class="n">n_heads</span><span class="si">}</span><span class="s2"> 个头: 相对误差 = </span><span class="si">{</span><span class="n">error</span><span class="si">:</span><span class="s2">.4f</span><span class="si">}</span><span class="s2">, &quot;</span>
<a id="__codelineno-2-29" name="__codelineno-2-29" href="#__codelineno-2-29"></a> <span class="sa">f</span><span class="s2">&quot;内存 = </span><span class="si">{</span><span class="n">n_keep</span><span class="o">/</span><span class="n">n_heads</span><span class="si">:</span><span class="s2">.0%</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
</code></pre></div></p>
</li>
</ol>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
<button type="button" class="md-top md-icon" data-md-component="top" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8z"/></svg>
回到页面顶部
</button>
</main>
<footer class="md-footer">
<nav class="md-footer__inner md-grid" aria-label="页脚" >
<a href="../01.%20quantisation/" class="md-footer__link md-footer__link--prev" aria-label="上一页: 量化">
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</div>
<div class="md-footer__title">
<span class="md-footer__direction">
上一页
</span>
<div class="md-ellipsis">
量化
</div>
</div>
</a>
<a href="../03.%20serving%20and%20batching/" class="md-footer__link md-footer__link--next" aria-label="下一页: 服务与批处理">
<div class="md-footer__title">
<span class="md-footer__direction">
下一页
</span>
<div class="md-ellipsis">
服务与批处理
</div>
</div>
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11z"/></svg>
</div>
</a>
</nav>
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
<div class="md-social">
<a href="https://github.com/flykhan/maths-cs-ai-compendium-zh" target="_blank" rel="noopener" title="github.com" class="md-social__link">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M173.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6m-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3m44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9M252.8 8C114.1 8 8 113.3 8 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C436.2 457.8 504 362.9 504 252 504 113.3 391.5 8 252.8 8M105.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1m-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7m32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1m-11.4-14.7c-1.6 1-1.6 3.6 0 5.9s4.3 3.3 5.6 2.3c1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2"/></svg>
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": ["navigation.tabs", "navigation.sections", "navigation.expand", "navigation.top", "navigation.footer", "search.suggest", "search.highlight", "content.code.copy", "toc.follow"], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "\u5df2\u590d\u5236", "clipboard.copy": "\u590d\u5236", "search.result.more.one": "\u5728\u8be5\u9875\u4e0a\u8fd8\u6709 1 \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.more.other": "\u5728\u8be5\u9875\u4e0a\u8fd8\u6709 # \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.none": "\u6ca1\u6709\u627e\u5230\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.one": "\u627e\u5230 1 \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.other": "# \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.placeholder": "\u952e\u5165\u4ee5\u5f00\u59cb\u641c\u7d22", "search.result.term.missing": "\u7f3a\u5c11", "select.version": "\u9009\u62e9\u5f53\u524d\u7248\u672c"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="../../javascripts/mathjax.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>