<!-- Source listing: 5879 lines, 262 KiB, HTML -->
<!doctype html>
|
||
<html lang="zh" class="no-js">
|
||
<head>
|
||
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||
|
||
<meta name="description" content="一本开源的直觉优先教科书,从零开始覆盖数学、计算机科学和人工智能(中文翻译版)。">
|
||
|
||
|
||
<meta name="author" content="Henry Ndubuaku (flykhan 译)">
|
||
|
||
|
||
<link rel="canonical" href="https://flykhan.github.io/maths-cs-ai-compendium-zh/chapter%2009%3A%20audio%20and%20speech/03.%20text%20to%20speech%20and%20voice/">
|
||
|
||
|
||
<link rel="prev" href="../02.%20automatic%20speech%20recognition/">
|
||
|
||
|
||
<link rel="next" href="../04.%20speaker%20and%20audio%20analysis/">
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="icon" href="../../assets/images/favicon.png">
|
||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||
|
||
|
||
|
||
<title>语音合成 - 数学、计算机科学与 AI 百科全书</title>
|
||
|
||
|
||
|
||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||
|
||
|
||
<link rel="stylesheet" href="../../assets/stylesheets/palette.ab4e12ef.min.css">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||
|
||
|
||
|
||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||
|
||
|
||
|
||
|
||
|
||
</head>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="slate" data-md-color-accent="indigo">
|
||
|
||
|
||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||
<label class="md-overlay" for="__drawer"></label>
|
||
<div data-md-component="skip">
|
||
|
||
|
||
<a href="#_1" class="md-skip">
|
||
跳转至
|
||
</a>
|
||
|
||
</div>
|
||
<div data-md-component="announce">
|
||
|
||
</div>
|
||
|
||
|
||
|
||
|
||
<header class="md-header" data-md-component="header">
|
||
<nav class="md-header__inner md-grid" aria-label="页眉">
|
||
<a href="../.." title="数学、计算机科学与 AI 百科全书" class="md-header__button md-logo" aria-label="数学、计算机科学与 AI 百科全书" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
<label class="md-header__button md-icon" for="__drawer">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||
</label>
|
||
<div class="md-header__title" data-md-component="header-title">
|
||
<div class="md-header__ellipsis">
|
||
<div class="md-header__topic">
|
||
<span class="md-ellipsis">
|
||
数学、计算机科学与 AI 百科全书
|
||
</span>
|
||
</div>
|
||
<div class="md-header__topic" data-md-component="header-topic">
|
||
<span class="md-ellipsis">
|
||
|
||
语音合成
|
||
|
||
</span>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
<form class="md-header__option" data-md-component="palette">
|
||
|
||
|
||
|
||
|
||
<input class="md-option" data-md-color-media="" data-md-color-scheme="default" data-md-color-primary="slate" data-md-color-accent="indigo" aria-label="切换到深色模式" type="radio" name="__palette" id="__palette_0">
|
||
|
||
<label class="md-header__button md-icon" title="切换到深色模式" for="__palette_1" hidden>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
|
||
</label>
|
||
|
||
|
||
|
||
|
||
|
||
<input class="md-option" data-md-color-media="" data-md-color-scheme="slate" data-md-color-primary="slate" data-md-color-accent="indigo" aria-label="切换到浅色模式" type="radio" name="__palette" id="__palette_1">
|
||
|
||
<label class="md-header__button md-icon" title="切换到浅色模式" for="__palette_0" hidden>
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
|
||
</label>
|
||
|
||
|
||
</form>
|
||
|
||
|
||
|
||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-header__button md-icon" for="__search">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
|
||
</label>
|
||
<div class="md-search" data-md-component="search" role="dialog">
|
||
<label class="md-search__overlay" for="__search"></label>
|
||
<div class="md-search__inner" role="search">
|
||
<form class="md-search__form" name="search">
|
||
<input type="text" class="md-search__input" name="query" aria-label="搜索" placeholder="搜索" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
|
||
<label class="md-search__icon md-icon" for="__search">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
|
||
</label>
|
||
<nav class="md-search__options" aria-label="查找">
|
||
|
||
<button type="reset" class="md-search__icon md-icon" title="清空当前内容" aria-label="清空当前内容" tabindex="-1">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
|
||
</button>
|
||
</nav>
|
||
|
||
<div class="md-search__suggest" data-md-component="search-suggest"></div>
|
||
|
||
</form>
|
||
<div class="md-search__output">
|
||
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
|
||
<div class="md-search-result" data-md-component="search-result">
|
||
<div class="md-search-result__meta">
|
||
正在初始化搜索引擎
|
||
</div>
|
||
<ol class="md-search-result__list" role="presentation"></ol>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-header__source">
|
||
<a href="https://github.com/flykhan/maths-cs-ai-compendium-zh" title="前往仓库" class="md-source" data-md-component="source">
|
||
<div class="md-source__icon md-icon">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
|
||
</div>
|
||
<div class="md-source__repository">
|
||
flykhan/maths-cs-ai-compendium-zh
|
||
</div>
|
||
</a>
|
||
</div>
|
||
|
||
</nav>
|
||
|
||
</header>
|
||
|
||
<div class="md-container" data-md-component="container">
|
||
|
||
|
||
|
||
|
||
|
||
<nav class="md-tabs" aria-label="标签" data-md-component="tabs">
|
||
<div class="md-grid">
|
||
<ul class="md-tabs__list">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../.." class="md-tabs__link">
|
||
|
||
|
||
|
||
|
||
|
||
首页
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2001%3A%20vectors/01.%20vector%20spaces/" class="md-tabs__link">
|
||
|
||
|
||
|
||
向量
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2002%3A%20matrices/01.%20matrix%20properties/" class="md-tabs__link">
|
||
|
||
|
||
|
||
矩阵
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2003%3A%20calculus/01.%20differential%20calculus/" class="md-tabs__link">
|
||
|
||
|
||
|
||
微积分
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2004%3A%20statistics/01.%20fundamentals/" class="md-tabs__link">
|
||
|
||
|
||
|
||
统计学
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2005%3A%20probability/01.%20counting/" class="md-tabs__link">
|
||
|
||
|
||
|
||
概率论
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2006%3A%20machine%20learning/01.%20classical%20machine%20learning/" class="md-tabs__link">
|
||
|
||
|
||
|
||
机器学习
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2007%3A%20computational%20linguistics/01.%20linguistic%20foundations/" class="md-tabs__link">
|
||
|
||
|
||
|
||
计算语言学
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2008%3A%20computer%20vision/01.%20image%20fundamentals/" class="md-tabs__link">
|
||
|
||
|
||
|
||
计算机视觉
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item md-tabs__item--active">
|
||
<a href="../01.%20digital%20signal%20processing/" class="md-tabs__link">
|
||
|
||
|
||
|
||
音频与语音
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2010%3A%20multimodal%20learning/01.%20multimodal%20representations/" class="md-tabs__link">
|
||
|
||
|
||
|
||
多模态学习
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2011%3A%20autonomous%20systems/01.%20perception/" class="md-tabs__link">
|
||
|
||
|
||
|
||
自主系统
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2012%3A%20graph%20neural%20networks/01.%20geometric%20deep%20learning/" class="md-tabs__link">
|
||
|
||
|
||
|
||
图神经网络
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2013%3A%20computing%20and%20OS/01.%20discrete%20maths/" class="md-tabs__link">
|
||
|
||
|
||
|
||
计算机与操作系统
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/00.%20foundations/" class="md-tabs__link">
|
||
|
||
|
||
|
||
数据结构与算法
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2015%3A%20production%20software%20engineering/01.%20linux%20and%20CMD/" class="md-tabs__link">
|
||
|
||
|
||
|
||
生产级软件工程
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/00.%20why%20C%2B%2B%20and%20how%20ML%20frameworks%20work/" class="md-tabs__link">
|
||
|
||
|
||
|
||
SIMD 与 GPU 编程
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2017%3A%20AI%20inference/01.%20quantisation/" class="md-tabs__link">
|
||
|
||
|
||
|
||
AI 推理
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2018%3A%20ML%20systems%20design/01.%20systems%20design%20fundamentals/" class="md-tabs__link">
|
||
|
||
|
||
|
||
ML 系统设计
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2019%3A%20applied%20AI/01.%20AI%20for%20finance/" class="md-tabs__link">
|
||
|
||
|
||
|
||
应用 AI
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-tabs__item">
|
||
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/01.%20quantum%20machine%20learning/" class="md-tabs__link">
|
||
|
||
|
||
|
||
前沿 AI
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</div>
|
||
</nav>
|
||
|
||
|
||
|
||
<main class="md-main" data-md-component="main">
|
||
<div class="md-main__inner md-grid">
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="导航栏" data-md-level="0">
|
||
<label class="md-nav__title" for="__drawer">
|
||
<a href="../.." title="数学、计算机科学与 AI 百科全书" class="md-nav__button md-logo" aria-label="数学、计算机科学与 AI 百科全书" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
数学、计算机科学与 AI 百科全书
|
||
</label>
|
||
|
||
<div class="md-nav__source">
|
||
<a href="https://github.com/flykhan/maths-cs-ai-compendium-zh" title="前往仓库" class="md-source" data-md-component="source">
|
||
<div class="md-source__icon md-icon">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
|
||
</div>
|
||
<div class="md-source__repository">
|
||
flykhan/maths-cs-ai-compendium-zh
|
||
</div>
|
||
</a>
|
||
</div>
|
||
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../.." class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
首页
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_2" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
向量
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_2">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
向量
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2001%3A%20vectors/01.%20vector%20spaces/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
向量空间
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2001%3A%20vectors/02.%20vector%20properties/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
向量性质
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2001%3A%20vectors/03.%20norms%20and%20metrics/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
范数与度量
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2001%3A%20vectors/04.%20products/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
向量积
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2001%3A%20vectors/05.%20basis%20and%20duality/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
基与对偶性
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_3" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
矩阵
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_3">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
矩阵
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2002%3A%20matrices/01.%20matrix%20properties/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
矩阵性质
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2002%3A%20matrices/02.%20matrix%20types/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
矩阵类型
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2002%3A%20matrices/03.%20operations/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
矩阵运算
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2002%3A%20matrices/04.%20linear%20transformations/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
线性变换
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2002%3A%20matrices/05.%20decompositions/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
矩阵分解
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_4" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
微积分
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_4">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
微积分
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2003%3A%20calculus/01.%20differential%20calculus/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
微分
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2003%3A%20calculus/02.%20integral%20calculus/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
积分
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2003%3A%20calculus/03.%20multivariate%20calculus/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
多元微积分
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2003%3A%20calculus/04.%20function%20approximation/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
函数逼近
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2003%3A%20calculus/05.%20optimisation/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
优化
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_5" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
统计学
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_5">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
统计学
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2004%3A%20statistics/01.%20fundamentals/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
基础
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2004%3A%20statistics/02.%20measures/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
统计量
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2004%3A%20statistics/03.%20sampling/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
抽样
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2004%3A%20statistics/04.%20hypothesis%20testing/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
假设检验
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2004%3A%20statistics/05.%20inference/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
推断
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_6" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
概率论
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_6">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
概率论
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2005%3A%20probability/01.%20counting/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
计数
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2005%3A%20probability/02.%20probability%20concepts/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
概率概念
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2005%3A%20probability/03.%20distributions/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
分布
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2005%3A%20probability/04.%20bayesian/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
贝叶斯
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2005%3A%20probability/05.%20information%20theory/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
信息论
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_7" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_7" id="__nav_7_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
机器学习
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_7_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_7">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
机器学习
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2006%3A%20machine%20learning/01.%20classical%20machine%20learning/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
经典机器学习
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2006%3A%20machine%20learning/02.%20gradient%20machine%20learning/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
梯度机器学习
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2006%3A%20machine%20learning/03.%20deep%20learning/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
深度学习
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2006%3A%20machine%20learning/04.%20reinforcement%20learning/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
强化学习
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2006%3A%20machine%20learning/05.%20distributed%20deep%20learning/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
分布式深度学习
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<!-- Sidebar section for chapter 07: checkbox #__nav_8 is toggled by the two labels that reference it; the nested list links to the chapter's pages relative to this page. -->
<li class="md-nav__item md-nav__item--nested">
  <input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_8">
  <label class="md-nav__link" for="__nav_8" id="__nav_8_label" tabindex="0">
    <span class="md-ellipsis">
      计算语言学
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_8_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_8">
      <span class="md-nav__icon md-icon"></span>
      计算语言学
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../../chapter%2007%3A%20computational%20linguistics/01.%20linguistic%20foundations/" class="md-nav__link">
          <span class="md-ellipsis">
            语言学基础
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2007%3A%20computational%20linguistics/02.%20text%20processing%20and%20classic%20NLP/" class="md-nav__link">
          <span class="md-ellipsis">
            文本处理与经典 NLP
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2007%3A%20computational%20linguistics/03.%20embeddings%20and%20sequence%20models/" class="md-nav__link">
          <span class="md-ellipsis">
            嵌入与序列模型
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2007%3A%20computational%20linguistics/04.%20transformers%20and%20language%20models/" class="md-nav__link">
          <span class="md-ellipsis">
            Transformer 与语言模型
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2007%3A%20computational%20linguistics/05.%20advanced%20text%20generation/" class="md-nav__link">
          <span class="md-ellipsis">
            高级文本生成
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<!-- Sidebar section for chapter 08: checkbox #__nav_9 is toggled by the two labels that reference it; the nested list links to the chapter's pages relative to this page. -->
<li class="md-nav__item md-nav__item--nested">
  <input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_9">
  <label class="md-nav__link" for="__nav_9" id="__nav_9_label" tabindex="0">
    <span class="md-ellipsis">
      计算机视觉
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_9_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_9">
      <span class="md-nav__icon md-icon"></span>
      计算机视觉
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../../chapter%2008%3A%20computer%20vision/01.%20image%20fundamentals/" class="md-nav__link">
          <span class="md-ellipsis">
            图像基础
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2008%3A%20computer%20vision/02.%20convolutional%20networks/" class="md-nav__link">
          <span class="md-ellipsis">
            卷积网络
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2008%3A%20computer%20vision/03.%20object%20detection%20and%20segmentation/" class="md-nav__link">
          <span class="md-ellipsis">
            目标检测与分割
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2008%3A%20computer%20vision/04.%20vision%20transformers%20and%20generation/" class="md-nav__link">
          <span class="md-ellipsis">
            ViT 与生成模型
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2008%3A%20computer%20vision/05.%20video%20and%203D%20vision/" class="md-nav__link">
          <span class="md-ellipsis">
            视频与 3D 视觉
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<!-- Sidebar section for the current chapter (09): checkbox #__nav_10 starts checked and the nested nav is marked expanded. The active entry carries its own #__toc checkbox plus the in-page table of contents. -->
<li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
  <input class="md-nav__toggle md-toggle" type="checkbox" id="__nav_10" checked>
  <!-- No tabindex here: the generated value was an empty string, which is not a valid integer and is treated as absent. -->
  <label class="md-nav__link" for="__nav_10" id="__nav_10_label">
    <span class="md-ellipsis">
      音频与语音
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="true">
    <label class="md-nav__title" for="__nav_10">
      <span class="md-nav__icon md-icon"></span>
      音频与语音
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../01.%20digital%20signal%20processing/" class="md-nav__link">
          <span class="md-ellipsis">
            数字信号处理
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../02.%20automatic%20speech%20recognition/" class="md-nav__link">
          <span class="md-ellipsis">
            自动语音识别
          </span>
        </a>
      </li>
      <li class="md-nav__item md-nav__item--active">
        <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
        <label class="md-nav__link md-nav__link--active" for="__toc">
          <span class="md-ellipsis">
            语音合成
          </span>
          <span class="md-nav__icon md-icon"></span>
        </label>
        <!-- Link to the page being viewed; aria-current exposes that to assistive technology. -->
        <a href="./" class="md-nav__link md-nav__link--active" aria-current="page">
          <span class="md-ellipsis">
            语音合成
          </span>
        </a>
        <nav class="md-nav md-nav--secondary" aria-label="目录">
          <label class="md-nav__title" for="__toc">
            <span class="md-nav__icon md-icon"></span>
            目录
          </label>
          <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
            <li class="md-nav__item">
              <a href="#colab-notebook" class="md-nav__link">
                <span class="md-ellipsis">
                  编程任务(使用 CoLab 或 notebook)
                </span>
              </a>
            </li>
          </ul>
        </nav>
      </li>
      <li class="md-nav__item">
        <a href="../04.%20speaker%20and%20audio%20analysis/" class="md-nav__link">
          <span class="md-ellipsis">
            说话人与音频分析
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../05.%20source%20separation%20and%20noise/" class="md-nav__link">
          <span class="md-ellipsis">
            源分离与降噪
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<!-- Sidebar section for chapter 10: checkbox #__nav_11 is toggled by the two labels that reference it; the nested list links to the chapter's pages relative to this page. -->
<li class="md-nav__item md-nav__item--nested">
  <input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_11">
  <label class="md-nav__link" for="__nav_11" id="__nav_11_label" tabindex="0">
    <span class="md-ellipsis">
      多模态学习
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_11_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_11">
      <span class="md-nav__icon md-icon"></span>
      多模态学习
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../../chapter%2010%3A%20multimodal%20learning/01.%20multimodal%20representations/" class="md-nav__link">
          <span class="md-ellipsis">
            多模态表征
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2010%3A%20multimodal%20learning/02.%20vision%20language%20models/" class="md-nav__link">
          <span class="md-ellipsis">
            视觉语言模型
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2010%3A%20multimodal%20learning/03.%20image%20and%20video%20tokenisation/" class="md-nav__link">
          <span class="md-ellipsis">
            图像与视频 Token 化
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2010%3A%20multimodal%20learning/04.%20cross-modal%20generation/" class="md-nav__link">
          <span class="md-ellipsis">
            跨模态生成
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2010%3A%20multimodal%20learning/05.%20unified%20multimodal%20architectures/" class="md-nav__link">
          <span class="md-ellipsis">
            统一多模态架构
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<!-- Sidebar section for chapter 11: checkbox #__nav_12 is toggled by the two labels that reference it; the nested list links to the chapter's pages relative to this page. -->
<li class="md-nav__item md-nav__item--nested">
  <input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_12">
  <label class="md-nav__link" for="__nav_12" id="__nav_12_label" tabindex="0">
    <span class="md-ellipsis">
      自主系统
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_12_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_12">
      <span class="md-nav__icon md-icon"></span>
      自主系统
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../../chapter%2011%3A%20autonomous%20systems/01.%20perception/" class="md-nav__link">
          <span class="md-ellipsis">
            感知
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2011%3A%20autonomous%20systems/02.%20robot%20learning/" class="md-nav__link">
          <span class="md-ellipsis">
            机器人学习
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2011%3A%20autonomous%20systems/03.%20vision-language-action%20models/" class="md-nav__link">
          <span class="md-ellipsis">
            视觉-语言-动作模型
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2011%3A%20autonomous%20systems/04.%20self-driving/" class="md-nav__link">
          <span class="md-ellipsis">
            自动驾驶
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2011%3A%20autonomous%20systems/05.%20space%20and%20extreme%20robotics/" class="md-nav__link">
          <span class="md-ellipsis">
            太空与极端机器人
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<!-- Sidebar section for chapter 12: checkbox #__nav_13 is toggled by the two labels that reference it; the nested list links to the chapter's pages relative to this page. -->
<li class="md-nav__item md-nav__item--nested">
  <input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_13">
  <label class="md-nav__link" for="__nav_13" id="__nav_13_label" tabindex="0">
    <span class="md-ellipsis">
      图神经网络
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_13_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_13">
      <span class="md-nav__icon md-icon"></span>
      图神经网络
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../../chapter%2012%3A%20graph%20neural%20networks/01.%20geometric%20deep%20learning/" class="md-nav__link">
          <span class="md-ellipsis">
            几何深度学习
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2012%3A%20graph%20neural%20networks/02.%20graph%20theory/" class="md-nav__link">
          <span class="md-ellipsis">
            图论
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2012%3A%20graph%20neural%20networks/03.%20graph%20neural%20networks/" class="md-nav__link">
          <span class="md-ellipsis">
            图神经网络
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2012%3A%20graph%20neural%20networks/04.%20graph%20attention%20networks/" class="md-nav__link">
          <span class="md-ellipsis">
            图注意力网络
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2012%3A%20graph%20neural%20networks/05.%203d%20graph%20networks/" class="md-nav__link">
          <span class="md-ellipsis">
            3D 图网络
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<!-- Sidebar section for chapter 13: checkbox #__nav_14 is toggled by the two labels that reference it; the nested list links to the chapter's pages relative to this page. -->
<li class="md-nav__item md-nav__item--nested">
  <input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_14">
  <label class="md-nav__link" for="__nav_14" id="__nav_14_label" tabindex="0">
    <span class="md-ellipsis">
      计算机与操作系统
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_14_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_14">
      <span class="md-nav__icon md-icon"></span>
      计算机与操作系统
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../../chapter%2013%3A%20computing%20and%20OS/01.%20discrete%20maths/" class="md-nav__link">
          <span class="md-ellipsis">
            离散数学
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2013%3A%20computing%20and%20OS/02.%20computer%20architecture/" class="md-nav__link">
          <span class="md-ellipsis">
            计算机体系结构
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2013%3A%20computing%20and%20OS/03.%20operating%20systems/" class="md-nav__link">
          <span class="md-ellipsis">
            操作系统
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2013%3A%20computing%20and%20OS/04.%20concurrency%20and%20parallelism/" class="md-nav__link">
          <span class="md-ellipsis">
            并发与并行
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2013%3A%20computing%20and%20OS/05.%20programming%20languages/" class="md-nav__link">
          <span class="md-ellipsis">
            编程语言
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<!-- Sidebar section for chapter 14: checkbox #__nav_15 is toggled by the two labels that reference it; the nested list links to the chapter's pages relative to this page. -->
<li class="md-nav__item md-nav__item--nested">
  <input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_15">
  <label class="md-nav__link" for="__nav_15" id="__nav_15_label" tabindex="0">
    <span class="md-ellipsis">
      数据结构与算法
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_15_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_15">
      <span class="md-nav__icon md-icon"></span>
      数据结构与算法
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/00.%20foundations/" class="md-nav__link">
          <span class="md-ellipsis">
            基础
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/01.%20arrays%20and%20hashing/" class="md-nav__link">
          <span class="md-ellipsis">
            数组与哈希
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/02.%20linked%20lists%2C%20stacks%2C%20and%20queues/" class="md-nav__link">
          <span class="md-ellipsis">
            链表、栈与队列
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/03.%20trees/" class="md-nav__link">
          <span class="md-ellipsis">
            树
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/04.%20graphs/" class="md-nav__link">
          <span class="md-ellipsis">
            图
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2014%3A%20data%20structures%20and%20algorithms/05.%20sorting%20and%20search/" class="md-nav__link">
          <span class="md-ellipsis">
            排序与搜索
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<!-- Sidebar section for chapter 15: checkbox #__nav_16 is toggled by the two labels that reference it; the nested list links to the chapter's pages relative to this page. -->
<li class="md-nav__item md-nav__item--nested">
  <input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_16">
  <label class="md-nav__link" for="__nav_16" id="__nav_16_label" tabindex="0">
    <span class="md-ellipsis">
      生产级软件工程
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_16_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_16">
      <span class="md-nav__icon md-icon"></span>
      生产级软件工程
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../../chapter%2015%3A%20production%20software%20engineering/01.%20linux%20and%20CMD/" class="md-nav__link">
          <span class="md-ellipsis">
            Linux 与命令行
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2015%3A%20production%20software%20engineering/02.%20git%20and%20repository%20management/" class="md-nav__link">
          <span class="md-ellipsis">
            Git 与仓库管理
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2015%3A%20production%20software%20engineering/03.%20codebase%20design/" class="md-nav__link">
          <span class="md-ellipsis">
            代码设计
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2015%3A%20production%20software%20engineering/04.%20testing%20and%20quality%20assurance/" class="md-nav__link">
          <span class="md-ellipsis">
            测试与质量保障
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2015%3A%20production%20software%20engineering/05.%20deployment%20and%20devops/" class="md-nav__link">
          <span class="md-ellipsis">
            部署与 DevOps
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<!-- Sidebar section for chapter 16: checkbox #__nav_17 is toggled by the two labels that reference it; the nested list links to the chapter's pages relative to this page. -->
<li class="md-nav__item md-nav__item--nested">
  <input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_17">
  <label class="md-nav__link" for="__nav_17" id="__nav_17_label" tabindex="0">
    <span class="md-ellipsis">
      SIMD 与 GPU 编程
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_17_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_17">
      <span class="md-nav__icon md-icon"></span>
      SIMD 与 GPU 编程
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/00.%20why%20C%2B%2B%20and%20how%20ML%20frameworks%20work/" class="md-nav__link">
          <span class="md-ellipsis">
            为什么是 C++ 及 ML 框架原理
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/01.%20hardware%20fundamentals/" class="md-nav__link">
          <span class="md-ellipsis">
            硬件基础
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/02.%20ARM%20and%20NEON/" class="md-nav__link">
          <span class="md-ellipsis">
            ARM 与 NEON
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/03.%20x86%20and%20AVX/" class="md-nav__link">
          <span class="md-ellipsis">
            x86 与 AVX
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/04.%20GPU%20architecture%20and%20CUDA/" class="md-nav__link">
          <span class="md-ellipsis">
            GPU 架构与 CUDA
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <!-- The "pallax" spelling is part of the published URL path; keep it as-is unless the target page is renamed. -->
        <a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/05.%20triton%2C%20TPUs%20and%20pallax/" class="md-nav__link">
          <span class="md-ellipsis">
            Triton、TPU 与 Pallas
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/06.%20RISC-V%20and%20embedded%20systems/" class="md-nav__link">
          <span class="md-ellipsis">
            RISC-V 与嵌入式系统
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2016%3A%20SIMD%20and%20GPU%20programming/07.%20vulkan%20compute%20and%20cross-platform%20GPU/" class="md-nav__link">
          <span class="md-ellipsis">
            Vulkan Compute 与跨平台 GPU
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<!-- Sidebar section for chapter 17: checkbox #__nav_18 is toggled by the two labels that reference it; the nested list links to the chapter's pages relative to this page. -->
<li class="md-nav__item md-nav__item--nested">
  <input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_18">
  <label class="md-nav__link" for="__nav_18" id="__nav_18_label" tabindex="0">
    <span class="md-ellipsis">
      AI 推理
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_18_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_18">
      <span class="md-nav__icon md-icon"></span>
      AI 推理
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../../chapter%2017%3A%20AI%20inference/01.%20quantisation/" class="md-nav__link">
          <span class="md-ellipsis">
            量化
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2017%3A%20AI%20inference/02.%20efficient%20architectures/" class="md-nav__link">
          <span class="md-ellipsis">
            高效架构
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2017%3A%20AI%20inference/03.%20serving%20and%20batching/" class="md-nav__link">
          <span class="md-ellipsis">
            服务与批处理
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2017%3A%20AI%20inference/04.%20edge%20inference/" class="md-nav__link">
          <span class="md-ellipsis">
            边缘推理
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2017%3A%20AI%20inference/05.%20scaling%20and%20deployment/" class="md-nav__link">
          <span class="md-ellipsis">
            扩缩与部署
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<!-- Sidebar section for chapter 18: checkbox #__nav_19 is toggled by the two labels that reference it; the nested list links to the chapter's pages relative to this page. -->
<li class="md-nav__item md-nav__item--nested">
  <input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_19">
  <label class="md-nav__link" for="__nav_19" id="__nav_19_label" tabindex="0">
    <span class="md-ellipsis">
      ML 系统设计
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_19_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_19">
      <span class="md-nav__icon md-icon"></span>
      ML 系统设计
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../../chapter%2018%3A%20ML%20systems%20design/01.%20systems%20design%20fundamentals/" class="md-nav__link">
          <span class="md-ellipsis">
            系统设计基础
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2018%3A%20ML%20systems%20design/02.%20cloud%20computing/" class="md-nav__link">
          <span class="md-ellipsis">
            云计算
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2018%3A%20ML%20systems%20design/03.%20large%20scale%20infrastructure/" class="md-nav__link">
          <span class="md-ellipsis">
            大规模基础设施
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2018%3A%20ML%20systems%20design/04.%20ML%20systems%20design/" class="md-nav__link">
          <span class="md-ellipsis">
            ML 系统设计
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../../chapter%2018%3A%20ML%20systems%20design/05.%20ML%20design%20examples/" class="md-nav__link">
          <span class="md-ellipsis">
            ML 设计案例
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_20" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_20" id="__nav_20_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
应用 AI
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_20_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_20">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
应用 AI
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2019%3A%20applied%20AI/01.%20AI%20for%20finance/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
AI 金融
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2019%3A%20applied%20AI/02.%20protein%20design/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
蛋白质设计
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2019%3A%20applied%20AI/03.%20drug%20discovery/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
药物发现
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2019%3A%20applied%20AI/04.%20agentic%20systems/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
智能体系统
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2019%3A%20applied%20AI/05.%20healthcare/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
医疗健康
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle md-toggle--indeterminate" type="checkbox" id="__nav_21" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_21" id="__nav_21_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
前沿 AI
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_21_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_21">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
前沿 AI
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/01.%20quantum%20machine%20learning/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
量子机器学习
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/02.%20neuromorphic%20computing/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
神经形态计算
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/03.%20datacentres%20in%20space/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
太空数据中心
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/04.%20decentralised%20AI/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
去中心化 AI
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../chapter%2020%3A%20bleeding%20edge%20AI/05.%20brain%20machine%20interfaces/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
脑机接口
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
<nav class="md-nav md-nav--secondary" aria-label="目录">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__title" for="__toc">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
目录
|
||
</label>
|
||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#colab-notebook" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
编程任务(使用 CoLab 或 notebook)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-content" data-md-component="content">
|
||
|
||
<article class="md-content__inner md-typeset">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<h1 id="_1">语音合成与声音<a class="headerlink" href="#_1" title="Permanent link">¶</a></h1>
|
||
<p><em>语音合成(Text-to-Speech Synthesis)逆向执行 ASR 流水线,从书面文本生成自然听感的音频。本文涵盖 TTS 流水线(文本规范化、G2P、声学模型、声码器)、Tacotron、WaveNet、HiFi-GAN、声音克隆、声音转换以及语音活动检测(VAD)。</em></p>
|
||
<ul>
|
||
<li>
|
||
<p>在文件 01 中,我们构建了信号处理工具包:波形、语谱图、梅尔滤波器组和 MFCC。在文件 02 中,我们将语音转换为文本。现在我们反方向操作:给定文本,合成自然听感的语音。这就是<strong>语音合成(TTS)</strong>,一个同样通向声音转换、声音克隆和语音活动检测的问题。</p>
|
||
</li>
|
||
<li>
|
||
<p>将 TTS 想象成一场舞台表演。剧本就是文本输入。导演(声学模型)决定每句台词应该如何发音——音高、时长、重音。管弦乐队(声码器)随后演奏乐谱,产生听众实际听到的声波。现代神经 TTS 用媲美人类说话者的演绎,取代了基于规则系统那种僵硬、机械的发音。</p>
|
||
</li>
|
||
</ul>
|
||
<p><img alt="TTS 流水线:文本被规范化、转换为音素、由声学模型处理生成梅尔语谱图,然后通过声码器生成最终波形" src="../../images/tts_pipeline.svg" /></p>
|
||
<ul>
|
||
<li>
|
||
<p><strong>语音合成流水线</strong> 标准 TTS 流水线包含四个阶段:(1) 文本规范化,(2) 音素转换,(3) 声学模型,(4) 声码器。一些现代系统将阶段 3 和 4 合并为一个端到端模型,但这种概念分解仍然有用。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>文本规范化</strong> 将原始文本转换为可发音的形式。缩写展开("Dr."变为"Doctor")、数字变为词语("1984"变为"nineteen eighty-four")、货币符号被口头发音("$5"变为"five dollars"),以及处理 URL 或特殊字符。这一阶段通常基于规则和语言特定文法,不过也存在神经规范化模型。此处的错误会传播到所有下游阶段:如果"St."被读作"saint"而不是"street",整个发音就错了。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>字素到音素(G2P)转换</strong> 将规范化文本映射为音素序列。英语尤其不规则("though"、"through"、"tough"中的"ough"发音各不相同),因此词典查找(CMU 发音词典)处理常见词语,而神经序列到序列模型(第 06 章的编码器-解码器或第 07 章的 Transformer)处理词汇表外的词语。正字法较浅(拼写与发音高度对应)的语言,如西班牙语、芬兰语,只需要简单得多的 G2P。输出通常是 IPA(国际音标)序列或等效的内部音素集合。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>声学模型</strong> 接收音素序列并产生中间声学表示,几乎总是<strong>梅尔语谱图</strong>(文件 01)。梅尔语谱图捕获每个时间帧的频谱包络,编码了声码器重构波形所需的感知相关信息。声学模型必须决定时长(每个音素持续多久)、音高(基频 <span class="arithmatex">\(F_0\)</span>)和能量(响度)。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>声码器</strong> 接收梅尔语谱图并产生原始音频波形。这是一个不适定的反演问题:由于相位信息已被丢弃,许多波形可以产生相同的语谱图。经典声码器(Griffin-Lim、WORLD)使用迭代或信号模型方法,但神经声码器现在在质量上占主导地位。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>声码器:WaveNet</strong>(van den Oord 等人,2016)是第一个生成几乎与人类录音无法区分的语音的神经声码器。它自回归地对波形建模,在给定所有先前样本的条件下预测每个样本 <span class="arithmatex">\(x_t\)</span> 的概率分布:</p>
|
||
</li>
|
||
</ul>
|
||
<div class="arithmatex">\[P(x) = \prod_{t=1}^{T} P(x_t \mid x_1, \ldots, x_{t-1}, c)\]</div>
|
||
<ul>
|
||
<li>
|
||
<p>其中 <span class="arithmatex">\(c\)</span> 是条件信号(梅尔语谱图)。每个样本是 16 位,因此对 65536 个值进行朴素 softmax 是不切实际的。WaveNet 使用 <strong>μ-law 压扩</strong> 将其减少到 256 个量化级别,后来的变体则改用 logistic 混合分布。</p>
|
||
</li>
|
||
<li>
|
||
<p>WaveNet 的核心构建模块是<strong>扩张因果卷积</strong>。因果意味着滤波器权重只看过去样本(无未来泄露)。扩张意味着滤波器以指数增长的间隔跳过样本:扩张因子 <span class="arithmatex">\(1, 2, 4, 8, \ldots, 512\)</span>。这提供了指数级大的感受野,同时保持参数量线性增长。</p>
|
||
</li>
|
||
<li>
|
||
<p>每层的门控激活函数为:</p>
|
||
</li>
|
||
</ul>
|
||
<div class="arithmatex">\[z = \tanh(W_{f} \ast x) \odot \sigma(W_{g} \ast x)\]</div>
|
||
<ul>
|
||
<li>
|
||
<p>其中 <span class="arithmatex">\(W_f\)</span> 和 <span class="arithmatex">\(W_g\)</span> 是滤波器和门控卷积权重,<span class="arithmatex">\(\ast\)</span> 表示扩张因果卷积,<span class="arithmatex">\(\odot\)</span> 是逐元素乘法。这种门控机制(来自第 06 章的 LSTM)允许网络控制信息流。</p>
|
||
</li>
|
||
<li>
|
||
<p>WaveNet 产生卓越的质量,但推理速度极慢:生成一秒 24 kHz 音频需要 24000 次顺序前向传播。这推动了所有后续声码器研究。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>WaveRNN</strong>(Kalchbrenner 等人,2018)用单层循环网络取代了 WaveNet 的深层卷积堆叠。它将每个 16 位样本拆分为粗(高 8 位)和细(低 8 位)分量,使用 GRU(第 06 章)预测每个分量。这种双 softmax 方法显著减少了计算量,同时保持了高质量。经过精心内核优化后,WaveRNN 在移动 CPU 上足以实现实时运行。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>WaveGlow</strong>(Prenger 等人,2019)是一种基于<strong>流</strong>的声码器,完全避免了自回归生成。它使用一系列可逆变换(仿射耦合层,第 06 章的标准化流)将简单高斯分布映射到波形分布。训练使用变量变换公式最大化精确对数似然:</p>
|
||
</li>
|
||
</ul>
|
||
<div class="arithmatex">\[\log P(x) = \log P(z) + \sum_{i} \log \left| \det \frac{\partial f_i}{\partial f_{i-1}} \right|\]</div>
|
||
<ul>
|
||
<li>
|
||
<p>其中 <span class="arithmatex">\(z = f(x)\)</span> 是将 <span class="arithmatex">\(x\)</span> 送入流变换后得到的潜在变量。推理时,抽取样本 <span class="arithmatex">\(z \sim \mathcal{N}(0, I)\)</span>,并让它经过逆向流,在单次并行前向传播中得到波形。WaveGlow 用模型大小(耦合层的大网络)换取生成速度。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>HiFi-GAN</strong>(Kong 等人,2020)使用<strong>生成对抗网络</strong>从梅尔语谱图合成波形。生成器通过一系列转置卷积对梅尔语谱图进行上采样,每个卷积后跟一个<strong>多感受野融合(MRF)</strong>模块。MRF 模块并行应用多个具有不同核大小和扩张率的残差块,然后将它们的输出求和。这使得生成器能够同时捕获多个时间尺度的模式。</p>
|
||
</li>
|
||
</ul>
|
||
<p><img alt="HiFi-GAN 生成器架构:梅尔语谱图输入经过转置卷积上采样层,每层后跟多感受野融合块,这些融合块组合了具有不同扩张模式的并行残差堆叠" src="../../images/hifi_gan_generator.svg" /></p>
|
||
<ul>
|
||
<li>
|
||
<p>HiFi-GAN 使用两种鉴别器类型。<strong>多周期鉴别器(MPD)</strong>通过以不同周期(2、3、5、7、11)折叠一维波形,将其重塑为二维,然后应用二维卷积。这捕获了不同基频下的周期结构。<strong>多尺度鉴别器(MSD)</strong>在原始波形、2 倍降采样和 4 倍降采样版本上操作,捕获不同时间分辨率下的模式。</p>
|
||
</li>
|
||
<li>
|
||
<p>训练目标结合了对抗损失、<strong>梅尔语谱图重构损失</strong>(合成音频与真实音频的梅尔语谱图之间的 L1 距离)和<strong>特征匹配损失</strong>(中间鉴别器特征之间的 L1 距离):</p>
|
||
</li>
|
||
</ul>
|
||
<div class="arithmatex">\[\mathcal{L}_G = \mathcal{L}_{\text{adv}}(G) + \lambda_{\text{mel}} \mathcal{L}_{\text{mel}}(G) + \lambda_{\text{fm}} \mathcal{L}_{\text{fm}}(G)\]</div>
|
||
<ul>
|
||
<li>
|
||
<p>HiFi-GAN 实现了与 WaveNet 相当的合成质量,同时速度提升超过 1000 倍,可在单个 GPU 上实现实时生成。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>神经源-滤波器(NSF)模型</strong>将传统信号处理与神经网络相结合。在经典源-滤波器模型中,浊音由声源激励(基频 <span class="arithmatex">\(F_0\)</span> 处的周期脉冲序列)通过声道滤波器(频谱包络)产生。NSF 模型用神经网络替代手工设计的滤波器,同时保留显式源信号。输入的 <span class="arithmatex">\(F_0\)</span> 轮廓提供了纯数据驱动声码器有时难以处理的精细音高控制。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>声学模型:Tacotron</strong>(Wang 等人,2017)是第一个直接将字符序列转换为梅尔语谱图的端到端神经 TTS 系统。它使用带注意力机制的编码器-解码器架构(第 07 章)。编码器使用卷积组(convolution bank)、高速公路网络(highway network)和双向 GRU 处理字符/音素序列。解码器是一个自回归 GRU,逐个预测梅尔帧,使用前一帧和注意力上下文作为输入。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>Tacotron 2</strong>(Shen 等人,2018)显著改进了架构。编码器是一个 3 层一维卷积堆叠后跟双向 LSTM(第 06 章)。解码器是一个 2 层 LSTM,带<strong>位置敏感注意力</strong>,该注意力机制不仅基于编码器输出和解码器状态,还基于先前步骤累积的注意力权重来条件化。这防止了注意力跳过或重复词语的常见失败模式。</p>
|
||
</li>
|
||
</ul>
|
||
<p><img alt="Tacotron 2 架构:字符/音素编码器包含卷积层和 BiLSTM,位置敏感注意力对齐到梅尔语谱图帧,自回归解码器包含停止标记预测" src="../../images/tacotron2_architecture.svg" /></p>
|
||
<ul>
|
||
<li>解码器步骤 <span class="arithmatex">\(i\)</span> 下编码器位置 <span class="arithmatex">\(j\)</span> 的位置敏感注意力能量为:</li>
|
||
</ul>
|
||
<div class="arithmatex">\[e_{i,j} = w^T \tanh(W_s s_{i-1} + W_h h_j + W_f f_{i,j} + b)\]</div>
|
||
<ul>
|
||
<li>
|
||
<p>其中 <span class="arithmatex">\(s_{i-1}\)</span> 是前一个解码器状态,<span class="arithmatex">\(h_j\)</span> 是位置 <span class="arithmatex">\(j\)</span> 处的编码器输出,<span class="arithmatex">\(f_{i,j}\)</span> 是通过将累积注意力权重 <span class="arithmatex">\(\sum_{k&lt;i} \alpha_{k,j}\)</span> 与一维卷积滤波器卷积得到的位置特征。注意力权重通过在编码器位置 <span class="arithmatex">\(j\)</span> 上做 softmax 得到:<span class="arithmatex">\(\alpha_{i,j} = \text{softmax}_j(e_{i,j})\)</span>。</p>
|
||
</li>
|
||
<li>
|
||
<p>Tacotron 2 的解码器还在每个步骤预测一个<strong>停止标记</strong>概率,指示梅尔语谱图何时完成。输出的梅尔语谱图随后传递给声码器(最初是 WaveNet,后来被 HiFi-GAN 或类似模型取代)。</p>
|
||
</li>
|
||
<li>
|
||
<p>Tacotron 2 的自回归特性意味着合成速度受限于梅尔帧的数量。对于典型的每秒 80 帧的梅尔语谱图,一个 5 秒的发音需要 400 个顺序解码步骤。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>FastSpeech</strong>(Ren 等人,2019)使用<strong>非自回归</strong>声学模型解决了速度问题。FastSpeech 不是顺序生成梅尔帧,而是并行生成所有帧。关键挑战在于确定每个音素应该产生多少梅尔帧,FastSpeech 通过<strong>时长预测器</strong>来处理。</p>
|
||
</li>
|
||
<li>
|
||
<p>时长预测器是一个小型卷积网络,预测每个音素的整数时长(梅尔帧数)。训练期间,真实时长使用其注意力对齐从预训练的自回归教师模型(Tacotron 2)中提取。推理期间,使用预测时长通过<strong>长度调节器</strong>将音素级隐藏序列扩展到帧级,该调节器简单地将每个音素的隐藏表示重复预测的帧数。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>FastSpeech 2</strong>(Ren 等人,2021)通过移除教师-学生蒸馏改进了 FastSpeech。它直接使用强制对齐(来自文件 02 的声学模型框架)提取真实时长,并在时长之外添加了显式的音高(<span class="arithmatex">\(F_0\)</span>)和能量<strong>方差适配器</strong>。每个适配器是一个小型卷积预测器,其输出条件化解码器:</p>
|
||
</li>
|
||
</ul>
|
||
<div class="arithmatex">\[
|
||
\begin{aligned}
|
||
\hat{d}_i &= \text{DurationPredictor}(h_i) \\
|
||
\hat{p}_i &= \text{PitchPredictor}(h_i) \\
|
||
\hat{e}_i &= \text{EnergyPredictor}(h_i)
|
||
\end{aligned}
|
||
\]</div>
|
||
<ul>
|
||
<li>
|
||
<p>其中 <span class="arithmatex">\(h_i\)</span> 是音素 <span class="arithmatex">\(i\)</span> 的编码器隐藏状态。训练时使用真实值;推理时,预测值提供对韵律的显式控制。这种可控性是 FastSpeech 2 的主要优势:调整音高、速度或能量就像缩放预测器输出一样简单。</p>
|
||
</li>
|
||
<li>
|
||
<p>FastSpeech 2 在推理时通常比 Tacotron 2 快 10-20 倍,并避免了常见的自回归失败模式,如词语跳过、重复和注意力崩塌。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>VITS</strong>(Kim 等人,2021)是一个<strong>端到端</strong> TTS 模型,直接从文本生成波形,消除了独立的声码器阶段。VITS 结合了条件变分自编码器(第 06 章)、标准化流和对抗训练。后验编码器将真实梅尔语谱图映射到潜在空间,先验编码器将音素(通过基于 Transformer 的文本编码器和时长预测器)映射到同一潜在空间,解码器(基于 HiFi-GAN)从潜在样本生成波形。</p>
|
||
</li>
|
||
<li>
|
||
<p>VITS 的训练目标结合了:</p>
|
||
<ul>
|
||
<li><strong>重构损失</strong>:VAE 迫使潜在分布编码声学信息</li>
|
||
<li><strong>KL 散度</strong>:对齐文本条件化的先验与音频条件化的后验</li>
|
||
<li><strong>对抗损失</strong>:鉴别器确保波形质量</li>
|
||
<li><strong>时长损失</strong>:训练随机时长预测器</li>
|
||
</ul>
|
||
</li>
|
||
<li>
|
||
<p>VITS 比两阶段系统(FastSpeech 2 + HiFi-GAN)产生更高质量,因为声学模型和声码器被联合优化,避免了预测梅尔语谱图与真实梅尔语谱图之间的不匹配,这种不匹配会降低两阶段系统的性能。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>VALL-E</strong>(Wang 等人,2023)从根本上将 TTS 重构为离散音频令牌上的<strong>语言建模问题</strong>。它使用神经音频编解码器(EnCodec)将语音表示为来自多个码本级的一系列离散码。给定文本提示和一个 3 秒的注册话语(也编码为离散令牌),VALL-E 使用 Transformer 语言模型自回归地预测音频令牌。</p>
|
||
</li>
|
||
<li>
|
||
<p>VALL-E 使用两个模型:一个<strong>自回归(AR)模型</strong>逐个令牌地生成第一个码本级,以及一个<strong>非自回归(NAR)模型</strong>并行预测剩余的码本级,以第一个级别和彼此为条件。这种编解码器语言模型方法实现了卓越的零样本声音克隆:3 秒样本足以重现说话人的声音、音色,甚至情感基调。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>StyleTTS</strong>(Li 等人,2022)和 <strong>StyleTTS 2</strong> 将语音解耦为内容和风格组件。风格编码器从参考音频中提取风格向量,捕获说话人身份、韵律和录音条件。推理时,风格可以从学习的先验分布中采样,或从参考话语中迁移。StyleTTS 2 使用扩散模型(第 08 章)作为风格先验,生成多样化且自然的韵律。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>Kokoro</strong>(2024)是一个轻量级、高质量的开放源码 TTS 模型,以其小巧的规模(约 82M 参数)和令人印象深刻的自然度而著称。它采用受 StyleTTS 2 启发的架构,包含基于扩散的风格先验和微调的 ISTFTNet 声码器,该声码器直接预测 STFT 系数(来自文件 01)而不是原始波形样本。尽管模型大小仅为 VALL-E 等模型的一小部分,Kokoro 在英语、日语、法语、韩语和中文上实现了接近人类的自然度,证明了精心策划的训练数据和高效架构设计可以与暴力规模相抗衡。Kokoro 的小体积使其非常适合本地和边缘部署。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>Orpheus</strong>(Canopy Labs,2025)是一个开放源码 TTS 模型系列(1B 和 3B 参数),构建在 VALL-E 开创的<strong>编解码器语言模型</strong>范式之上。Orpheus 更进一步,使用 LLM 骨干网络(微调的 Llama 3)直接生成 SNAC 音频编解码器令牌。其突出特点是类似人类的情感表达能力:它能够以卓越的自然度处理笑声、叹息、犹豫和情感韵律。Orpheus 可以通过在输入文本中使用 <code>[laugh]</code> 或 <code>[sigh]</code> 等标签进行提示,从而对副语言表达进行细粒度控制。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>Dia</strong>(Nari Labs,2025)是一个开放源码对话 TTS 模型,从单个文本转录生成逼真的多说话人对话。Dia 构建在 1.6B 参数的编码器-解码器 Transformer 之上,处理对话中的话轮转换、说话人特定声音和非语言线索(笑声、停顿)。它还支持从简短音频提示进行声音克隆,从而在对话上下文中实现零样本说话人生成。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>Sesame CSM</strong>(会话语音模型,2025)专注于自然的多轮会话语音。Sesame 不是为了优化朗读式 TTS,而是对真实对话的动态进行建模:反馈词("嗯哼")、打断、说话人之间的节奏变化和情感响应。该模型使用以对话上下文(文本和音频历史)为条件的 Transformer 骨干网络,生成的语音风格能适应对话的流程。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>Fish Speech</strong>(Fish Audio,2024)是一个开放源码 TTS 系统,使用双自回归架构:一个大语言模型从文本生成语义令牌,一个较小模型将这些转换为 VQGAN 声学令牌,再由声码器解码为波形。Fish Speech 支持从 10-15 秒参考音频进行零样本声音克隆,并实现适合实时应用的低延迟。其模块化设计允许独立替换组件(例如,不同的声码器)。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>ChatTTS</strong>(2024)是一个开放源码会话 TTS 模型,专为聊天机器人和虚拟助手等对话应用设计。它通过在文本输入中嵌入特殊令牌,生成自然、会话风格的语音,并对韵律特征(笑声、停顿、填充词)进行细粒度控制。ChatTTS 支持中英混合合成和多说话人生成。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>Bark</strong>(Suno,2023)是一个基于 Transformer 的开放源码模型,从文本提示生成语音、音乐和音效。它使用三个阶段的 Transformer 模型流水线(文本 → 语义令牌 → 粗声学令牌 → 细声学令牌),并支持声音克隆、多语言合成以及音乐和环境音等非语音音频。Bark 的通用性以可控性为代价——它不如专用 TTS 系统精确,但更灵活。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>Parler-TTS</strong>(Hugging Face,2024)采用<strong>自然语言描述</strong>方式进行声音控制:用户无需提供参考音频片段来控制风格,而是提供文本描述,例如"一位女性说话者,声音温暖、富有表现力,在安静的房间中。"Parler-TTS 在带注释的语音数据上训练,其中每个话语都配有一个描述说话风格的自然语言描述,从而无需任何参考音频即可实现直观控制。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>Neuphonic</strong> 是一个基于 API 的 TTS 平台,针对超低延迟语音合成进行了优化,面向实时语音代理和会话 AI 应用。它通过流式架构实现低于 100 毫秒的首音时间,在完整输入文本可用之前就开始生成音频。Neuphonic 专注于部署和延迟优化层面,而不是新颖的模型架构,围绕现代神经 TTS 提供生产级基础设施。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>KittenTTS</strong> 是一个紧凑、快速的 TTS 模型,专为高效的低资源部署而设计。它优先考虑最小延迟和小模型大小,适用于边缘和嵌入式应用,以牺牲一定自然度换取在 CPU 和移动设备上的实时性能。</p>
|
||
</li>
|
||
<li>
|
||
<p>现代 TTS 格局正在分化为两种范式:(1) <strong>编解码器语言模型</strong>(VALL-E、Orpheus、Fish Speech),将语音生成视为离散音频码上的下一个令牌预测,利用 LLM 的扩展规律;以及 (2) <strong>流/扩散模型</strong>(VITS、StyleTTS 2、Kokoro),通过迭代细化生成连续梅尔语谱图或波形。编解码器语言模型在零样本克隆和表现力方面表现出色;流/扩散模型通常更小、更快。两者都在快速向人类级别的自然度收敛。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>韵律建模</strong>控制语音的"音乐":音高、时长、能量、节奏和语调。没有良好的韵律,即使单个音素清晰,合成语音听起来也平淡且机械。可以把韵律想象成单调的 GPS 语音与富有表现力的有声读物旁白之间的区别。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>音高</strong>(基频 <span class="arithmatex">\(F_0\)</span>)是语音感知的高低程度。它在问句末尾上升,在陈述句末尾下降,并在情感性语音中连续变化。<span class="arithmatex">\(F_0\)</span> 使用 CREPE(一种神经音高追踪器)或 YIN(基于自相关,来自文件 01)等算法从音频中提取。在 TTS 中,音高由声学模型预测(FastSpeech 2 的音高预测器)或隐式学习(Tacotron 2)。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>时长</strong>决定了语速和节奏。重读音节更长,功能词缩短,停顿标记短语边界。时长建模在非自回归模型(FastSpeech)中是显式的,在自回归模型(Tacotron 的注意力对齐决定时长)中是隐式的。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>能量</strong>(响度)承载着重音。"我没说<strong>他</strong>偷了" vs "我没说他<strong>偷</strong>了"具有完全不同的含义:前者暗示偷东西的另有其人,后者暗示他做的并不是"偷"。这种差别完全通过能量模式传达。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>风格嵌入</strong>捕获更高级的韵律模式。<strong>全局风格令牌(GST)</strong>框架(Wang 等人,2018)学习一个风格令牌库(对一组学习到的嵌入进行软注意力),捕获"兴奋"、"悲伤"或"低语"等说话风格。风格嵌入从参考话语中提取并添加到编码器输出中,允许在推理时进行风格迁移。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>声音转换(VC)</strong>改变话语的说话人身份,同时保留语言内容。想象一下录下自己的声音,然后让输出听起来像某个特定的目标说话人。VC 需要将说话人身份与内容解耦。</p>
|
||
</li>
|
||
</ul>
|
||
<p><img alt="声音转换流水线:源语音被分解为内容表示和说话人嵌入,目标说话人嵌入替换源说话人嵌入,解码器以目标声音重构语音" src="../../images/voice_conversion_pipeline.svg" /></p>
|
||
<ul>
|
||
<li>
|
||
<p><strong>说话人嵌入</strong>(在文件 04 中进一步详述)将说话人身份编码为固定维度的向量。这些可以来自预训练的说话人验证模型(x-vectors、ECAPA-TDNN)。在 VC 中,源语音被编码为与说话人无关的内容表示,然后使用目标说话人嵌入进行解码。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>解耦表示</strong>将语音分离为独立因素:内容(音素)、说话人身份、音高和节奏。方法包括:</p>
|
||
<ul>
|
||
<li><strong>信息瓶颈</strong>:将内容表示压缩得足够窄,使说话人信息无法通过瓶颈而被丢弃(AutoVC)</li>
|
||
<li><strong>对抗训练</strong>:在内容表示上训练说话人分类器,并使用梯度反转去除说话人信息</li>
|
||
<li><strong>向量量化</strong>:VQ-VAE 迫使内容通过离散瓶颈,这自然剥离了说话人身份(因为码本条目表示音素类别,而非说话人特征)</li>
|
||
</ul>
|
||
</li>
|
||
<li>
|
||
<p><strong>声音克隆</strong>以目标说话人的声音合成语音。<strong>多说话人 TTS</strong>在来自许多说话人的数据上训练,以说话人嵌入条件化模型。推理时,从注册音频中提取新说话人的嵌入,并用于条件化生成。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>少样本声音克隆</strong>使用少量数据(几分钟)适应新说话人。说话人编码器从注册音频中提取嵌入,TTS 模型以此嵌入为条件生成语音。这是 SV2TTS(Jia 等人,2018)中使用的方法:一个单独训练的说话人编码器、一个以说话人嵌入为条件的 Tacotron 2 合成器,以及一个 WaveRNN 声码器。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>零样本声音克隆</strong>完全不需要适应:一个简短的话语(3-30 秒)就足够了。VALL-E 通过将注册音频作为语言模型的提示来实现这一点。该模型学会以相同的声音继续生成,因为它是在大规模多说话人数据上训练的,其中话语内声音一致性是统计上的常态。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>语音活动检测(VAD)</strong>在每个时间帧回答一个简单的二值问题:是否有人在说话?尽管简单,VAD 是 ASR(文件 02)、说话人日志(文件 04)和降噪(文件 05)的关键预处理步骤。好的 VAD 通过跳过静音减少计算量,并通过防止噪声被作为语音处理来提高准确性。</p>
|
||
</li>
|
||
<li>
|
||
<p>经典 VAD 使用能量阈值法(语音比静音响亮)、过零率(语音具有特征性的过零模式)和频谱特征。这些在信噪比较低的嘈杂环境中会失效。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>神经 VAD</strong>模型将问题视为帧级二分类。小型 RNN 或 CNN 接收声学特征(来自文件 01 的对数梅尔能量)并预测语音/非语音概率。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>WebRTC VAD</strong>(Google)是一个经典轻量级 VAD,使用基于 GMM 的分类器对简单的频谱特征进行分类。它以四个激进级别(0-3)运行,速度极快,但在音乐、非语音发声和低 SNR 环境中表现不佳。由于其零依赖的简单性,它仍然被广泛用作基线。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>Silero VAD</strong>(Silero Team,2021)是生产环境中的事实标准神经 VAD。其架构是一个小型深度可分离一维卷积堆叠(第 08 章的 MobileNet 思路应用于音频),后跟一个用于时间上下文的单层 LSTM,最后是一个线性头产生每帧的语音概率。整个模型小于 2MB(约 1M 参数),以 30-100 ms 块处理音频。</p>
|
||
<ul>
|
||
<li><strong>输入</strong>:原始 16 kHz 音频(无需手动特征提取——卷积前端直接从波形中学习自己的特征)。</li>
|
||
<li><strong>窗口化有状态推理</strong>:LSTM 隐藏状态在块之间传递,因此模型处理流式音频而无需重新处理完整历史。每次调用处理一个 30、60 或 100 ms 的块,并返回 <span class="arithmatex">\([0, 1]\)</span> 范围内的语音概率。</li>
|
||
<li><strong>自适应阈值</strong>:Silero VAD 使用独立的开始和结束阈值,而不是单个固定阈值,并设有最小语音/静音持续时间,防止在噪声边界上快速切换。语音段必须超过开始阈值并持续最小时长才被确认,静音必须低于结束阈值持续一段时间后段才关闭。</li>
|
||
<li><strong>性能</strong>:Silero VAD 在 CPU 上以 1-2% 的实时因子运行(处理 1 秒音频约需 10-20 ms),使其适用于边缘设备、手机和实时流水线。它在嘈杂和音乐丰富的音频上显著优于 WebRTC VAD,同时保持足够小以便于设备端部署。</li>
|
||
<li>Silero VAD 通常用作 Whisper(文件 02)的前端,将长音频在转录前分割成话语级块,也用于说话人日志流水线(文件 04),在提取说话人嵌入之前识别语音区域。</li>
|
||
</ul>
|
||
</li>
|
||
<li>
|
||
<p><strong>声学活动检测(AAD)</strong>将 VAD 泛化为检测任何声学活动,而不仅仅是语音。这在智能家居设备、安防系统和野生动物监测中很有用。AAD 模型检测诸如玻璃破碎、狗叫或警报等事件,通常使用文件 04 中描述的音频分类框架。</p>
|
||
</li>
|
||
<li>
|
||
<p><strong>TTS 评估指标</strong>衡量客观质量和主观自然度:</p>
|
||
<ul>
|
||
<li><strong>平均意见得分(MOS)</strong>:人类听者在 1-5 量表上对自然度进行评分。黄金标准,但昂贵且缓慢。</li>
|
||
<li><strong>梅尔倒谱失真(MCD)</strong>:测量合成与参考梅尔倒谱之间的距离。越低越好,但并不总是与感知相关。</li>
|
||
<li><strong>PESQ / POLQA</strong>:最初为电话语音设计的标准化感知评估指标。</li>
|
||
<li><strong>说话人相似度</strong>:合成与参考音频的说话人嵌入之间的余弦相似度(与声音克隆相关)。</li>
|
||
<li><strong>可懂度</strong>:将合成音频输入 ASR 系统(文件 02)并计算词错误率(WER)来衡量。</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h2 id="colab-notebook">编程任务(使用 CoLab 或 notebook)<a class="headerlink" href="#colab-notebook" title="Permanent link">¶</a></h2>
|
||
<ul>
|
||
<li><strong>任务 1:基于梅尔语谱图的 Griffin-Lim 声码器。</strong> 实现 Griffin-Lim 迭代相位重构算法,将梅尔语谱图转换回波形。这演示了声码器问题以及为何需要神经声码器。</li>
|
||
</ul>
|
||
<div class="highlight"><pre><span></span><code><a id="__codelineno-0-1" name="__codelineno-0-1" href="#__codelineno-0-1"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax</span>
|
||
<a id="__codelineno-0-2" name="__codelineno-0-2" href="#__codelineno-0-2"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax.numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">jnp</span>
|
||
<a id="__codelineno-0-3" name="__codelineno-0-3" href="#__codelineno-0-3"></a><span class="kn">import</span><span class="w"> </span><span class="nn">matplotlib.pyplot</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">plt</span>
|
||
<a id="__codelineno-0-4" name="__codelineno-0-4" href="#__codelineno-0-4"></a>
|
||
<a id="__codelineno-0-5" name="__codelineno-0-5" href="#__codelineno-0-5"></a><span class="c1"># 生成合成波形(模拟元音的谐波之和)</span>
|
||
<a id="__codelineno-0-6" name="__codelineno-0-6" href="#__codelineno-0-6"></a><span class="n">sr</span> <span class="o">=</span> <span class="mi">16000</span>
|
||
<a id="__codelineno-0-7" name="__codelineno-0-7" href="#__codelineno-0-7"></a><span class="n">duration</span> <span class="o">=</span> <span class="mf">1.0</span>
|
||
<a id="__codelineno-0-8" name="__codelineno-0-8" href="#__codelineno-0-8"></a><span class="n">t</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">linspace</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">duration</span><span class="p">,</span> <span class="nb">int</span><span class="p">(</span><span class="n">sr</span> <span class="o">*</span> <span class="n">duration</span><span class="p">))</span>
|
||
<a id="__codelineno-0-9" name="__codelineno-0-9" href="#__codelineno-0-9"></a><span class="n">f0</span> <span class="o">=</span> <span class="mf">220.0</span> <span class="c1"># 基频</span>
|
||
<a id="__codelineno-0-10" name="__codelineno-0-10" href="#__codelineno-0-10"></a><span class="n">waveform</span> <span class="o">=</span> <span class="p">(</span>
|
||
<a id="__codelineno-0-11" name="__codelineno-0-11" href="#__codelineno-0-11"></a> <span class="mf">0.6</span> <span class="o">*</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sin</span><span class="p">(</span><span class="mi">2</span> <span class="o">*</span> <span class="n">jnp</span><span class="o">.</span><span class="n">pi</span> <span class="o">*</span> <span class="n">f0</span> <span class="o">*</span> <span class="n">t</span><span class="p">)</span> <span class="o">+</span>
|
||
<a id="__codelineno-0-12" name="__codelineno-0-12" href="#__codelineno-0-12"></a> <span class="mf">0.3</span> <span class="o">*</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sin</span><span class="p">(</span><span class="mi">2</span> <span class="o">*</span> <span class="n">jnp</span><span class="o">.</span><span class="n">pi</span> <span class="o">*</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">f0</span> <span class="o">*</span> <span class="n">t</span><span class="p">)</span> <span class="o">+</span>
|
||
<a id="__codelineno-0-13" name="__codelineno-0-13" href="#__codelineno-0-13"></a> <span class="mf">0.1</span> <span class="o">*</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sin</span><span class="p">(</span><span class="mi">2</span> <span class="o">*</span> <span class="n">jnp</span><span class="o">.</span><span class="n">pi</span> <span class="o">*</span> <span class="mi">3</span> <span class="o">*</span> <span class="n">f0</span> <span class="o">*</span> <span class="n">t</span><span class="p">)</span>
|
||
<a id="__codelineno-0-14" name="__codelineno-0-14" href="#__codelineno-0-14"></a><span class="p">)</span>
|
||
<a id="__codelineno-0-15" name="__codelineno-0-15" href="#__codelineno-0-15"></a>
|
||
<a id="__codelineno-0-16" name="__codelineno-0-16" href="#__codelineno-0-16"></a><span class="c1"># 计算 STFT</span>
|
||
<a id="__codelineno-0-17" name="__codelineno-0-17" href="#__codelineno-0-17"></a><span class="n">n_fft</span> <span class="o">=</span> <span class="mi">1024</span>
|
||
<a id="__codelineno-0-18" name="__codelineno-0-18" href="#__codelineno-0-18"></a><span class="n">hop_length</span> <span class="o">=</span> <span class="mi">256</span>
|
||
<a id="__codelineno-0-19" name="__codelineno-0-19" href="#__codelineno-0-19"></a><span class="n">window</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">hanning</span><span class="p">(</span><span class="n">n_fft</span><span class="p">)</span>
|
||
<a id="__codelineno-0-20" name="__codelineno-0-20" href="#__codelineno-0-20"></a>
|
||
<a id="__codelineno-0-21" name="__codelineno-0-21" href="#__codelineno-0-21"></a><span class="k">def</span><span class="w"> </span><span class="nf">stft</span><span class="p">(</span><span class="n">signal</span><span class="p">,</span> <span class="n">n_fft</span><span class="p">,</span> <span class="n">hop_length</span><span class="p">,</span> <span class="n">window</span><span class="p">):</span>
|
||
<a id="__codelineno-0-22" name="__codelineno-0-22" href="#__codelineno-0-22"></a><span class="w"> </span><span class="sd">"""计算短时傅里叶变换。"""</span>
|
||
<a id="__codelineno-0-23" name="__codelineno-0-23" href="#__codelineno-0-23"></a> <span class="n">n_frames</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">+</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">signal</span><span class="p">)</span> <span class="o">-</span> <span class="n">n_fft</span><span class="p">)</span> <span class="o">//</span> <span class="n">hop_length</span>
|
||
<a id="__codelineno-0-24" name="__codelineno-0-24" href="#__codelineno-0-24"></a> <span class="n">frames</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">stack</span><span class="p">([</span>
|
||
<a id="__codelineno-0-25" name="__codelineno-0-25" href="#__codelineno-0-25"></a> <span class="n">signal</span><span class="p">[</span><span class="n">i</span> <span class="o">*</span> <span class="n">hop_length</span> <span class="p">:</span> <span class="n">i</span> <span class="o">*</span> <span class="n">hop_length</span> <span class="o">+</span> <span class="n">n_fft</span><span class="p">]</span> <span class="o">*</span> <span class="n">window</span>
|
||
<a id="__codelineno-0-26" name="__codelineno-0-26" href="#__codelineno-0-26"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n_frames</span><span class="p">)</span>
|
||
<a id="__codelineno-0-27" name="__codelineno-0-27" href="#__codelineno-0-27"></a> <span class="p">])</span>
|
||
<a id="__codelineno-0-28" name="__codelineno-0-28" href="#__codelineno-0-28"></a> <span class="k">return</span> <span class="n">jnp</span><span class="o">.</span><span class="n">fft</span><span class="o">.</span><span class="n">rfft</span><span class="p">(</span><span class="n">frames</span><span class="p">,</span> <span class="n">n</span><span class="o">=</span><span class="n">n_fft</span><span class="p">)</span>
|
||
<a id="__codelineno-0-29" name="__codelineno-0-29" href="#__codelineno-0-29"></a>
|
||
<a id="__codelineno-0-30" name="__codelineno-0-30" href="#__codelineno-0-30"></a><span class="k">def</span><span class="w"> </span><span class="nf">istft</span><span class="p">(</span><span class="n">stft_matrix</span><span class="p">,</span> <span class="n">hop_length</span><span class="p">,</span> <span class="n">window</span><span class="p">,</span> <span class="n">length</span><span class="p">):</span>
|
||
<a id="__codelineno-0-31" name="__codelineno-0-31" href="#__codelineno-0-31"></a><span class="w"> </span><span class="sd">"""使用重叠相加法计算逆 STFT。"""</span>
|
||
<a id="__codelineno-0-32" name="__codelineno-0-32" href="#__codelineno-0-32"></a> <span class="n">n_fft</span> <span class="o">=</span> <span class="p">(</span><span class="n">stft_matrix</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">*</span> <span class="mi">2</span>
|
||
<a id="__codelineno-0-33" name="__codelineno-0-33" href="#__codelineno-0-33"></a> <span class="n">n_frames</span> <span class="o">=</span> <span class="n">stft_matrix</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
|
||
<a id="__codelineno-0-34" name="__codelineno-0-34" href="#__codelineno-0-34"></a> <span class="n">frames</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">fft</span><span class="o">.</span><span class="n">irfft</span><span class="p">(</span><span class="n">stft_matrix</span><span class="p">,</span> <span class="n">n</span><span class="o">=</span><span class="n">n_fft</span><span class="p">)</span>
|
||
<a id="__codelineno-0-35" name="__codelineno-0-35" href="#__codelineno-0-35"></a> <span class="n">frames</span> <span class="o">=</span> <span class="n">frames</span> <span class="o">*</span> <span class="n">window</span><span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="p">:]</span>
|
||
<a id="__codelineno-0-36" name="__codelineno-0-36" href="#__codelineno-0-36"></a> <span class="n">output</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">length</span><span class="p">)</span>
|
||
<a id="__codelineno-0-37" name="__codelineno-0-37" href="#__codelineno-0-37"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n_frames</span><span class="p">):</span>
|
||
<a id="__codelineno-0-38" name="__codelineno-0-38" href="#__codelineno-0-38"></a> <span class="n">start</span> <span class="o">=</span> <span class="n">i</span> <span class="o">*</span> <span class="n">hop_length</span>
|
||
<a id="__codelineno-0-39" name="__codelineno-0-39" href="#__codelineno-0-39"></a> <span class="n">end</span> <span class="o">=</span> <span class="n">start</span> <span class="o">+</span> <span class="n">n_fft</span>
|
||
<a id="__codelineno-0-40" name="__codelineno-0-40" href="#__codelineno-0-40"></a> <span class="k">if</span> <span class="n">end</span> <span class="o"><=</span> <span class="n">length</span><span class="p">:</span>
|
||
<a id="__codelineno-0-41" name="__codelineno-0-41" href="#__codelineno-0-41"></a> <span class="n">output</span> <span class="o">=</span> <span class="n">output</span><span class="o">.</span><span class="n">at</span><span class="p">[</span><span class="n">start</span><span class="p">:</span><span class="n">end</span><span class="p">]</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">frames</span><span class="p">[</span><span class="n">i</span><span class="p">])</span>
|
||
<a id="__codelineno-0-42" name="__codelineno-0-42" href="#__codelineno-0-42"></a> <span class="k">return</span> <span class="n">output</span>
|
||
<a id="__codelineno-0-43" name="__codelineno-0-43" href="#__codelineno-0-43"></a>
|
||
<a id="__codelineno-0-44" name="__codelineno-0-44" href="#__codelineno-0-44"></a><span class="c1"># 正向 STFT</span>
|
||
<a id="__codelineno-0-45" name="__codelineno-0-45" href="#__codelineno-0-45"></a><span class="n">S</span> <span class="o">=</span> <span class="n">stft</span><span class="p">(</span><span class="n">waveform</span><span class="p">,</span> <span class="n">n_fft</span><span class="p">,</span> <span class="n">hop_length</span><span class="p">,</span> <span class="n">window</span><span class="p">)</span>
|
||
<a id="__codelineno-0-46" name="__codelineno-0-46" href="#__codelineno-0-46"></a><span class="n">magnitude</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">S</span><span class="p">)</span>
|
||
<a id="__codelineno-0-47" name="__codelineno-0-47" href="#__codelineno-0-47"></a>
|
||
<a id="__codelineno-0-48" name="__codelineno-0-48" href="#__codelineno-0-48"></a><span class="c1"># 梅尔滤波器组</span>
|
||
<a id="__codelineno-0-49" name="__codelineno-0-49" href="#__codelineno-0-49"></a><span class="n">n_mels</span> <span class="o">=</span> <span class="mi">80</span>
|
||
<a id="__codelineno-0-50" name="__codelineno-0-50" href="#__codelineno-0-50"></a><span class="n">mel_low</span> <span class="o">=</span> <span class="mf">0.0</span>
|
||
<a id="__codelineno-0-51" name="__codelineno-0-51" href="#__codelineno-0-51"></a><span class="n">mel_high</span> <span class="o">=</span> <span class="mi">2595</span> <span class="o">*</span> <span class="n">jnp</span><span class="o">.</span><span class="n">log10</span><span class="p">(</span><span class="mi">1</span> <span class="o">+</span> <span class="p">(</span><span class="n">sr</span> <span class="o">/</span> <span class="mi">2</span><span class="p">)</span> <span class="o">/</span> <span class="mi">700</span><span class="p">)</span>
|
||
<a id="__codelineno-0-52" name="__codelineno-0-52" href="#__codelineno-0-52"></a><span class="n">mel_points</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">linspace</span><span class="p">(</span><span class="n">mel_low</span><span class="p">,</span> <span class="n">mel_high</span><span class="p">,</span> <span class="n">n_mels</span> <span class="o">+</span> <span class="mi">2</span><span class="p">)</span>
|
||
<a id="__codelineno-0-53" name="__codelineno-0-53" href="#__codelineno-0-53"></a><span class="n">hz_points</span> <span class="o">=</span> <span class="mi">700</span> <span class="o">*</span> <span class="p">(</span><span class="mi">10</span> <span class="o">**</span> <span class="p">(</span><span class="n">mel_points</span> <span class="o">/</span> <span class="mi">2595</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span>
|
||
<a id="__codelineno-0-54" name="__codelineno-0-54" href="#__codelineno-0-54"></a><span class="n">freq_bins</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">floor</span><span class="p">((</span><span class="n">n_fft</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span> <span class="o">*</span> <span class="n">hz_points</span> <span class="o">/</span> <span class="n">sr</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">int</span><span class="p">)</span>
|
||
<a id="__codelineno-0-55" name="__codelineno-0-55" href="#__codelineno-0-55"></a>
|
||
<a id="__codelineno-0-56" name="__codelineno-0-56" href="#__codelineno-0-56"></a><span class="n">mel_filterbank</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">n_mels</span><span class="p">,</span> <span class="n">n_fft</span> <span class="o">//</span> <span class="mi">2</span> <span class="o">+</span> <span class="mi">1</span><span class="p">))</span>
|
||
<a id="__codelineno-0-57" name="__codelineno-0-57" href="#__codelineno-0-57"></a><span class="k">for</span> <span class="n">m</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n_mels</span><span class="p">):</span>
|
||
<a id="__codelineno-0-58" name="__codelineno-0-58" href="#__codelineno-0-58"></a> <span class="n">f_left</span> <span class="o">=</span> <span class="n">freq_bins</span><span class="p">[</span><span class="n">m</span><span class="p">]</span>
|
||
<a id="__codelineno-0-59" name="__codelineno-0-59" href="#__codelineno-0-59"></a> <span class="n">f_center</span> <span class="o">=</span> <span class="n">freq_bins</span><span class="p">[</span><span class="n">m</span> <span class="o">+</span> <span class="mi">1</span><span class="p">]</span>
|
||
<a id="__codelineno-0-60" name="__codelineno-0-60" href="#__codelineno-0-60"></a> <span class="n">f_right</span> <span class="o">=</span> <span class="n">freq_bins</span><span class="p">[</span><span class="n">m</span> <span class="o">+</span> <span class="mi">2</span><span class="p">]</span>
|
||
<a id="__codelineno-0-61" name="__codelineno-0-61" href="#__codelineno-0-61"></a> <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">f_left</span><span class="p">,</span> <span class="n">f_center</span><span class="p">):</span>
|
||
<a id="__codelineno-0-62" name="__codelineno-0-62" href="#__codelineno-0-62"></a> <span class="n">mel_filterbank</span> <span class="o">=</span> <span class="n">mel_filterbank</span><span class="o">.</span><span class="n">at</span><span class="p">[</span><span class="n">m</span><span class="p">,</span> <span class="n">k</span><span class="p">]</span><span class="o">.</span><span class="n">set</span><span class="p">(</span>
|
||
<a id="__codelineno-0-63" name="__codelineno-0-63" href="#__codelineno-0-63"></a> <span class="p">(</span><span class="n">k</span> <span class="o">-</span> <span class="n">f_left</span><span class="p">)</span> <span class="o">/</span> <span class="nb">max</span><span class="p">(</span><span class="n">f_center</span> <span class="o">-</span> <span class="n">f_left</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
|
||
<a id="__codelineno-0-64" name="__codelineno-0-64" href="#__codelineno-0-64"></a> <span class="p">)</span>
|
||
<a id="__codelineno-0-65" name="__codelineno-0-65" href="#__codelineno-0-65"></a> <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">f_center</span><span class="p">,</span> <span class="n">f_right</span><span class="p">):</span>
|
||
<a id="__codelineno-0-66" name="__codelineno-0-66" href="#__codelineno-0-66"></a> <span class="n">mel_filterbank</span> <span class="o">=</span> <span class="n">mel_filterbank</span><span class="o">.</span><span class="n">at</span><span class="p">[</span><span class="n">m</span><span class="p">,</span> <span class="n">k</span><span class="p">]</span><span class="o">.</span><span class="n">set</span><span class="p">(</span>
|
||
<a id="__codelineno-0-67" name="__codelineno-0-67" href="#__codelineno-0-67"></a> <span class="p">(</span><span class="n">f_right</span> <span class="o">-</span> <span class="n">k</span><span class="p">)</span> <span class="o">/</span> <span class="nb">max</span><span class="p">(</span><span class="n">f_right</span> <span class="o">-</span> <span class="n">f_center</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
|
||
<a id="__codelineno-0-68" name="__codelineno-0-68" href="#__codelineno-0-68"></a> <span class="p">)</span>
|
||
<a id="__codelineno-0-69" name="__codelineno-0-69" href="#__codelineno-0-69"></a>
|
||
<a id="__codelineno-0-70" name="__codelineno-0-70" href="#__codelineno-0-70"></a><span class="c1"># 转换到梅尔域,再用伪逆还原回线性幅度谱</span>
|
||
<a id="__codelineno-0-71" name="__codelineno-0-71" href="#__codelineno-0-71"></a><span class="n">mel_spec</span> <span class="o">=</span> <span class="n">magnitude</span> <span class="o">@</span> <span class="n">mel_filterbank</span><span class="o">.</span><span class="n">T</span>
|
||
<a id="__codelineno-0-72" name="__codelineno-0-72" href="#__codelineno-0-72"></a><span class="n">magnitude_reconstructed</span> <span class="o">=</span> <span class="n">mel_spec</span> <span class="o">@</span> <span class="n">jnp</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">pinv</span><span class="p">(</span><span class="n">mel_filterbank</span><span class="o">.</span><span class="n">T</span><span class="p">)</span>
|
||
<a id="__codelineno-0-73" name="__codelineno-0-73" href="#__codelineno-0-73"></a><span class="n">magnitude_reconstructed</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">maximum</span><span class="p">(</span><span class="n">magnitude_reconstructed</span><span class="p">,</span> <span class="mf">1e-7</span><span class="p">)</span>
|
||
<a id="__codelineno-0-74" name="__codelineno-0-74" href="#__codelineno-0-74"></a>
|
||
<a id="__codelineno-0-75" name="__codelineno-0-75" href="#__codelineno-0-75"></a><span class="c1"># Griffin-Lim 算法</span>
|
||
<a id="__codelineno-0-76" name="__codelineno-0-76" href="#__codelineno-0-76"></a><span class="k">def</span><span class="w"> </span><span class="nf">griffin_lim</span><span class="p">(</span><span class="n">magnitude</span><span class="p">,</span> <span class="n">n_iter</span><span class="p">,</span> <span class="n">hop_length</span><span class="p">,</span> <span class="n">window</span><span class="p">,</span> <span class="n">signal_length</span><span class="p">):</span>
|
||
<a id="__codelineno-0-77" name="__codelineno-0-77" href="#__codelineno-0-77"></a><span class="w"> </span><span class="sd">"""迭代相位重构。"""</span>
|
||
<a id="__codelineno-0-78" name="__codelineno-0-78" href="#__codelineno-0-78"></a> <span class="n">n_fft</span> <span class="o">=</span> <span class="p">(</span><span class="n">magnitude</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">*</span> <span class="mi">2</span>
|
||
<a id="__codelineno-0-79" name="__codelineno-0-79" href="#__codelineno-0-79"></a> <span class="n">key</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">PRNGKey</span><span class="p">(</span><span class="mi">42</span><span class="p">)</span>
|
||
<a id="__codelineno-0-80" name="__codelineno-0-80" href="#__codelineno-0-80"></a> <span class="n">phase</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">magnitude</span><span class="o">.</span><span class="n">shape</span><span class="p">,</span> <span class="n">minval</span><span class="o">=-</span><span class="n">jnp</span><span class="o">.</span><span class="n">pi</span><span class="p">,</span> <span class="n">maxval</span><span class="o">=</span><span class="n">jnp</span><span class="o">.</span><span class="n">pi</span><span class="p">)</span>
|
||
<a id="__codelineno-0-81" name="__codelineno-0-81" href="#__codelineno-0-81"></a>
|
||
<a id="__codelineno-0-82" name="__codelineno-0-82" href="#__codelineno-0-82"></a> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n_iter</span><span class="p">):</span>
|
||
<a id="__codelineno-0-83" name="__codelineno-0-83" href="#__codelineno-0-83"></a> <span class="n">complex_spec</span> <span class="o">=</span> <span class="n">magnitude</span> <span class="o">*</span> <span class="n">jnp</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="mi">1</span><span class="n">j</span> <span class="o">*</span> <span class="n">phase</span><span class="p">)</span>
|
||
<a id="__codelineno-0-84" name="__codelineno-0-84" href="#__codelineno-0-84"></a> <span class="n">signal</span> <span class="o">=</span> <span class="n">istft</span><span class="p">(</span><span class="n">complex_spec</span><span class="p">,</span> <span class="n">hop_length</span><span class="p">,</span> <span class="n">window</span><span class="p">,</span> <span class="n">signal_length</span><span class="p">)</span>
|
||
<a id="__codelineno-0-85" name="__codelineno-0-85" href="#__codelineno-0-85"></a> <span class="n">reanalysis</span> <span class="o">=</span> <span class="n">stft</span><span class="p">(</span><span class="n">signal</span><span class="p">,</span> <span class="n">n_fft</span><span class="p">,</span> <span class="n">hop_length</span><span class="p">,</span> <span class="n">window</span><span class="p">)</span>
|
||
<a id="__codelineno-0-86" name="__codelineno-0-86" href="#__codelineno-0-86"></a> <span class="n">phase</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">angle</span><span class="p">(</span><span class="n">reanalysis</span><span class="p">)</span>
|
||
<a id="__codelineno-0-87" name="__codelineno-0-87" href="#__codelineno-0-87"></a>
|
||
<a id="__codelineno-0-88" name="__codelineno-0-88" href="#__codelineno-0-88"></a> <span class="n">complex_spec</span> <span class="o">=</span> <span class="n">magnitude</span> <span class="o">*</span> <span class="n">jnp</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="mi">1</span><span class="n">j</span> <span class="o">*</span> <span class="n">phase</span><span class="p">)</span>
|
||
<a id="__codelineno-0-89" name="__codelineno-0-89" href="#__codelineno-0-89"></a> <span class="k">return</span> <span class="n">istft</span><span class="p">(</span><span class="n">complex_spec</span><span class="p">,</span> <span class="n">hop_length</span><span class="p">,</span> <span class="n">window</span><span class="p">,</span> <span class="n">signal_length</span><span class="p">)</span>
|
||
<a id="__codelineno-0-90" name="__codelineno-0-90" href="#__codelineno-0-90"></a>
|
||
<a id="__codelineno-0-91" name="__codelineno-0-91" href="#__codelineno-0-91"></a><span class="n">reconstructed</span> <span class="o">=</span> <span class="n">griffin_lim</span><span class="p">(</span><span class="n">magnitude_reconstructed</span><span class="p">,</span> <span class="n">n_iter</span><span class="o">=</span><span class="mi">60</span><span class="p">,</span> <span class="n">hop_length</span><span class="o">=</span><span class="n">hop_length</span><span class="p">,</span>
|
||
<a id="__codelineno-0-92" name="__codelineno-0-92" href="#__codelineno-0-92"></a> <span class="n">window</span><span class="o">=</span><span class="n">window</span><span class="p">,</span> <span class="n">signal_length</span><span class="o">=</span><span class="nb">len</span><span class="p">(</span><span class="n">waveform</span><span class="p">))</span>
|
||
<a id="__codelineno-0-93" name="__codelineno-0-93" href="#__codelineno-0-93"></a>
|
||
<a id="__codelineno-0-94" name="__codelineno-0-94" href="#__codelineno-0-94"></a><span class="c1"># 绘制对比图</span>
|
||
<a id="__codelineno-0-95" name="__codelineno-0-95" href="#__codelineno-0-95"></a><span class="n">fig</span><span class="p">,</span> <span class="n">axes</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">8</span><span class="p">))</span>
|
||
<a id="__codelineno-0-96" name="__codelineno-0-96" href="#__codelineno-0-96"></a>
|
||
<a id="__codelineno-0-97" name="__codelineno-0-97" href="#__codelineno-0-97"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">t</span><span class="p">[:</span><span class="mi">1000</span><span class="p">],</span> <span class="n">waveform</span><span class="p">[:</span><span class="mi">1000</span><span class="p">],</span> <span class="n">color</span><span class="o">=</span><span class="s1">'#3498db'</span><span class="p">,</span> <span class="n">linewidth</span><span class="o">=</span><span class="mf">0.8</span><span class="p">)</span>
|
||
<a id="__codelineno-0-98" name="__codelineno-0-98" href="#__codelineno-0-98"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s1">'原始波形'</span><span class="p">)</span>
|
||
<a id="__codelineno-0-99" name="__codelineno-0-99" href="#__codelineno-0-99"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s1">'振幅'</span><span class="p">)</span>
|
||
<a id="__codelineno-0-100" name="__codelineno-0-100" href="#__codelineno-0-100"></a>
|
||
<a id="__codelineno-0-101" name="__codelineno-0-101" href="#__codelineno-0-101"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">imshow</span><span class="p">(</span><span class="n">jnp</span><span class="o">.</span><span class="n">log1p</span><span class="p">(</span><span class="n">mel_spec</span><span class="o">.</span><span class="n">T</span><span class="p">),</span> <span class="n">aspect</span><span class="o">=</span><span class="s1">'auto'</span><span class="p">,</span> <span class="n">origin</span><span class="o">=</span><span class="s1">'lower'</span><span class="p">,</span> <span class="n">cmap</span><span class="o">=</span><span class="s1">'magma'</span><span class="p">)</span>
|
||
<a id="__codelineno-0-102" name="__codelineno-0-102" href="#__codelineno-0-102"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s1">'梅尔语谱图(中间表示)'</span><span class="p">)</span>
|
||
<a id="__codelineno-0-103" name="__codelineno-0-103" href="#__codelineno-0-103"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s1">'梅尔频带'</span><span class="p">)</span>
|
||
<a id="__codelineno-0-104" name="__codelineno-0-104" href="#__codelineno-0-104"></a>
|
||
<a id="__codelineno-0-105" name="__codelineno-0-105" href="#__codelineno-0-105"></a><span class="n">axes</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">t</span><span class="p">[:</span><span class="mi">1000</span><span class="p">],</span> <span class="n">reconstructed</span><span class="p">[:</span><span class="mi">1000</span><span class="p">],</span> <span class="n">color</span><span class="o">=</span><span class="s1">'#e74c3c'</span><span class="p">,</span> <span class="n">linewidth</span><span class="o">=</span><span class="mf">0.8</span><span class="p">)</span>
|
||
<a id="__codelineno-0-106" name="__codelineno-0-106" href="#__codelineno-0-106"></a><span class="n">axes</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s1">'Griffin-Lim 重构波形(60 次迭代)'</span><span class="p">)</span>
|
||
<a id="__codelineno-0-107" name="__codelineno-0-107" href="#__codelineno-0-107"></a><span class="n">axes</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">set_xlabel</span><span class="p">(</span><span class="s1">'时间 (秒)'</span><span class="p">)</span>
|
||
<a id="__codelineno-0-108" name="__codelineno-0-108" href="#__codelineno-0-108"></a><span class="n">axes</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s1">'振幅'</span><span class="p">)</span>
|
||
<a id="__codelineno-0-109" name="__codelineno-0-109" href="#__codelineno-0-109"></a>
|
||
<a id="__codelineno-0-110" name="__codelineno-0-110" href="#__codelineno-0-110"></a><span class="n">plt</span><span class="o">.</span><span class="n">tight_layout</span><span class="p">()</span>
|
||
<a id="__codelineno-0-111" name="__codelineno-0-111" href="#__codelineno-0-111"></a><span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
|
||
<a id="__codelineno-0-112" name="__codelineno-0-112" href="#__codelineno-0-112"></a>
|
||
<a id="__codelineno-0-113" name="__codelineno-0-113" href="#__codelineno-0-113"></a><span class="c1"># 测量重构误差</span>
|
||
<a id="__codelineno-0-114" name="__codelineno-0-114" href="#__codelineno-0-114"></a><span class="n">mse</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">mean</span><span class="p">((</span><span class="n">waveform</span><span class="p">[:</span><span class="nb">len</span><span class="p">(</span><span class="n">reconstructed</span><span class="p">)]</span> <span class="o">-</span> <span class="n">reconstructed</span><span class="p">[:</span><span class="nb">len</span><span class="p">(</span><span class="n">waveform</span><span class="p">)])</span> <span class="o">**</span> <span class="mi">2</span><span class="p">)</span>
|
||
<a id="__codelineno-0-115" name="__codelineno-0-115" href="#__codelineno-0-115"></a><span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"原始与重构之间的 MSE:</span><span class="si">{</span><span class="n">mse</span><span class="si">:</span><span class="s2">.6f</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
||
<a id="__codelineno-0-116" name="__codelineno-0-116" href="#__codelineno-0-116"></a><span class="nb">print</span><span class="p">(</span><span class="s2">"注意:通过梅尔反演导致的相位信息丢失会引起伪影。"</span><span class="p">)</span>
|
||
</code></pre></div>
|
||
<ul>
|
||
<li><strong>任务 2:时长预测器(FastSpeech 风格)。</strong> 训练一个小型卷积时长预测器,将音素嵌入映射到时长。这是实现非自回归 TTS 的核心组件。</li>
|
||
</ul>
|
||
<div class="highlight"><pre><span></span><code><a id="__codelineno-1-1" name="__codelineno-1-1" href="#__codelineno-1-1"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax</span>
|
||
<a id="__codelineno-1-2" name="__codelineno-1-2" href="#__codelineno-1-2"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax.numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">jnp</span>
|
||
<a id="__codelineno-1-3" name="__codelineno-1-3" href="#__codelineno-1-3"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax.random</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">jr</span>
|
||
<a id="__codelineno-1-4" name="__codelineno-1-4" href="#__codelineno-1-4"></a><span class="kn">import</span><span class="w"> </span><span class="nn">matplotlib.pyplot</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">plt</span>
|
||
<a id="__codelineno-1-5" name="__codelineno-1-5" href="#__codelineno-1-5"></a>
|
||
<a id="__codelineno-1-6" name="__codelineno-1-6" href="#__codelineno-1-6"></a><span class="c1"># 模拟带真实时长的音素序列</span>
|
||
<a id="__codelineno-1-7" name="__codelineno-1-7" href="#__codelineno-1-7"></a><span class="c1"># 在真实 TTS 中,时长来自强制对齐或教师模型的注意力对齐</span>
|
||
<a id="__codelineno-1-8" name="__codelineno-1-8" href="#__codelineno-1-8"></a><span class="k">def</span><span class="w"> </span><span class="nf">generate_synthetic_data</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">n_samples</span><span class="o">=</span><span class="mi">200</span><span class="p">,</span> <span class="n">max_phonemes</span><span class="o">=</span><span class="mi">30</span><span class="p">,</span> <span class="n">embed_dim</span><span class="o">=</span><span class="mi">64</span><span class="p">):</span>
|
||
<a id="__codelineno-1-9" name="__codelineno-1-9" href="#__codelineno-1-9"></a><span class="w"> </span><span class="sd">"""生成合成音素嵌入和时长。"""</span>
|
||
<a id="__codelineno-1-10" name="__codelineno-1-10" href="#__codelineno-1-10"></a> <span class="n">keys</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
|
||
<a id="__codelineno-1-11" name="__codelineno-1-11" href="#__codelineno-1-11"></a> <span class="n">lengths</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="n">n_samples</span><span class="p">,),</span> <span class="mi">5</span><span class="p">,</span> <span class="n">max_phonemes</span><span class="p">)</span>
|
||
<a id="__codelineno-1-12" name="__codelineno-1-12" href="#__codelineno-1-12"></a>
|
||
<a id="__codelineno-1-13" name="__codelineno-1-13" href="#__codelineno-1-13"></a> <span class="n">all_embeddings</span> <span class="o">=</span> <span class="p">[]</span>
|
||
<a id="__codelineno-1-14" name="__codelineno-1-14" href="#__codelineno-1-14"></a> <span class="n">all_durations</span> <span class="o">=</span> <span class="p">[]</span>
|
||
<a id="__codelineno-1-15" name="__codelineno-1-15" href="#__codelineno-1-15"></a> <span class="n">all_masks</span> <span class="o">=</span> <span class="p">[]</span>
|
||
<a id="__codelineno-1-16" name="__codelineno-1-16" href="#__codelineno-1-16"></a>
|
||
<a id="__codelineno-1-17" name="__codelineno-1-17" href="#__codelineno-1-17"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n_samples</span><span class="p">):</span>
|
||
<a id="__codelineno-1-18" name="__codelineno-1-18" href="#__codelineno-1-18"></a> <span class="n">L</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">lengths</span><span class="p">[</span><span class="n">i</span><span class="p">])</span>
|
||
<a id="__codelineno-1-19" name="__codelineno-1-19" href="#__codelineno-1-19"></a> <span class="n">emb</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="p">(</span><span class="n">max_phonemes</span><span class="p">,</span> <span class="n">embed_dim</span><span class="p">))</span>
|
||
<a id="__codelineno-1-20" name="__codelineno-1-20" href="#__codelineno-1-20"></a> <span class="c1"># 时长:元音(偶数索引)较长,辅音较短</span>
|
||
<a id="__codelineno-1-21" name="__codelineno-1-21" href="#__codelineno-1-21"></a> <span class="n">base_dur</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">jnp</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="n">max_phonemes</span><span class="p">)</span> <span class="o">%</span> <span class="mi">2</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="mf">8.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">)</span>
|
||
<a id="__codelineno-1-22" name="__codelineno-1-22" href="#__codelineno-1-22"></a> <span class="n">noise</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">jr</span><span class="o">.</span><span class="n">fold_in</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="n">i</span><span class="p">),</span> <span class="p">(</span><span class="n">max_phonemes</span><span class="p">,))</span> <span class="o">*</span> <span class="mf">1.5</span>
|
||
<a id="__codelineno-1-23" name="__codelineno-1-23" href="#__codelineno-1-23"></a> <span class="n">dur</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">clip</span><span class="p">(</span><span class="n">base_dur</span> <span class="o">+</span> <span class="n">noise</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">20.0</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">jnp</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
|
||
<a id="__codelineno-1-24" name="__codelineno-1-24" href="#__codelineno-1-24"></a> <span class="n">mask</span> <span class="o">=</span> <span class="p">(</span><span class="n">jnp</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="n">max_phonemes</span><span class="p">)</span> <span class="o"><</span> <span class="n">L</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">jnp</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
|
||
<a id="__codelineno-1-25" name="__codelineno-1-25" href="#__codelineno-1-25"></a>
|
||
<a id="__codelineno-1-26" name="__codelineno-1-26" href="#__codelineno-1-26"></a> <span class="n">all_embeddings</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">emb</span><span class="p">)</span>
|
||
<a id="__codelineno-1-27" name="__codelineno-1-27" href="#__codelineno-1-27"></a> <span class="n">all_durations</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">dur</span> <span class="o">*</span> <span class="n">mask</span><span class="p">)</span>
|
||
<a id="__codelineno-1-28" name="__codelineno-1-28" href="#__codelineno-1-28"></a> <span class="n">all_masks</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">mask</span><span class="p">)</span>
|
||
<a id="__codelineno-1-29" name="__codelineno-1-29" href="#__codelineno-1-29"></a>
|
||
<a id="__codelineno-1-30" name="__codelineno-1-30" href="#__codelineno-1-30"></a> <span class="k">return</span> <span class="p">(</span><span class="n">jnp</span><span class="o">.</span><span class="n">stack</span><span class="p">(</span><span class="n">all_embeddings</span><span class="p">),</span> <span class="n">jnp</span><span class="o">.</span><span class="n">stack</span><span class="p">(</span><span class="n">all_durations</span><span class="p">),</span>
|
||
<a id="__codelineno-1-31" name="__codelineno-1-31" href="#__codelineno-1-31"></a> <span class="n">jnp</span><span class="o">.</span><span class="n">stack</span><span class="p">(</span><span class="n">all_masks</span><span class="p">))</span>
|
||
<a id="__codelineno-1-32" name="__codelineno-1-32" href="#__codelineno-1-32"></a>
|
||
<a id="__codelineno-1-33" name="__codelineno-1-33" href="#__codelineno-1-33"></a><span class="n">key</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">PRNGKey</span><span class="p">(</span><span class="mi">42</span><span class="p">)</span>
|
||
<a id="__codelineno-1-34" name="__codelineno-1-34" href="#__codelineno-1-34"></a><span class="n">embeddings</span><span class="p">,</span> <span class="n">durations</span><span class="p">,</span> <span class="n">masks</span> <span class="o">=</span> <span class="n">generate_synthetic_data</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
|
||
<a id="__codelineno-1-35" name="__codelineno-1-35" href="#__codelineno-1-35"></a>
|
||
<a id="__codelineno-1-36" name="__codelineno-1-36" href="#__codelineno-1-36"></a><span class="c1"># 时长预测器:2 层一维卷积 + 线性投影</span>
|
||
<a id="__codelineno-1-37" name="__codelineno-1-37" href="#__codelineno-1-37"></a><span class="k">def</span><span class="w"> </span><span class="nf">init_duration_predictor</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">embed_dim</span><span class="o">=</span><span class="mi">64</span><span class="p">,</span> <span class="n">hidden_dim</span><span class="o">=</span><span class="mi">128</span><span class="p">,</span> <span class="n">kernel_size</span><span class="o">=</span><span class="mi">3</span><span class="p">):</span>
|
||
<a id="__codelineno-1-38" name="__codelineno-1-38" href="#__codelineno-1-38"></a><span class="w"> </span><span class="sd">"""初始化时长预测器权重。"""</span>
|
||
<a id="__codelineno-1-39" name="__codelineno-1-39" href="#__codelineno-1-39"></a> <span class="n">keys</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
|
||
<a id="__codelineno-1-40" name="__codelineno-1-40" href="#__codelineno-1-40"></a> <span class="n">scale1</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="mf">2.0</span> <span class="o">/</span> <span class="p">(</span><span class="n">embed_dim</span> <span class="o">*</span> <span class="n">kernel_size</span><span class="p">))</span>
|
||
<a id="__codelineno-1-41" name="__codelineno-1-41" href="#__codelineno-1-41"></a> <span class="n">scale2</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="mf">2.0</span> <span class="o">/</span> <span class="p">(</span><span class="n">hidden_dim</span> <span class="o">*</span> <span class="n">kernel_size</span><span class="p">))</span>
|
||
<a id="__codelineno-1-42" name="__codelineno-1-42" href="#__codelineno-1-42"></a> <span class="n">params</span> <span class="o">=</span> <span class="p">{</span>
|
||
<a id="__codelineno-1-43" name="__codelineno-1-43" href="#__codelineno-1-43"></a> <span class="s1">'conv1_w'</span><span class="p">:</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="n">kernel_size</span><span class="p">,</span> <span class="n">embed_dim</span><span class="p">,</span> <span class="n">hidden_dim</span><span class="p">))</span> <span class="o">*</span> <span class="n">scale1</span><span class="p">,</span>
|
||
<a id="__codelineno-1-44" name="__codelineno-1-44" href="#__codelineno-1-44"></a> <span class="s1">'conv1_b'</span><span class="p">:</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">hidden_dim</span><span class="p">),</span>
|
||
<a id="__codelineno-1-45" name="__codelineno-1-45" href="#__codelineno-1-45"></a> <span class="s1">'conv2_w'</span><span class="p">:</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="p">(</span><span class="n">kernel_size</span><span class="p">,</span> <span class="n">hidden_dim</span><span class="p">,</span> <span class="n">hidden_dim</span><span class="p">))</span> <span class="o">*</span> <span class="n">scale2</span><span class="p">,</span>
|
||
<a id="__codelineno-1-46" name="__codelineno-1-46" href="#__codelineno-1-46"></a> <span class="s1">'conv2_b'</span><span class="p">:</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">hidden_dim</span><span class="p">),</span>
|
||
<a id="__codelineno-1-47" name="__codelineno-1-47" href="#__codelineno-1-47"></a> <span class="s1">'linear_w'</span><span class="p">:</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="p">(</span><span class="n">hidden_dim</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span> <span class="o">*</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="mf">2.0</span> <span class="o">/</span> <span class="n">hidden_dim</span><span class="p">),</span>
|
||
<a id="__codelineno-1-48" name="__codelineno-1-48" href="#__codelineno-1-48"></a> <span class="s1">'linear_b'</span><span class="p">:</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span>
|
||
<a id="__codelineno-1-49" name="__codelineno-1-49" href="#__codelineno-1-49"></a> <span class="p">}</span>
|
||
<a id="__codelineno-1-50" name="__codelineno-1-50" href="#__codelineno-1-50"></a> <span class="k">return</span> <span class="n">params</span>
|
||
<a id="__codelineno-1-51" name="__codelineno-1-51" href="#__codelineno-1-51"></a>
|
||
<a id="__codelineno-1-52" name="__codelineno-1-52" href="#__codelineno-1-52"></a><span class="k">def</span><span class="w"> </span><span class="nf">duration_predictor</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
|
||
<a id="__codelineno-1-53" name="__codelineno-1-53" href="#__codelineno-1-53"></a><span class="w"> </span><span class="sd">"""从音素嵌入预测对数时长。x: (batch, seq, embed)。"""</span>
|
||
<a id="__codelineno-1-54" name="__codelineno-1-54" href="#__codelineno-1-54"></a> <span class="c1"># 卷积层 1 加 ReLU</span>
|
||
<a id="__codelineno-1-55" name="__codelineno-1-55" href="#__codelineno-1-55"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">lax</span><span class="o">.</span><span class="n">conv_general_dilated</span><span class="p">(</span>
|
||
<a id="__codelineno-1-56" name="__codelineno-1-56" href="#__codelineno-1-56"></a> <span class="n">x</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="c1"># (batch, embed, seq)</span>
|
||
<a id="__codelineno-1-57" name="__codelineno-1-57" href="#__codelineno-1-57"></a> <span class="n">params</span><span class="p">[</span><span class="s1">'conv1_w'</span><span class="p">]</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="c1"># (out, in, kernel)</span>
|
||
<a id="__codelineno-1-58" name="__codelineno-1-58" href="#__codelineno-1-58"></a> <span class="n">window_strides</span><span class="o">=</span><span class="p">(</span><span class="mi">1</span><span class="p">,),</span> <span class="n">padding</span><span class="o">=</span><span class="s1">'SAME'</span>
|
||
<a id="__codelineno-1-59" name="__codelineno-1-59" href="#__codelineno-1-59"></a> <span class="p">)</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="n">params</span><span class="p">[</span><span class="s1">'conv1_b'</span><span class="p">]</span> <span class="c1"># 回到 (batch, seq, hidden)</span>
|
||
<a id="__codelineno-1-60" name="__codelineno-1-60" href="#__codelineno-1-60"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">relu</span><span class="p">(</span><span class="n">h</span><span class="p">)</span>
|
||
<a id="__codelineno-1-61" name="__codelineno-1-61" href="#__codelineno-1-61"></a>
|
||
<a id="__codelineno-1-62" name="__codelineno-1-62" href="#__codelineno-1-62"></a> <span class="c1"># 卷积层 2 加 ReLU</span>
|
||
<a id="__codelineno-1-63" name="__codelineno-1-63" href="#__codelineno-1-63"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">lax</span><span class="o">.</span><span class="n">conv_general_dilated</span><span class="p">(</span>
|
||
<a id="__codelineno-1-64" name="__codelineno-1-64" href="#__codelineno-1-64"></a> <span class="n">h</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span>
|
||
<a id="__codelineno-1-65" name="__codelineno-1-65" href="#__codelineno-1-65"></a> <span class="n">params</span><span class="p">[</span><span class="s1">'conv2_w'</span><span class="p">]</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span>
|
||
<a id="__codelineno-1-66" name="__codelineno-1-66" href="#__codelineno-1-66"></a> <span class="n">window_strides</span><span class="o">=</span><span class="p">(</span><span class="mi">1</span><span class="p">,),</span> <span class="n">padding</span><span class="o">=</span><span class="s1">'SAME'</span>
|
||
<a id="__codelineno-1-67" name="__codelineno-1-67" href="#__codelineno-1-67"></a> <span class="p">)</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="n">params</span><span class="p">[</span><span class="s1">'conv2_b'</span><span class="p">]</span>
|
||
<a id="__codelineno-1-68" name="__codelineno-1-68" href="#__codelineno-1-68"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">relu</span><span class="p">(</span><span class="n">h</span><span class="p">)</span>
|
||
<a id="__codelineno-1-69" name="__codelineno-1-69" href="#__codelineno-1-69"></a>
|
||
<a id="__codelineno-1-70" name="__codelineno-1-70" href="#__codelineno-1-70"></a> <span class="c1"># 线性投影到标量</span>
|
||
<a id="__codelineno-1-71" name="__codelineno-1-71" href="#__codelineno-1-71"></a> <span class="n">log_dur</span> <span class="o">=</span> <span class="p">(</span><span class="n">h</span> <span class="o">@</span> <span class="n">params</span><span class="p">[</span><span class="s1">'linear_w'</span><span class="p">]</span> <span class="o">+</span> <span class="n">params</span><span class="p">[</span><span class="s1">'linear_b'</span><span class="p">])</span><span class="o">.</span><span class="n">squeeze</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
|
||
<a id="__codelineno-1-72" name="__codelineno-1-72" href="#__codelineno-1-72"></a> <span class="k">return</span> <span class="n">log_dur</span>
|
||
<a id="__codelineno-1-73" name="__codelineno-1-73" href="#__codelineno-1-73"></a>
|
||
<a id="__codelineno-1-74" name="__codelineno-1-74" href="#__codelineno-1-74"></a><span class="c1"># 损失:对数时长的 MSE(FastSpeech 中的标准做法)</span>
|
||
<a id="__codelineno-1-75" name="__codelineno-1-75" href="#__codelineno-1-75"></a><span class="k">def</span><span class="w"> </span><span class="nf">loss_fn</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">embeddings</span><span class="p">,</span> <span class="n">durations</span><span class="p">,</span> <span class="n">masks</span><span class="p">):</span>
|
||
<a id="__codelineno-1-76" name="__codelineno-1-76" href="#__codelineno-1-76"></a> <span class="n">log_dur_pred</span> <span class="o">=</span> <span class="n">duration_predictor</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">embeddings</span><span class="p">)</span>
|
||
<a id="__codelineno-1-77" name="__codelineno-1-77" href="#__codelineno-1-77"></a> <span class="n">log_dur_true</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">jnp</span><span class="o">.</span><span class="n">clip</span><span class="p">(</span><span class="n">durations</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="kc">None</span><span class="p">))</span>
|
||
<a id="__codelineno-1-78" name="__codelineno-1-78" href="#__codelineno-1-78"></a> <span class="n">sq_err</span> <span class="o">=</span> <span class="p">(</span><span class="n">log_dur_pred</span> <span class="o">-</span> <span class="n">log_dur_true</span><span class="p">)</span> <span class="o">**</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">masks</span>
|
||
<a id="__codelineno-1-79" name="__codelineno-1-79" href="#__codelineno-1-79"></a> <span class="k">return</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">sq_err</span><span class="p">)</span> <span class="o">/</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">masks</span><span class="p">)</span>
|
||
<a id="__codelineno-1-80" name="__codelineno-1-80" href="#__codelineno-1-80"></a>
|
||
<a id="__codelineno-1-81" name="__codelineno-1-81" href="#__codelineno-1-81"></a><span class="n">grad_fn</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">jit</span><span class="p">(</span><span class="n">jax</span><span class="o">.</span><span class="n">value_and_grad</span><span class="p">(</span><span class="n">loss_fn</span><span class="p">))</span>
|
||
<a id="__codelineno-1-82" name="__codelineno-1-82" href="#__codelineno-1-82"></a>
|
||
<a id="__codelineno-1-83" name="__codelineno-1-83" href="#__codelineno-1-83"></a><span class="c1"># 训练循环</span>
|
||
<a id="__codelineno-1-84" name="__codelineno-1-84" href="#__codelineno-1-84"></a><span class="n">params</span> <span class="o">=</span> <span class="n">init_duration_predictor</span><span class="p">(</span><span class="n">jr</span><span class="o">.</span><span class="n">PRNGKey</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
|
||
<a id="__codelineno-1-85" name="__codelineno-1-85" href="#__codelineno-1-85"></a><span class="n">lr</span> <span class="o">=</span> <span class="mf">1e-3</span>
|
||
<a id="__codelineno-1-86" name="__codelineno-1-86" href="#__codelineno-1-86"></a><span class="n">losses</span> <span class="o">=</span> <span class="p">[]</span>
|
||
<a id="__codelineno-1-87" name="__codelineno-1-87" href="#__codelineno-1-87"></a>
|
||
<a id="__codelineno-1-88" name="__codelineno-1-88" href="#__codelineno-1-88"></a><span class="k">for</span> <span class="n">epoch</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">300</span><span class="p">):</span>
|
||
<a id="__codelineno-1-89" name="__codelineno-1-89" href="#__codelineno-1-89"></a> <span class="n">loss_val</span><span class="p">,</span> <span class="n">grads</span> <span class="o">=</span> <span class="n">grad_fn</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">embeddings</span><span class="p">,</span> <span class="n">durations</span><span class="p">,</span> <span class="n">masks</span><span class="p">)</span>
|
||
<a id="__codelineno-1-90" name="__codelineno-1-90" href="#__codelineno-1-90"></a> <span class="n">params</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">tree</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">,</span> <span class="n">g</span><span class="p">:</span> <span class="n">p</span> <span class="o">-</span> <span class="n">lr</span> <span class="o">*</span> <span class="n">g</span><span class="p">,</span> <span class="n">params</span><span class="p">,</span> <span class="n">grads</span><span class="p">)</span>
|
||
<a id="__codelineno-1-91" name="__codelineno-1-91" href="#__codelineno-1-91"></a> <span class="n">losses</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">loss_val</span><span class="p">))</span>
|
||
<a id="__codelineno-1-92" name="__codelineno-1-92" href="#__codelineno-1-92"></a>
|
||
<a id="__codelineno-1-93" name="__codelineno-1-93" href="#__codelineno-1-93"></a><span class="c1"># 在一个样本上评估</span>
|
||
<a id="__codelineno-1-94" name="__codelineno-1-94" href="#__codelineno-1-94"></a><span class="n">log_dur_pred</span> <span class="o">=</span> <span class="n">duration_predictor</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">embeddings</span><span class="p">[:</span><span class="mi">1</span><span class="p">])</span>
|
||
<a id="__codelineno-1-95" name="__codelineno-1-95" href="#__codelineno-1-95"></a><span class="n">dur_pred</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="n">log_dur_pred</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
|
||
<a id="__codelineno-1-96" name="__codelineno-1-96" href="#__codelineno-1-96"></a><span class="n">dur_true</span> <span class="o">=</span> <span class="n">durations</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
|
||
<a id="__codelineno-1-97" name="__codelineno-1-97" href="#__codelineno-1-97"></a><span class="n">mask</span> <span class="o">=</span> <span class="n">masks</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
|
||
<a id="__codelineno-1-98" name="__codelineno-1-98" href="#__codelineno-1-98"></a><span class="n">valid_len</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">jnp</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">mask</span><span class="p">))</span>
|
||
<a id="__codelineno-1-99" name="__codelineno-1-99" href="#__codelineno-1-99"></a>
|
||
<a id="__codelineno-1-100" name="__codelineno-1-100" href="#__codelineno-1-100"></a><span class="n">fig</span><span class="p">,</span> <span class="n">axes</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">14</span><span class="p">,</span> <span class="mi">5</span><span class="p">))</span>
|
||
<a id="__codelineno-1-101" name="__codelineno-1-101" href="#__codelineno-1-101"></a>
|
||
<a id="__codelineno-1-102" name="__codelineno-1-102" href="#__codelineno-1-102"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">losses</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">'#3498db'</span><span class="p">,</span> <span class="n">linewidth</span><span class="o">=</span><span class="mf">1.5</span><span class="p">)</span>
|
||
<a id="__codelineno-1-103" name="__codelineno-1-103" href="#__codelineno-1-103"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_xlabel</span><span class="p">(</span><span class="s1">'轮次'</span><span class="p">)</span>
|
||
<a id="__codelineno-1-104" name="__codelineno-1-104" href="#__codelineno-1-104"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s1">'MSE 损失(对数时长)'</span><span class="p">)</span>
|
||
<a id="__codelineno-1-105" name="__codelineno-1-105" href="#__codelineno-1-105"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s1">'时长预测器训练'</span><span class="p">)</span>
|
||
<a id="__codelineno-1-106" name="__codelineno-1-106" href="#__codelineno-1-106"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_yscale</span><span class="p">(</span><span class="s1">'log'</span><span class="p">)</span>
|
||
<a id="__codelineno-1-107" name="__codelineno-1-107" href="#__codelineno-1-107"></a>
|
||
<a id="__codelineno-1-108" name="__codelineno-1-108" href="#__codelineno-1-108"></a><span class="n">x_pos</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="n">valid_len</span><span class="p">)</span>
|
||
<a id="__codelineno-1-109" name="__codelineno-1-109" href="#__codelineno-1-109"></a><span class="n">width</span> <span class="o">=</span> <span class="mf">0.35</span>
|
||
<a id="__codelineno-1-110" name="__codelineno-1-110" href="#__codelineno-1-110"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">bar</span><span class="p">(</span><span class="n">x_pos</span> <span class="o">-</span> <span class="n">width</span><span class="o">/</span><span class="mi">2</span><span class="p">,</span> <span class="n">dur_true</span><span class="p">[:</span><span class="n">valid_len</span><span class="p">],</span> <span class="n">width</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">'#27ae60'</span><span class="p">,</span>
|
||
<a id="__codelineno-1-111" name="__codelineno-1-111" href="#__codelineno-1-111"></a> <span class="n">label</span><span class="o">=</span><span class="s1">'真实值'</span><span class="p">,</span> <span class="n">alpha</span><span class="o">=</span><span class="mf">0.8</span><span class="p">)</span>
|
||
<a id="__codelineno-1-112" name="__codelineno-1-112" href="#__codelineno-1-112"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">bar</span><span class="p">(</span><span class="n">x_pos</span> <span class="o">+</span> <span class="n">width</span><span class="o">/</span><span class="mi">2</span><span class="p">,</span> <span class="n">dur_pred</span><span class="p">[:</span><span class="n">valid_len</span><span class="p">],</span> <span class="n">width</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">'#e74c3c'</span><span class="p">,</span>
|
||
<a id="__codelineno-1-113" name="__codelineno-1-113" href="#__codelineno-1-113"></a> <span class="n">label</span><span class="o">=</span><span class="s1">'预测值'</span><span class="p">,</span> <span class="n">alpha</span><span class="o">=</span><span class="mf">0.8</span><span class="p">)</span>
|
||
<a id="__codelineno-1-114" name="__codelineno-1-114" href="#__codelineno-1-114"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_xlabel</span><span class="p">(</span><span class="s1">'音素索引'</span><span class="p">)</span>
|
||
<a id="__codelineno-1-115" name="__codelineno-1-115" href="#__codelineno-1-115"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s1">'时长(帧)'</span><span class="p">)</span>
|
||
<a id="__codelineno-1-116" name="__codelineno-1-116" href="#__codelineno-1-116"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s1">'时长预测与真实值对比'</span><span class="p">)</span>
|
||
<a id="__codelineno-1-117" name="__codelineno-1-117" href="#__codelineno-1-117"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">legend</span><span class="p">()</span>
|
||
<a id="__codelineno-1-118" name="__codelineno-1-118" href="#__codelineno-1-118"></a>
|
||
<a id="__codelineno-1-119" name="__codelineno-1-119" href="#__codelineno-1-119"></a><span class="n">plt</span><span class="o">.</span><span class="n">tight_layout</span><span class="p">()</span>
|
||
<a id="__codelineno-1-120" name="__codelineno-1-120" href="#__codelineno-1-120"></a><span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
|
||
</code></pre></div>
|
||
<ul>
|
||
<li><strong>任务 3:使用上采样卷积的简单神经声码器。</strong> 构建一个最小化的 HiFi-GAN 风格生成器,使用转置卷积和残差块将梅尔语谱图上采样为波形。</li>
|
||
</ul>
|
||
<div class="highlight"><pre><span></span><code><a id="__codelineno-2-1" name="__codelineno-2-1" href="#__codelineno-2-1"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax</span>
|
||
<a id="__codelineno-2-2" name="__codelineno-2-2" href="#__codelineno-2-2"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax.numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">jnp</span>
|
||
<a id="__codelineno-2-3" name="__codelineno-2-3" href="#__codelineno-2-3"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax.random</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">jr</span>
|
||
<a id="__codelineno-2-4" name="__codelineno-2-4" href="#__codelineno-2-4"></a><span class="kn">import</span><span class="w"> </span><span class="nn">matplotlib.pyplot</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">plt</span>
|
||
<a id="__codelineno-2-5" name="__codelineno-2-5" href="#__codelineno-2-5"></a>
|
||
<a id="__codelineno-2-6" name="__codelineno-2-6" href="#__codelineno-2-6"></a><span class="k">def</span><span class="w"> </span><span class="nf">init_residual_block</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">channels</span><span class="p">,</span> <span class="n">kernel_size</span><span class="p">,</span> <span class="n">dilation</span><span class="p">):</span>
|
||
<a id="__codelineno-2-7" name="__codelineno-2-7" href="#__codelineno-2-7"></a><span class="w"> </span><span class="sd">"""初始化扩张残差卷积块。"""</span>
|
||
<a id="__codelineno-2-8" name="__codelineno-2-8" href="#__codelineno-2-8"></a> <span class="n">k1</span><span class="p">,</span> <span class="n">k2</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
|
||
<a id="__codelineno-2-9" name="__codelineno-2-9" href="#__codelineno-2-9"></a> <span class="n">scale</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="mf">2.0</span> <span class="o">/</span> <span class="p">(</span><span class="n">channels</span> <span class="o">*</span> <span class="n">kernel_size</span><span class="p">))</span>
|
||
<a id="__codelineno-2-10" name="__codelineno-2-10" href="#__codelineno-2-10"></a> <span class="k">return</span> <span class="p">{</span>
|
||
<a id="__codelineno-2-11" name="__codelineno-2-11" href="#__codelineno-2-11"></a> <span class="s1">'conv1_w'</span><span class="p">:</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="p">(</span><span class="n">kernel_size</span><span class="p">,</span> <span class="n">channels</span><span class="p">,</span> <span class="n">channels</span><span class="p">))</span> <span class="o">*</span> <span class="n">scale</span><span class="p">,</span>
|
||
<a id="__codelineno-2-12" name="__codelineno-2-12" href="#__codelineno-2-12"></a> <span class="s1">'conv1_b'</span><span class="p">:</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">channels</span><span class="p">),</span>
|
||
<a id="__codelineno-2-13" name="__codelineno-2-13" href="#__codelineno-2-13"></a> <span class="s1">'conv2_w'</span><span class="p">:</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">k2</span><span class="p">,</span> <span class="p">(</span><span class="n">kernel_size</span><span class="p">,</span> <span class="n">channels</span><span class="p">,</span> <span class="n">channels</span><span class="p">))</span> <span class="o">*</span> <span class="n">scale</span><span class="p">,</span>
|
||
<a id="__codelineno-2-14" name="__codelineno-2-14" href="#__codelineno-2-14"></a> <span class="s1">'conv2_b'</span><span class="p">:</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">channels</span><span class="p">),</span>
|
||
<a id="__codelineno-2-15" name="__codelineno-2-15" href="#__codelineno-2-15"></a> <span class="s1">'dilation'</span><span class="p">:</span> <span class="n">dilation</span>
|
||
<a id="__codelineno-2-16" name="__codelineno-2-16" href="#__codelineno-2-16"></a> <span class="p">}</span>
|
||
<a id="__codelineno-2-17" name="__codelineno-2-17" href="#__codelineno-2-17"></a>
|
||
<a id="__codelineno-2-18" name="__codelineno-2-18" href="#__codelineno-2-18"></a><span class="k">def</span><span class="w"> </span><span class="nf">residual_block</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
|
||
<a id="__codelineno-2-19" name="__codelineno-2-19" href="#__codelineno-2-19"></a><span class="w"> </span><span class="sd">"""x: (batch, time, channels)。带 LeakyReLU 的扩张卷积残差块。"""</span>
|
||
<a id="__codelineno-2-20" name="__codelineno-2-20" href="#__codelineno-2-20"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">leaky_relu</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">negative_slope</span><span class="o">=</span><span class="mf">0.1</span><span class="p">)</span>
|
||
<a id="__codelineno-2-21" name="__codelineno-2-21" href="#__codelineno-2-21"></a> <span class="c1"># 第一层卷积:通过 rhs_dilation 应用实际的扩张率;第二层使用普通卷积</span>
|
||
<a id="__codelineno-2-22" name="__codelineno-2-22" href="#__codelineno-2-22"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">lax</span><span class="o">.</span><span class="n">conv_general_dilated</span><span class="p">(</span>
|
||
<a id="__codelineno-2-23" name="__codelineno-2-23" href="#__codelineno-2-23"></a> <span class="n">h</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span>
|
||
<a id="__codelineno-2-24" name="__codelineno-2-24" href="#__codelineno-2-24"></a> <span class="n">params</span><span class="p">[</span><span class="s1">'conv1_w'</span><span class="p">]</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span>
|
||
<a id="__codelineno-2-25" name="__codelineno-2-25" href="#__codelineno-2-25"></a> <span class="n">window_strides</span><span class="o">=</span><span class="p">(</span><span class="mi">1</span><span class="p">,),</span>
|
||
<a id="__codelineno-2-26" name="__codelineno-2-26" href="#__codelineno-2-26"></a> <span class="n">padding</span><span class="o">=</span><span class="s1">'SAME'</span><span class="p">,</span>
|
||
<a id="__codelineno-2-27" name="__codelineno-2-27" href="#__codelineno-2-27"></a> <span class="n">rhs_dilation</span><span class="o">=</span><span class="p">(</span><span class="n">params</span><span class="p">[</span><span class="s1">'dilation'</span><span class="p">],)</span>
|
||
<a id="__codelineno-2-28" name="__codelineno-2-28" href="#__codelineno-2-28"></a> <span class="p">)</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="n">params</span><span class="p">[</span><span class="s1">'conv1_b'</span><span class="p">]</span>
|
||
<a id="__codelineno-2-29" name="__codelineno-2-29" href="#__codelineno-2-29"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">leaky_relu</span><span class="p">(</span><span class="n">h</span><span class="p">,</span> <span class="n">negative_slope</span><span class="o">=</span><span class="mf">0.1</span><span class="p">)</span>
|
||
<a id="__codelineno-2-30" name="__codelineno-2-30" href="#__codelineno-2-30"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">lax</span><span class="o">.</span><span class="n">conv_general_dilated</span><span class="p">(</span>
|
||
<a id="__codelineno-2-31" name="__codelineno-2-31" href="#__codelineno-2-31"></a> <span class="n">h</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span>
|
||
<a id="__codelineno-2-32" name="__codelineno-2-32" href="#__codelineno-2-32"></a> <span class="n">params</span><span class="p">[</span><span class="s1">'conv2_w'</span><span class="p">]</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span>
|
||
<a id="__codelineno-2-33" name="__codelineno-2-33" href="#__codelineno-2-33"></a> <span class="n">window_strides</span><span class="o">=</span><span class="p">(</span><span class="mi">1</span><span class="p">,),</span>
|
||
<a id="__codelineno-2-34" name="__codelineno-2-34" href="#__codelineno-2-34"></a> <span class="n">padding</span><span class="o">=</span><span class="s1">'SAME'</span>
|
||
<a id="__codelineno-2-35" name="__codelineno-2-35" href="#__codelineno-2-35"></a> <span class="p">)</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="n">params</span><span class="p">[</span><span class="s1">'conv2_b'</span><span class="p">]</span>
|
||
<a id="__codelineno-2-36" name="__codelineno-2-36" href="#__codelineno-2-36"></a> <span class="k">return</span> <span class="n">x</span> <span class="o">+</span> <span class="n">h</span>
|
||
<a id="__codelineno-2-37" name="__codelineno-2-37" href="#__codelineno-2-37"></a>
|
||
<a id="__codelineno-2-38" name="__codelineno-2-38" href="#__codelineno-2-38"></a><span class="k">def</span><span class="w"> </span><span class="nf">init_generator</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">n_mels</span><span class="o">=</span><span class="mi">80</span><span class="p">,</span> <span class="n">upsample_rates</span><span class="o">=</span><span class="p">(</span><span class="mi">8</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">4</span><span class="p">),</span>
|
||
<a id="__codelineno-2-39" name="__codelineno-2-39" href="#__codelineno-2-39"></a> <span class="n">channels</span><span class="o">=</span><span class="mi">128</span><span class="p">):</span>
|
||
<a id="__codelineno-2-40" name="__codelineno-2-40" href="#__codelineno-2-40"></a><span class="w"> </span><span class="sd">"""初始化最小化的 HiFi-GAN 风格生成器。"""</span>
|
||
<a id="__codelineno-2-41" name="__codelineno-2-41" href="#__codelineno-2-41"></a> <span class="n">keys</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span>
|
||
<a id="__codelineno-2-42" name="__codelineno-2-42" href="#__codelineno-2-42"></a> <span class="n">params</span> <span class="o">=</span> <span class="p">{}</span>
|
||
<a id="__codelineno-2-43" name="__codelineno-2-43" href="#__codelineno-2-43"></a>
|
||
<a id="__codelineno-2-44" name="__codelineno-2-44" href="#__codelineno-2-44"></a> <span class="c1"># 输入投影:梅尔频带 -> 通道</span>
|
||
<a id="__codelineno-2-45" name="__codelineno-2-45" href="#__codelineno-2-45"></a> <span class="n">params</span><span class="p">[</span><span class="s1">'input_w'</span><span class="p">]</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="mi">7</span><span class="p">,</span> <span class="n">n_mels</span><span class="p">,</span> <span class="n">channels</span><span class="p">))</span> <span class="o">*</span> <span class="mf">0.02</span>
|
||
<a id="__codelineno-2-46" name="__codelineno-2-46" href="#__codelineno-2-46"></a> <span class="n">params</span><span class="p">[</span><span class="s1">'input_b'</span><span class="p">]</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">channels</span><span class="p">)</span>
|
||
<a id="__codelineno-2-47" name="__codelineno-2-47" href="#__codelineno-2-47"></a>
|
||
<a id="__codelineno-2-48" name="__codelineno-2-48" href="#__codelineno-2-48"></a> <span class="c1"># 上采样块(转置卷积)</span>
|
||
<a id="__codelineno-2-49" name="__codelineno-2-49" href="#__codelineno-2-49"></a> <span class="n">in_ch</span> <span class="o">=</span> <span class="n">channels</span>
|
||
<a id="__codelineno-2-50" name="__codelineno-2-50" href="#__codelineno-2-50"></a> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">rate</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">upsample_rates</span><span class="p">):</span>
|
||
<a id="__codelineno-2-51" name="__codelineno-2-51" href="#__codelineno-2-51"></a> <span class="n">k_size</span> <span class="o">=</span> <span class="n">rate</span> <span class="o">*</span> <span class="mi">2</span>
|
||
<a id="__codelineno-2-52" name="__codelineno-2-52" href="#__codelineno-2-52"></a> <span class="n">scale</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="mf">2.0</span> <span class="o">/</span> <span class="p">(</span><span class="n">in_ch</span> <span class="o">*</span> <span class="n">k_size</span><span class="p">))</span>
|
||
<a id="__codelineno-2-53" name="__codelineno-2-53" href="#__codelineno-2-53"></a> <span class="n">out_ch</span> <span class="o">=</span> <span class="n">in_ch</span> <span class="o">//</span> <span class="mi">2</span>
|
||
<a id="__codelineno-2-54" name="__codelineno-2-54" href="#__codelineno-2-54"></a> <span class="n">params</span><span class="p">[</span><span class="sa">f</span><span class="s1">'up</span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s1">_w'</span><span class="p">]</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="n">i</span><span class="o">+</span><span class="mi">1</span><span class="p">],</span> <span class="p">(</span><span class="n">k_size</span><span class="p">,</span> <span class="n">in_ch</span><span class="p">,</span> <span class="n">out_ch</span><span class="p">))</span> <span class="o">*</span> <span class="n">scale</span>
|
||
<a id="__codelineno-2-55" name="__codelineno-2-55" href="#__codelineno-2-55"></a> <span class="n">params</span><span class="p">[</span><span class="sa">f</span><span class="s1">'up</span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s1">_b'</span><span class="p">]</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">out_ch</span><span class="p">)</span>
|
||
<a id="__codelineno-2-56" name="__codelineno-2-56" href="#__codelineno-2-56"></a> <span class="c1"># 每个尺度下的残差块</span>
|
||
<a id="__codelineno-2-57" name="__codelineno-2-57" href="#__codelineno-2-57"></a> <span class="n">params</span><span class="p">[</span><span class="sa">f</span><span class="s1">'res</span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s1">_0'</span><span class="p">]</span> <span class="o">=</span> <span class="n">init_residual_block</span><span class="p">(</span><span class="n">jr</span><span class="o">.</span><span class="n">fold_in</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="n">i</span><span class="o">+</span><span class="mi">4</span><span class="p">],</span> <span class="mi">0</span><span class="p">),</span>
|
||
<a id="__codelineno-2-58" name="__codelineno-2-58" href="#__codelineno-2-58"></a> <span class="n">out_ch</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
|
||
<a id="__codelineno-2-59" name="__codelineno-2-59" href="#__codelineno-2-59"></a> <span class="n">params</span><span class="p">[</span><span class="sa">f</span><span class="s1">'res</span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s1">_1'</span><span class="p">]</span> <span class="o">=</span> <span class="n">init_residual_block</span><span class="p">(</span><span class="n">jr</span><span class="o">.</span><span class="n">fold_in</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="n">i</span><span class="o">+</span><span class="mi">4</span><span class="p">],</span> <span class="mi">1</span><span class="p">),</span>
|
||
<a id="__codelineno-2-60" name="__codelineno-2-60" href="#__codelineno-2-60"></a> <span class="n">out_ch</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">3</span><span class="p">)</span>
|
||
<a id="__codelineno-2-61" name="__codelineno-2-61" href="#__codelineno-2-61"></a> <span class="n">in_ch</span> <span class="o">=</span> <span class="n">out_ch</span>
|
||
<a id="__codelineno-2-62" name="__codelineno-2-62" href="#__codelineno-2-62"></a>
|
||
<a id="__codelineno-2-63" name="__codelineno-2-63" href="#__codelineno-2-63"></a> <span class="c1"># 输出投影到单声道波形</span>
|
||
<a id="__codelineno-2-64" name="__codelineno-2-64" href="#__codelineno-2-64"></a> <span class="n">params</span><span class="p">[</span><span class="s1">'output_w'</span><span class="p">]</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">8</span><span class="p">],</span> <span class="p">(</span><span class="mi">7</span><span class="p">,</span> <span class="n">in_ch</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span> <span class="o">*</span> <span class="mf">0.02</span>
|
||
<a id="__codelineno-2-65" name="__codelineno-2-65" href="#__codelineno-2-65"></a> <span class="n">params</span><span class="p">[</span><span class="s1">'output_b'</span><span class="p">]</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
|
||
<a id="__codelineno-2-66" name="__codelineno-2-66" href="#__codelineno-2-66"></a> <span class="n">params</span><span class="p">[</span><span class="s1">'upsample_rates'</span><span class="p">]</span> <span class="o">=</span> <span class="n">upsample_rates</span>
|
||
<a id="__codelineno-2-67" name="__codelineno-2-67" href="#__codelineno-2-67"></a>
|
||
<a id="__codelineno-2-68" name="__codelineno-2-68" href="#__codelineno-2-68"></a> <span class="k">return</span> <span class="n">params</span>
|
||
<a id="__codelineno-2-69" name="__codelineno-2-69" href="#__codelineno-2-69"></a>
|
||
<a id="__codelineno-2-70" name="__codelineno-2-70" href="#__codelineno-2-70"></a><span class="k">def</span><span class="w"> </span><span class="nf">generator_forward</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">mel</span><span class="p">):</span>
|
||
<a id="__codelineno-2-71" name="__codelineno-2-71" href="#__codelineno-2-71"></a><span class="w"> </span><span class="sd">"""mel: (batch, time, n_mels) -> waveform: (batch, time * prod(rates), 1)。"""</span>
|
||
<a id="__codelineno-2-72" name="__codelineno-2-72" href="#__codelineno-2-72"></a> <span class="c1"># 输入投影</span>
|
||
<a id="__codelineno-2-73" name="__codelineno-2-73" href="#__codelineno-2-73"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">lax</span><span class="o">.</span><span class="n">conv_general_dilated</span><span class="p">(</span>
|
||
<a id="__codelineno-2-74" name="__codelineno-2-74" href="#__codelineno-2-74"></a> <span class="n">mel</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span>
|
||
<a id="__codelineno-2-75" name="__codelineno-2-75" href="#__codelineno-2-75"></a> <span class="n">params</span><span class="p">[</span><span class="s1">'input_w'</span><span class="p">]</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span>
|
||
<a id="__codelineno-2-76" name="__codelineno-2-76" href="#__codelineno-2-76"></a> <span class="n">window_strides</span><span class="o">=</span><span class="p">(</span><span class="mi">1</span><span class="p">,),</span> <span class="n">padding</span><span class="o">=</span><span class="s1">'SAME'</span>
|
||
<a id="__codelineno-2-77" name="__codelineno-2-77" href="#__codelineno-2-77"></a> <span class="p">)</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="n">params</span><span class="p">[</span><span class="s1">'input_b'</span><span class="p">]</span>
|
||
<a id="__codelineno-2-78" name="__codelineno-2-78" href="#__codelineno-2-78"></a>
|
||
<a id="__codelineno-2-79" name="__codelineno-2-79" href="#__codelineno-2-79"></a> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">rate</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">params</span><span class="p">[</span><span class="s1">'upsample_rates'</span><span class="p">]):</span>
|
||
<a id="__codelineno-2-80" name="__codelineno-2-80" href="#__codelineno-2-80"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">leaky_relu</span><span class="p">(</span><span class="n">h</span><span class="p">,</span> <span class="n">negative_slope</span><span class="o">=</span><span class="mf">0.1</span><span class="p">)</span>
|
||
<a id="__codelineno-2-81" name="__codelineno-2-81" href="#__codelineno-2-81"></a> <span class="c1"># 通过转置卷积上采样</span>
|
||
<a id="__codelineno-2-82" name="__codelineno-2-82" href="#__codelineno-2-82"></a> <span class="n">k_size</span> <span class="o">=</span> <span class="n">rate</span> <span class="o">*</span> <span class="mi">2</span>
|
||
<a id="__codelineno-2-83" name="__codelineno-2-83" href="#__codelineno-2-83"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">lax</span><span class="o">.</span><span class="n">conv_transpose</span><span class="p">(</span>
|
||
<a id="__codelineno-2-84" name="__codelineno-2-84" href="#__codelineno-2-84"></a> <span class="n">h</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span>
|
||
<a id="__codelineno-2-85" name="__codelineno-2-85" href="#__codelineno-2-85"></a> <span class="n">params</span><span class="p">[</span><span class="sa">f</span><span class="s1">'up</span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s1">_w'</span><span class="p">]</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span>
|
||
<a id="__codelineno-2-86" name="__codelineno-2-86" href="#__codelineno-2-86"></a> <span class="n">strides</span><span class="o">=</span><span class="p">(</span><span class="n">rate</span><span class="p">,),</span>
|
||
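<a id="__codelineno-2-87" name="__codelineno-2-87" href="#__codelineno-2-87"></a> <span class="n">padding</span><span class="o">=</span><span class="s1">'SAME'</span><span class="p">,</span> <span class="n">dimension_numbers</span><span class="o">=</span><span class="p">(</span><span class="s1">'NCH'</span><span class="p">,</span> <span class="s1">'OIH'</span><span class="p">,</span> <span class="s1">'NCH'</span><span class="p">)</span>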
|
||
<a id="__codelineno-2-88" name="__codelineno-2-88" href="#__codelineno-2-88"></a> <span class="p">)</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="n">params</span><span class="p">[</span><span class="sa">f</span><span class="s1">'up</span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s1">_b'</span><span class="p">]</span>
|
||
<a id="__codelineno-2-89" name="__codelineno-2-89" href="#__codelineno-2-89"></a> <span class="c1"># 残差块</span>
|
||
<a id="__codelineno-2-90" name="__codelineno-2-90" href="#__codelineno-2-90"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">residual_block</span><span class="p">(</span><span class="n">params</span><span class="p">[</span><span class="sa">f</span><span class="s1">'res</span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s1">_0'</span><span class="p">],</span> <span class="n">h</span><span class="p">)</span>
|
||
<a id="__codelineno-2-91" name="__codelineno-2-91" href="#__codelineno-2-91"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">residual_block</span><span class="p">(</span><span class="n">params</span><span class="p">[</span><span class="sa">f</span><span class="s1">'res</span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s1">_1'</span><span class="p">],</span> <span class="n">h</span><span class="p">)</span>
|
||
<a id="__codelineno-2-92" name="__codelineno-2-92" href="#__codelineno-2-92"></a>
|
||
<a id="__codelineno-2-93" name="__codelineno-2-93" href="#__codelineno-2-93"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">leaky_relu</span><span class="p">(</span><span class="n">h</span><span class="p">,</span> <span class="n">negative_slope</span><span class="o">=</span><span class="mf">0.1</span><span class="p">)</span>
|
||
<a id="__codelineno-2-94" name="__codelineno-2-94" href="#__codelineno-2-94"></a> <span class="n">out</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">lax</span><span class="o">.</span><span class="n">conv_general_dilated</span><span class="p">(</span>
|
||
<a id="__codelineno-2-95" name="__codelineno-2-95" href="#__codelineno-2-95"></a> <span class="n">h</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span>
|
||
<a id="__codelineno-2-96" name="__codelineno-2-96" href="#__codelineno-2-96"></a> <span class="n">params</span><span class="p">[</span><span class="s1">'output_w'</span><span class="p">]</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span>
|
||
<a id="__codelineno-2-97" name="__codelineno-2-97" href="#__codelineno-2-97"></a> <span class="n">window_strides</span><span class="o">=</span><span class="p">(</span><span class="mi">1</span><span class="p">,),</span> <span class="n">padding</span><span class="o">=</span><span class="s1">'SAME'</span>
|
||
<a id="__codelineno-2-98" name="__codelineno-2-98" href="#__codelineno-2-98"></a> <span class="p">)</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">+</span> <span class="n">params</span><span class="p">[</span><span class="s1">'output_b'</span><span class="p">]</span>
|
||
<a id="__codelineno-2-99" name="__codelineno-2-99" href="#__codelineno-2-99"></a>
|
||
<a id="__codelineno-2-100" name="__codelineno-2-100" href="#__codelineno-2-100"></a> <span class="k">return</span> <span class="n">jnp</span><span class="o">.</span><span class="n">tanh</span><span class="p">(</span><span class="n">out</span><span class="p">)</span>
|
||
<a id="__codelineno-2-101" name="__codelineno-2-101" href="#__codelineno-2-101"></a>
|
||
<a id="__codelineno-2-102" name="__codelineno-2-102" href="#__codelineno-2-102"></a><span class="c1"># 创建一个合成梅尔语谱图(模拟元音)</span>
|
||
<a id="__codelineno-2-103" name="__codelineno-2-103" href="#__codelineno-2-103"></a><span class="n">n_mels</span> <span class="o">=</span> <span class="mi">80</span>
|
||
<a id="__codelineno-2-104" name="__codelineno-2-104" href="#__codelineno-2-104"></a><span class="n">n_frames</span> <span class="o">=</span> <span class="mi">50</span>
|
||
<a id="__codelineno-2-105" name="__codelineno-2-105" href="#__codelineno-2-105"></a><span class="n">mel</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="n">n_frames</span><span class="p">,</span> <span class="n">n_mels</span><span class="p">))</span>
|
||
<a id="__codelineno-2-106" name="__codelineno-2-106" href="#__codelineno-2-106"></a><span class="c1"># 在低频梅尔频带中添加能量(模拟共振峰)</span>
|
||
<a id="__codelineno-2-107" name="__codelineno-2-107" href="#__codelineno-2-107"></a><span class="n">mel</span> <span class="o">=</span> <span class="n">mel</span><span class="o">.</span><span class="n">at</span><span class="p">[:,</span> <span class="p">:,</span> <span class="mi">5</span><span class="p">:</span><span class="mi">15</span><span class="p">]</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)</span>
|
||
<a id="__codelineno-2-108" name="__codelineno-2-108" href="#__codelineno-2-108"></a><span class="n">mel</span> <span class="o">=</span> <span class="n">mel</span><span class="o">.</span><span class="n">at</span><span class="p">[:,</span> <span class="p">:,</span> <span class="mi">20</span><span class="p">:</span><span class="mi">25</span><span class="p">]</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="mf">0.6</span><span class="p">)</span>
|
||
<a id="__codelineno-2-109" name="__codelineno-2-109" href="#__codelineno-2-109"></a>
|
||
<a id="__codelineno-2-110" name="__codelineno-2-110" href="#__codelineno-2-110"></a><span class="c1"># 初始化并运行生成器</span>
|
||
<a id="__codelineno-2-111" name="__codelineno-2-111" href="#__codelineno-2-111"></a><span class="n">key</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">PRNGKey</span><span class="p">(</span><span class="mi">42</span><span class="p">)</span>
|
||
<a id="__codelineno-2-112" name="__codelineno-2-112" href="#__codelineno-2-112"></a><span class="n">params</span> <span class="o">=</span> <span class="n">init_generator</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">n_mels</span><span class="o">=</span><span class="n">n_mels</span><span class="p">,</span> <span class="n">upsample_rates</span><span class="o">=</span><span class="p">(</span><span class="mi">8</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">4</span><span class="p">),</span>
|
||
<a id="__codelineno-2-113" name="__codelineno-2-113" href="#__codelineno-2-113"></a> <span class="n">channels</span><span class="o">=</span><span class="mi">128</span><span class="p">)</span>
|
||
<a id="__codelineno-2-114" name="__codelineno-2-114" href="#__codelineno-2-114"></a><span class="n">waveform</span> <span class="o">=</span> <span class="n">generator_forward</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">mel</span><span class="p">)</span>
|
||
<a id="__codelineno-2-115" name="__codelineno-2-115" href="#__codelineno-2-115"></a>
|
||
<a id="__codelineno-2-116" name="__codelineno-2-116" href="#__codelineno-2-116"></a><span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"输入梅尔形状:</span><span class="si">{</span><span class="n">mel</span><span class="o">.</span><span class="n">shape</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
||
<a id="__codelineno-2-117" name="__codelineno-2-117" href="#__codelineno-2-117"></a><span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"输出波形形状:</span><span class="si">{</span><span class="n">waveform</span><span class="o">.</span><span class="n">shape</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
||
<a id="__codelineno-2-118" name="__codelineno-2-118" href="#__codelineno-2-118"></a><span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"上采样因子:8 * 8 * 4 = </span><span class="si">{</span><span class="mi">8</span><span class="o">*</span><span class="mi">8</span><span class="o">*</span><span class="mi">4</span><span class="si">}</span><span class="s2">x"</span><span class="p">)</span>
|
||
<a id="__codelineno-2-119" name="__codelineno-2-119" href="#__codelineno-2-119"></a>
|
||
<a id="__codelineno-2-120" name="__codelineno-2-120" href="#__codelineno-2-120"></a><span class="n">fig</span><span class="p">,</span> <span class="n">axes</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="mi">6</span><span class="p">))</span>
|
||
<a id="__codelineno-2-121" name="__codelineno-2-121" href="#__codelineno-2-121"></a>
|
||
<a id="__codelineno-2-122" name="__codelineno-2-122" href="#__codelineno-2-122"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">imshow</span><span class="p">(</span><span class="n">mel</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">T</span><span class="p">,</span> <span class="n">aspect</span><span class="o">=</span><span class="s1">'auto'</span><span class="p">,</span> <span class="n">origin</span><span class="o">=</span><span class="s1">'lower'</span><span class="p">,</span> <span class="n">cmap</span><span class="o">=</span><span class="s1">'magma'</span><span class="p">)</span>
|
||
<a id="__codelineno-2-123" name="__codelineno-2-123" href="#__codelineno-2-123"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s1">'输入梅尔语谱图'</span><span class="p">)</span>
|
||
<a id="__codelineno-2-124" name="__codelineno-2-124" href="#__codelineno-2-124"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s1">'梅尔频带'</span><span class="p">)</span>
|
||
<a id="__codelineno-2-125" name="__codelineno-2-125" href="#__codelineno-2-125"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_xlabel</span><span class="p">(</span><span class="s1">'帧'</span><span class="p">)</span>
|
||
<a id="__codelineno-2-126" name="__codelineno-2-126" href="#__codelineno-2-126"></a>
|
||
<a id="__codelineno-2-127" name="__codelineno-2-127" href="#__codelineno-2-127"></a><span class="n">waveform_np</span> <span class="o">=</span> <span class="n">waveform</span><span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="p">:,</span> <span class="mi">0</span><span class="p">]</span>
|
||
<a id="__codelineno-2-128" name="__codelineno-2-128" href="#__codelineno-2-128"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">waveform_np</span><span class="p">[:</span><span class="mi">2000</span><span class="p">],</span> <span class="n">color</span><span class="o">=</span><span class="s1">'#9b59b6'</span><span class="p">,</span> <span class="n">linewidth</span><span class="o">=</span><span class="mf">0.5</span><span class="p">)</span>
|
||
<a id="__codelineno-2-129" name="__codelineno-2-129" href="#__codelineno-2-129"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s1">'生成器输出波形(未经训练 - 随机噪声)'</span><span class="p">)</span>
|
||
<a id="__codelineno-2-130" name="__codelineno-2-130" href="#__codelineno-2-130"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s1">'振幅'</span><span class="p">)</span>
|
||
<a id="__codelineno-2-131" name="__codelineno-2-131" href="#__codelineno-2-131"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_xlabel</span><span class="p">(</span><span class="s1">'样本'</span><span class="p">)</span>
|
||
<a id="__codelineno-2-132" name="__codelineno-2-132" href="#__codelineno-2-132"></a>
|
||
<a id="__codelineno-2-133" name="__codelineno-2-133" href="#__codelineno-2-133"></a><span class="n">plt</span><span class="o">.</span><span class="n">tight_layout</span><span class="p">()</span>
|
||
<a id="__codelineno-2-134" name="__codelineno-2-134" href="#__codelineno-2-134"></a><span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
|
||
<a id="__codelineno-2-135" name="__codelineno-2-135" href="#__codelineno-2-135"></a><span class="nb">print</span><span class="p">(</span><span class="s2">"注意:输出是噪声,因为生成器未经训练。"</span><span class="p">)</span>
|
||
<a id="__codelineno-2-136" name="__codelineno-2-136" href="#__codelineno-2-136"></a><span class="nb">print</span><span class="p">(</span><span class="s2">"在实践中,对抗损失 + 梅尔损失训练会将其塑造成语音。"</span><span class="p">)</span>
|
||
</code></pre></div>
|
||
<ul>
|
||
<li><strong>任务 4:使用简单 RNN 的语音活动检测。</strong> 在合成音频特征上训练一个基于小型 GRU 的 VAD 模型,对帧进行语音或静音分类。</li>
|
||
</ul>
|
||
<div class="highlight"><pre><span></span><code><a id="__codelineno-3-1" name="__codelineno-3-1" href="#__codelineno-3-1"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax</span>
|
||
<a id="__codelineno-3-2" name="__codelineno-3-2" href="#__codelineno-3-2"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax.numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">jnp</span>
|
||
<a id="__codelineno-3-3" name="__codelineno-3-3" href="#__codelineno-3-3"></a><span class="kn">import</span><span class="w"> </span><span class="nn">jax.random</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">jr</span>
|
||
<a id="__codelineno-3-4" name="__codelineno-3-4" href="#__codelineno-3-4"></a><span class="kn">import</span><span class="w"> </span><span class="nn">matplotlib.pyplot</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">plt</span>
|
||
<a id="__codelineno-3-5" name="__codelineno-3-5" href="#__codelineno-3-5"></a>
|
||
<a id="__codelineno-3-6" name="__codelineno-3-6" href="#__codelineno-3-6"></a><span class="c1"># 生成合成对数梅尔能量特征及语音/静音标签</span>
|
||
<a id="__codelineno-3-7" name="__codelineno-3-7" href="#__codelineno-3-7"></a><span class="k">def</span><span class="w"> </span><span class="nf">generate_vad_data</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">n_sequences</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">n_frames</span><span class="o">=</span><span class="mi">200</span><span class="p">,</span> <span class="n">n_features</span><span class="o">=</span><span class="mi">40</span><span class="p">):</span>
|
||
<a id="__codelineno-3-8" name="__codelineno-3-8" href="#__codelineno-3-8"></a><span class="w"> </span><span class="sd">"""模拟对数梅尔特征:语音区域能量更高且具有结构。"""</span>
|
||
<a id="__codelineno-3-9" name="__codelineno-3-9" href="#__codelineno-3-9"></a> <span class="n">keys</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="mi">5</span><span class="p">)</span>
|
||
<a id="__codelineno-3-10" name="__codelineno-3-10" href="#__codelineno-3-10"></a> <span class="n">all_features</span> <span class="o">=</span> <span class="p">[]</span>
|
||
<a id="__codelineno-3-11" name="__codelineno-3-11" href="#__codelineno-3-11"></a> <span class="n">all_labels</span> <span class="o">=</span> <span class="p">[]</span>
|
||
<a id="__codelineno-3-12" name="__codelineno-3-12" href="#__codelineno-3-12"></a>
|
||
<a id="__codelineno-3-13" name="__codelineno-3-13" href="#__codelineno-3-13"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n_sequences</span><span class="p">):</span>
|
||
<a id="__codelineno-3-14" name="__codelineno-3-14" href="#__codelineno-3-14"></a> <span class="n">k</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">fold_in</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">i</span><span class="p">)</span>
|
||
<a id="__codelineno-3-15" name="__codelineno-3-15" href="#__codelineno-3-15"></a> <span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">k3</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">k</span><span class="p">,</span> <span class="mi">3</span><span class="p">)</span>
|
||
<a id="__codelineno-3-16" name="__codelineno-3-16" href="#__codelineno-3-16"></a>
|
||
<a id="__codelineno-3-17" name="__codelineno-3-17" href="#__codelineno-3-17"></a> <span class="c1"># 随机语音/静音模式</span>
|
||
<a id="__codelineno-3-18" name="__codelineno-3-18" href="#__codelineno-3-18"></a> <span class="n">label</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">n_frames</span><span class="p">)</span>
|
||
<a id="__codelineno-3-19" name="__codelineno-3-19" href="#__codelineno-3-19"></a> <span class="n">n_segments</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="p">(),</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
|
||
<a id="__codelineno-3-20" name="__codelineno-3-20" href="#__codelineno-3-20"></a> <span class="k">for</span> <span class="n">seg</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">n_segments</span><span class="p">)):</span>
|
||
<a id="__codelineno-3-21" name="__codelineno-3-21" href="#__codelineno-3-21"></a> <span class="n">start</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="n">jr</span><span class="o">.</span><span class="n">fold_in</span><span class="p">(</span><span class="n">k2</span><span class="p">,</span> <span class="n">seg</span><span class="p">),</span> <span class="p">(),</span> <span class="mi">0</span><span class="p">,</span> <span class="n">n_frames</span> <span class="o">-</span> <span class="mi">20</span><span class="p">)</span>
|
||
<a id="__codelineno-3-22" name="__codelineno-3-22" href="#__codelineno-3-22"></a> <span class="n">length</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="n">jr</span><span class="o">.</span><span class="n">fold_in</span><span class="p">(</span><span class="n">k3</span><span class="p">,</span> <span class="n">seg</span><span class="p">),</span> <span class="p">(),</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">50</span><span class="p">)</span>
|
||
<a id="__codelineno-3-23" name="__codelineno-3-23" href="#__codelineno-3-23"></a> <span class="n">end</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">minimum</span><span class="p">(</span><span class="n">start</span> <span class="o">+</span> <span class="n">length</span><span class="p">,</span> <span class="n">n_frames</span><span class="p">)</span>
|
||
<a id="__codelineno-3-24" name="__codelineno-3-24" href="#__codelineno-3-24"></a> <span class="n">label</span> <span class="o">=</span> <span class="n">label</span><span class="o">.</span><span class="n">at</span><span class="p">[</span><span class="nb">int</span><span class="p">(</span><span class="n">start</span><span class="p">):</span><span class="nb">int</span><span class="p">(</span><span class="n">end</span><span class="p">)]</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="mf">1.0</span><span class="p">)</span>
|
||
<a id="__codelineno-3-25" name="__codelineno-3-25" href="#__codelineno-3-25"></a>
|
||
<a id="__codelineno-3-26" name="__codelineno-3-26" href="#__codelineno-3-26"></a> <span class="c1"># 特征:语音帧具有更高能量 + 频谱结构</span>
|
||
<a id="__codelineno-3-27" name="__codelineno-3-27" href="#__codelineno-3-27"></a> <span class="n">noise</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">jr</span><span class="o">.</span><span class="n">fold_in</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">i</span><span class="p">),</span> <span class="p">(</span><span class="n">n_frames</span><span class="p">,</span> <span class="n">n_features</span><span class="p">))</span> <span class="o">*</span> <span class="mf">0.3</span>
|
||
<a id="__codelineno-3-28" name="__codelineno-3-28" href="#__codelineno-3-28"></a> <span class="n">speech_pattern</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">outer</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">jnp</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="o">-</span><span class="n">jnp</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="n">n_features</span><span class="p">)</span> <span class="o">/</span> <span class="mf">15.0</span><span class="p">))</span>
|
||
<a id="__codelineno-3-29" name="__codelineno-3-29" href="#__codelineno-3-29"></a> <span class="n">features</span> <span class="o">=</span> <span class="n">speech_pattern</span> <span class="o">*</span> <span class="mf">2.0</span> <span class="o">+</span> <span class="n">noise</span> <span class="o">+</span> <span class="mf">0.1</span>
|
||
<a id="__codelineno-3-30" name="__codelineno-3-30" href="#__codelineno-3-30"></a>
|
||
<a id="__codelineno-3-31" name="__codelineno-3-31" href="#__codelineno-3-31"></a> <span class="n">all_features</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">features</span><span class="p">)</span>
|
||
<a id="__codelineno-3-32" name="__codelineno-3-32" href="#__codelineno-3-32"></a> <span class="n">all_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
|
||
<a id="__codelineno-3-33" name="__codelineno-3-33" href="#__codelineno-3-33"></a>
|
||
<a id="__codelineno-3-34" name="__codelineno-3-34" href="#__codelineno-3-34"></a> <span class="k">return</span> <span class="n">jnp</span><span class="o">.</span><span class="n">stack</span><span class="p">(</span><span class="n">all_features</span><span class="p">),</span> <span class="n">jnp</span><span class="o">.</span><span class="n">stack</span><span class="p">(</span><span class="n">all_labels</span><span class="p">)</span>
|
||
<a id="__codelineno-3-35" name="__codelineno-3-35" href="#__codelineno-3-35"></a>
|
||
<a id="__codelineno-3-36" name="__codelineno-3-36" href="#__codelineno-3-36"></a><span class="n">key</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">PRNGKey</span><span class="p">(</span><span class="mi">123</span><span class="p">)</span>
|
||
<a id="__codelineno-3-37" name="__codelineno-3-37" href="#__codelineno-3-37"></a><span class="n">features</span><span class="p">,</span> <span class="n">labels</span> <span class="o">=</span> <span class="n">generate_vad_data</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
|
||
<a id="__codelineno-3-38" name="__codelineno-3-38" href="#__codelineno-3-38"></a><span class="n">train_features</span><span class="p">,</span> <span class="n">train_labels</span> <span class="o">=</span> <span class="n">features</span><span class="p">[:</span><span class="mi">80</span><span class="p">],</span> <span class="n">labels</span><span class="p">[:</span><span class="mi">80</span><span class="p">]</span>
|
||
<a id="__codelineno-3-39" name="__codelineno-3-39" href="#__codelineno-3-39"></a><span class="n">test_features</span><span class="p">,</span> <span class="n">test_labels</span> <span class="o">=</span> <span class="n">features</span><span class="p">[</span><span class="mi">80</span><span class="p">:],</span> <span class="n">labels</span><span class="p">[</span><span class="mi">80</span><span class="p">:]</span>
|
||
<a id="__codelineno-3-40" name="__codelineno-3-40" href="#__codelineno-3-40"></a>
|
||
<a id="__codelineno-3-41" name="__codelineno-3-41" href="#__codelineno-3-41"></a><span class="c1"># 基于 GRU 的简单 VAD 模型</span>
|
||
<a id="__codelineno-3-42" name="__codelineno-3-42" href="#__codelineno-3-42"></a><span class="k">def</span><span class="w"> </span><span class="nf">init_vad_model</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">input_dim</span><span class="o">=</span><span class="mi">40</span><span class="p">,</span> <span class="n">hidden_dim</span><span class="o">=</span><span class="mi">64</span><span class="p">):</span>
|
||
<a id="__codelineno-3-43" name="__codelineno-3-43" href="#__codelineno-3-43"></a> <span class="n">keys</span> <span class="o">=</span> <span class="n">jr</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
|
||
<a id="__codelineno-3-44" name="__codelineno-3-44" href="#__codelineno-3-44"></a> <span class="n">scale_ih</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="mf">2.0</span> <span class="o">/</span> <span class="n">input_dim</span><span class="p">)</span>
|
||
<a id="__codelineno-3-45" name="__codelineno-3-45" href="#__codelineno-3-45"></a> <span class="n">scale_hh</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="mf">2.0</span> <span class="o">/</span> <span class="n">hidden_dim</span><span class="p">)</span>
|
||
<a id="__codelineno-3-46" name="__codelineno-3-46" href="#__codelineno-3-46"></a> <span class="k">return</span> <span class="p">{</span>
|
||
<a id="__codelineno-3-47" name="__codelineno-3-47" href="#__codelineno-3-47"></a> <span class="s1">'W_z'</span><span class="p">:</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="n">input_dim</span><span class="p">,</span> <span class="n">hidden_dim</span><span class="p">))</span> <span class="o">*</span> <span class="n">scale_ih</span><span class="p">,</span>
|
||
<a id="__codelineno-3-48" name="__codelineno-3-48" href="#__codelineno-3-48"></a> <span class="s1">'U_z'</span><span class="p">:</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="p">(</span><span class="n">hidden_dim</span><span class="p">,</span> <span class="n">hidden_dim</span><span class="p">))</span> <span class="o">*</span> <span class="n">scale_hh</span><span class="p">,</span>
|
||
<a id="__codelineno-3-49" name="__codelineno-3-49" href="#__codelineno-3-49"></a> <span class="s1">'b_z'</span><span class="p">:</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">hidden_dim</span><span class="p">),</span>
|
||
<a id="__codelineno-3-50" name="__codelineno-3-50" href="#__codelineno-3-50"></a> <span class="s1">'W_r'</span><span class="p">:</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="p">(</span><span class="n">input_dim</span><span class="p">,</span> <span class="n">hidden_dim</span><span class="p">))</span> <span class="o">*</span> <span class="n">scale_ih</span><span class="p">,</span>
|
||
<a id="__codelineno-3-51" name="__codelineno-3-51" href="#__codelineno-3-51"></a> <span class="s1">'U_r'</span><span class="p">:</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">3</span><span class="p">],</span> <span class="p">(</span><span class="n">hidden_dim</span><span class="p">,</span> <span class="n">hidden_dim</span><span class="p">))</span> <span class="o">*</span> <span class="n">scale_hh</span><span class="p">,</span>
|
||
<a id="__codelineno-3-52" name="__codelineno-3-52" href="#__codelineno-3-52"></a> <span class="s1">'b_r'</span><span class="p">:</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">hidden_dim</span><span class="p">),</span>
|
||
<a id="__codelineno-3-53" name="__codelineno-3-53" href="#__codelineno-3-53"></a> <span class="s1">'W_h'</span><span class="p">:</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">4</span><span class="p">],</span> <span class="p">(</span><span class="n">input_dim</span><span class="p">,</span> <span class="n">hidden_dim</span><span class="p">))</span> <span class="o">*</span> <span class="n">scale_ih</span><span class="p">,</span>
|
||
<a id="__codelineno-3-54" name="__codelineno-3-54" href="#__codelineno-3-54"></a> <span class="s1">'U_h'</span><span class="p">:</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">5</span><span class="p">],</span> <span class="p">(</span><span class="n">hidden_dim</span><span class="p">,</span> <span class="n">hidden_dim</span><span class="p">))</span> <span class="o">*</span> <span class="n">scale_hh</span><span class="p">,</span>
|
||
<a id="__codelineno-3-55" name="__codelineno-3-55" href="#__codelineno-3-55"></a> <span class="s1">'b_h'</span><span class="p">:</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">hidden_dim</span><span class="p">),</span>
|
||
<a id="__codelineno-3-56" name="__codelineno-3-56" href="#__codelineno-3-56"></a> <span class="s1">'W_out'</span><span class="p">:</span> <span class="n">jr</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">jr</span><span class="o">.</span><span class="n">fold_in</span><span class="p">(</span><span class="n">keys</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="mi">99</span><span class="p">),</span> <span class="p">(</span><span class="n">hidden_dim</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span> <span class="o">*</span> <span class="mf">0.1</span><span class="p">,</span>
|
||
<a id="__codelineno-3-57" name="__codelineno-3-57" href="#__codelineno-3-57"></a> <span class="s1">'b_out'</span><span class="p">:</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span>
|
||
<a id="__codelineno-3-58" name="__codelineno-3-58" href="#__codelineno-3-58"></a> <span class="p">}</span>
|
||
<a id="__codelineno-3-59" name="__codelineno-3-59" href="#__codelineno-3-59"></a>
|
||
<a id="__codelineno-3-60" name="__codelineno-3-60" href="#__codelineno-3-60"></a><span class="k">def</span><span class="w"> </span><span class="nf">gru_step</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
|
||
<a id="__codelineno-3-61" name="__codelineno-3-61" href="#__codelineno-3-61"></a><span class="w"> </span><span class="sd">"""单步 GRU。"""</span>
|
||
<a id="__codelineno-3-62" name="__codelineno-3-62" href="#__codelineno-3-62"></a> <span class="n">z</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">sigmoid</span><span class="p">(</span><span class="n">x</span> <span class="o">@</span> <span class="n">params</span><span class="p">[</span><span class="s1">'W_z'</span><span class="p">]</span> <span class="o">+</span> <span class="n">h</span> <span class="o">@</span> <span class="n">params</span><span class="p">[</span><span class="s1">'U_z'</span><span class="p">]</span> <span class="o">+</span> <span class="n">params</span><span class="p">[</span><span class="s1">'b_z'</span><span class="p">])</span>
|
||
<a id="__codelineno-3-63" name="__codelineno-3-63" href="#__codelineno-3-63"></a> <span class="n">r</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">sigmoid</span><span class="p">(</span><span class="n">x</span> <span class="o">@</span> <span class="n">params</span><span class="p">[</span><span class="s1">'W_r'</span><span class="p">]</span> <span class="o">+</span> <span class="n">h</span> <span class="o">@</span> <span class="n">params</span><span class="p">[</span><span class="s1">'U_r'</span><span class="p">]</span> <span class="o">+</span> <span class="n">params</span><span class="p">[</span><span class="s1">'b_r'</span><span class="p">])</span>
|
||
<a id="__codelineno-3-64" name="__codelineno-3-64" href="#__codelineno-3-64"></a> <span class="n">h_tilde</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">tanh</span><span class="p">(</span><span class="n">x</span> <span class="o">@</span> <span class="n">params</span><span class="p">[</span><span class="s1">'W_h'</span><span class="p">]</span> <span class="o">+</span> <span class="p">(</span><span class="n">r</span> <span class="o">*</span> <span class="n">h</span><span class="p">)</span> <span class="o">@</span> <span class="n">params</span><span class="p">[</span><span class="s1">'U_h'</span><span class="p">]</span> <span class="o">+</span> <span class="n">params</span><span class="p">[</span><span class="s1">'b_h'</span><span class="p">])</span>
|
||
<a id="__codelineno-3-65" name="__codelineno-3-65" href="#__codelineno-3-65"></a> <span class="n">h_new</span> <span class="o">=</span> <span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">z</span><span class="p">)</span> <span class="o">*</span> <span class="n">h</span> <span class="o">+</span> <span class="n">z</span> <span class="o">*</span> <span class="n">h_tilde</span>
|
||
<a id="__codelineno-3-66" name="__codelineno-3-66" href="#__codelineno-3-66"></a> <span class="k">return</span> <span class="n">h_new</span>
|
||
<a id="__codelineno-3-67" name="__codelineno-3-67" href="#__codelineno-3-67"></a>
|
||
<a id="__codelineno-3-68" name="__codelineno-3-68" href="#__codelineno-3-68"></a><span class="k">def</span><span class="w"> </span><span class="nf">vad_forward</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
|
||
<a id="__codelineno-3-69" name="__codelineno-3-69" href="#__codelineno-3-69"></a><span class="w"> </span><span class="sd">"""x: (batch, time, features) -> logits: (batch, time)。"""</span>
|
||
<a id="__codelineno-3-70" name="__codelineno-3-70" href="#__codelineno-3-70"></a> <span class="n">batch_size</span><span class="p">,</span> <span class="n">n_frames</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span>
|
||
<a id="__codelineno-3-71" name="__codelineno-3-71" href="#__codelineno-3-71"></a> <span class="n">hidden_dim</span> <span class="o">=</span> <span class="n">params</span><span class="p">[</span><span class="s1">'W_z'</span><span class="p">]</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
|
||
<a id="__codelineno-3-72" name="__codelineno-3-72" href="#__codelineno-3-72"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">hidden_dim</span><span class="p">))</span>
|
||
<a id="__codelineno-3-73" name="__codelineno-3-73" href="#__codelineno-3-73"></a>
|
||
<a id="__codelineno-3-74" name="__codelineno-3-74" href="#__codelineno-3-74"></a> <span class="n">outputs</span> <span class="o">=</span> <span class="p">[]</span>
|
||
<a id="__codelineno-3-75" name="__codelineno-3-75" href="#__codelineno-3-75"></a> <span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n_frames</span><span class="p">):</span>
|
||
<a id="__codelineno-3-76" name="__codelineno-3-76" href="#__codelineno-3-76"></a> <span class="n">h</span> <span class="o">=</span> <span class="n">gru_step</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">x</span><span class="p">[:,</span> <span class="n">t</span><span class="p">,</span> <span class="p">:])</span>
|
||
<a id="__codelineno-3-77" name="__codelineno-3-77" href="#__codelineno-3-77"></a> <span class="n">logit</span> <span class="o">=</span> <span class="p">(</span><span class="n">h</span> <span class="o">@</span> <span class="n">params</span><span class="p">[</span><span class="s1">'W_out'</span><span class="p">]</span> <span class="o">+</span> <span class="n">params</span><span class="p">[</span><span class="s1">'b_out'</span><span class="p">])</span><span class="o">.</span><span class="n">squeeze</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
|
||
<a id="__codelineno-3-78" name="__codelineno-3-78" href="#__codelineno-3-78"></a> <span class="n">outputs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">logit</span><span class="p">)</span>
|
||
<a id="__codelineno-3-79" name="__codelineno-3-79" href="#__codelineno-3-79"></a>
|
||
<a id="__codelineno-3-80" name="__codelineno-3-80" href="#__codelineno-3-80"></a> <span class="k">return</span> <span class="n">jnp</span><span class="o">.</span><span class="n">stack</span><span class="p">(</span><span class="n">outputs</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
|
||
<a id="__codelineno-3-81" name="__codelineno-3-81" href="#__codelineno-3-81"></a>
|
||
<a id="__codelineno-3-82" name="__codelineno-3-82" href="#__codelineno-3-82"></a><span class="k">def</span><span class="w"> </span><span class="nf">bce_loss</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">features</span><span class="p">,</span> <span class="n">labels</span><span class="p">):</span>
|
||
<a id="__codelineno-3-83" name="__codelineno-3-83" href="#__codelineno-3-83"></a><span class="w"> </span><span class="sd">"""VAD 的二元交叉熵损失。"""</span>
|
||
<a id="__codelineno-3-84" name="__codelineno-3-84" href="#__codelineno-3-84"></a> <span class="n">logits</span> <span class="o">=</span> <span class="n">vad_forward</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">features</span><span class="p">)</span>
|
||
<a id="__codelineno-3-85" name="__codelineno-3-85" href="#__codelineno-3-85"></a> <span class="n">probs</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">sigmoid</span><span class="p">(</span><span class="n">logits</span><span class="p">)</span>
|
||
<a id="__codelineno-3-86" name="__codelineno-3-86" href="#__codelineno-3-86"></a> <span class="n">probs</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">clip</span><span class="p">(</span><span class="n">probs</span><span class="p">,</span> <span class="mf">1e-7</span><span class="p">,</span> <span class="mi">1</span> <span class="o">-</span> <span class="mf">1e-7</span><span class="p">)</span>
|
||
<a id="__codelineno-3-87" name="__codelineno-3-87" href="#__codelineno-3-87"></a> <span class="n">loss</span> <span class="o">=</span> <span class="o">-</span><span class="p">(</span><span class="n">labels</span> <span class="o">*</span> <span class="n">jnp</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">probs</span><span class="p">)</span> <span class="o">+</span> <span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">labels</span><span class="p">)</span> <span class="o">*</span> <span class="n">jnp</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">probs</span><span class="p">))</span>
|
||
<a id="__codelineno-3-88" name="__codelineno-3-88" href="#__codelineno-3-88"></a> <span class="k">return</span> <span class="n">jnp</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">loss</span><span class="p">)</span>
|
||
<a id="__codelineno-3-89" name="__codelineno-3-89" href="#__codelineno-3-89"></a>
|
||
<a id="__codelineno-3-90" name="__codelineno-3-90" href="#__codelineno-3-90"></a><span class="n">grad_fn</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">jit</span><span class="p">(</span><span class="n">jax</span><span class="o">.</span><span class="n">value_and_grad</span><span class="p">(</span><span class="n">bce_loss</span><span class="p">))</span>
|
||
<a id="__codelineno-3-91" name="__codelineno-3-91" href="#__codelineno-3-91"></a>
|
||
<a id="__codelineno-3-92" name="__codelineno-3-92" href="#__codelineno-3-92"></a><span class="c1"># 训练</span>
|
||
<a id="__codelineno-3-93" name="__codelineno-3-93" href="#__codelineno-3-93"></a><span class="n">params</span> <span class="o">=</span> <span class="n">init_vad_model</span><span class="p">(</span><span class="n">jr</span><span class="o">.</span><span class="n">PRNGKey</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
|
||
<a id="__codelineno-3-94" name="__codelineno-3-94" href="#__codelineno-3-94"></a><span class="n">lr</span> <span class="o">=</span> <span class="mf">5e-3</span>
|
||
<a id="__codelineno-3-95" name="__codelineno-3-95" href="#__codelineno-3-95"></a><span class="n">losses</span> <span class="o">=</span> <span class="p">[]</span>
|
||
<a id="__codelineno-3-96" name="__codelineno-3-96" href="#__codelineno-3-96"></a>
|
||
<a id="__codelineno-3-97" name="__codelineno-3-97" href="#__codelineno-3-97"></a><span class="k">for</span> <span class="n">epoch</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">200</span><span class="p">):</span>
|
||
<a id="__codelineno-3-98" name="__codelineno-3-98" href="#__codelineno-3-98"></a> <span class="n">loss_val</span><span class="p">,</span> <span class="n">grads</span> <span class="o">=</span> <span class="n">grad_fn</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">train_features</span><span class="p">,</span> <span class="n">train_labels</span><span class="p">)</span>
|
||
<a id="__codelineno-3-99" name="__codelineno-3-99" href="#__codelineno-3-99"></a> <span class="n">params</span> <span class="o">=</span> <span class="n">jax</span><span class="o">.</span><span class="n">tree</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">,</span> <span class="n">g</span><span class="p">:</span> <span class="n">p</span> <span class="o">-</span> <span class="n">lr</span> <span class="o">*</span> <span class="n">g</span><span class="p">,</span> <span class="n">params</span><span class="p">,</span> <span class="n">grads</span><span class="p">)</span>
|
||
<a id="__codelineno-3-100" name="__codelineno-3-100" href="#__codelineno-3-100"></a> <span class="n">losses</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">loss_val</span><span class="p">))</span>
|
||
<a id="__codelineno-3-101" name="__codelineno-3-101" href="#__codelineno-3-101"></a> <span class="k">if</span> <span class="n">epoch</span> <span class="o">%</span> <span class="mi">50</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
|
||
<a id="__codelineno-3-102" name="__codelineno-3-102" href="#__codelineno-3-102"></a> <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"轮次 </span><span class="si">{</span><span class="n">epoch</span><span class="si">}</span><span class="s2">:损失 = </span><span class="si">{</span><span class="n">loss_val</span><span class="si">:</span><span class="s2">.4f</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
||
<a id="__codelineno-3-103" name="__codelineno-3-103" href="#__codelineno-3-103"></a>
|
||
<a id="__codelineno-3-104" name="__codelineno-3-104" href="#__codelineno-3-104"></a><span class="c1"># 在测试集上评估</span>
|
||
<a id="__codelineno-3-105" name="__codelineno-3-105" href="#__codelineno-3-105"></a><span class="n">test_logits</span> <span class="o">=</span> <span class="n">vad_forward</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">test_features</span><span class="p">)</span>
|
||
<a id="__codelineno-3-106" name="__codelineno-3-106" href="#__codelineno-3-106"></a><span class="n">test_preds</span> <span class="o">=</span> <span class="p">(</span><span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">sigmoid</span><span class="p">(</span><span class="n">test_logits</span><span class="p">)</span> <span class="o">></span> <span class="mf">0.5</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">jnp</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
|
||
<a id="__codelineno-3-107" name="__codelineno-3-107" href="#__codelineno-3-107"></a><span class="n">accuracy</span> <span class="o">=</span> <span class="n">jnp</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">test_preds</span> <span class="o">==</span> <span class="n">test_labels</span><span class="p">)</span>
|
||
<a id="__codelineno-3-108" name="__codelineno-3-108" href="#__codelineno-3-108"></a><span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="se">\n</span><span class="s2">测试准确率:</span><span class="si">{</span><span class="n">accuracy</span><span class="si">:</span><span class="s2">.4f</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
||
<a id="__codelineno-3-109" name="__codelineno-3-109" href="#__codelineno-3-109"></a>
|
||
<a id="__codelineno-3-110" name="__codelineno-3-110" href="#__codelineno-3-110"></a><span class="c1"># 可视化一个测试示例</span>
|
||
<a id="__codelineno-3-111" name="__codelineno-3-111" href="#__codelineno-3-111"></a><span class="n">idx</span> <span class="o">=</span> <span class="mi">0</span>
|
||
<a id="__codelineno-3-112" name="__codelineno-3-112" href="#__codelineno-3-112"></a><span class="n">fig</span><span class="p">,</span> <span class="n">axes</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">subplots</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">figsize</span><span class="o">=</span><span class="p">(</span><span class="mi">14</span><span class="p">,</span> <span class="mi">7</span><span class="p">))</span>
|
||
<a id="__codelineno-3-113" name="__codelineno-3-113" href="#__codelineno-3-113"></a>
|
||
<a id="__codelineno-3-114" name="__codelineno-3-114" href="#__codelineno-3-114"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">imshow</span><span class="p">(</span><span class="n">test_features</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span><span class="o">.</span><span class="n">T</span><span class="p">,</span> <span class="n">aspect</span><span class="o">=</span><span class="s1">'auto'</span><span class="p">,</span> <span class="n">origin</span><span class="o">=</span><span class="s1">'lower'</span><span class="p">,</span> <span class="n">cmap</span><span class="o">=</span><span class="s1">'magma'</span><span class="p">)</span>
|
||
<a id="__codelineno-3-115" name="__codelineno-3-115" href="#__codelineno-3-115"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s1">'对数梅尔能量特征'</span><span class="p">)</span>
|
||
<a id="__codelineno-3-116" name="__codelineno-3-116" href="#__codelineno-3-116"></a><span class="n">axes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s1">'梅尔频带'</span><span class="p">)</span>
|
||
<a id="__codelineno-3-117" name="__codelineno-3-117" href="#__codelineno-3-117"></a>
|
||
<a id="__codelineno-3-118" name="__codelineno-3-118" href="#__codelineno-3-118"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">fill_between</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">200</span><span class="p">),</span> <span class="n">test_labels</span><span class="p">[</span><span class="n">idx</span><span class="p">],</span> <span class="n">alpha</span><span class="o">=</span><span class="mf">0.4</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">'#27ae60'</span><span class="p">,</span>
|
||
<a id="__codelineno-3-119" name="__codelineno-3-119" href="#__codelineno-3-119"></a> <span class="n">label</span><span class="o">=</span><span class="s1">'真实值'</span><span class="p">)</span>
|
||
<a id="__codelineno-3-120" name="__codelineno-3-120" href="#__codelineno-3-120"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">plot</span><span class="p">(</span><span class="n">jax</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">sigmoid</span><span class="p">(</span><span class="n">test_logits</span><span class="p">[</span><span class="n">idx</span><span class="p">]),</span> <span class="n">color</span><span class="o">=</span><span class="s1">'#e74c3c'</span><span class="p">,</span>
|
||
<a id="__codelineno-3-121" name="__codelineno-3-121" href="#__codelineno-3-121"></a> <span class="n">linewidth</span><span class="o">=</span><span class="mf">1.5</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s1">'预测概率'</span><span class="p">)</span>
|
||
<a id="__codelineno-3-122" name="__codelineno-3-122" href="#__codelineno-3-122"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">axhline</span><span class="p">(</span><span class="mf">0.5</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">'gray'</span><span class="p">,</span> <span class="n">linestyle</span><span class="o">=</span><span class="s1">'--'</span><span class="p">,</span> <span class="n">linewidth</span><span class="o">=</span><span class="mf">0.8</span><span class="p">)</span>
|
||
<a id="__codelineno-3-123" name="__codelineno-3-123" href="#__codelineno-3-123"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s1">'语音概率'</span><span class="p">)</span>
|
||
<a id="__codelineno-3-124" name="__codelineno-3-124" href="#__codelineno-3-124"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">legend</span><span class="p">()</span>
|
||
<a id="__codelineno-3-125" name="__codelineno-3-125" href="#__codelineno-3-125"></a><span class="n">axes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s1">'VAD 预测'</span><span class="p">)</span>
|
||
<a id="__codelineno-3-126" name="__codelineno-3-126" href="#__codelineno-3-126"></a>
|
||
<a id="__codelineno-3-127" name="__codelineno-3-127" href="#__codelineno-3-127"></a><span class="n">axes</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">fill_between</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">200</span><span class="p">),</span> <span class="n">test_labels</span><span class="p">[</span><span class="n">idx</span><span class="p">],</span> <span class="n">alpha</span><span class="o">=</span><span class="mf">0.4</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">'#27ae60'</span><span class="p">,</span>
|
||
<a id="__codelineno-3-128" name="__codelineno-3-128" href="#__codelineno-3-128"></a> <span class="n">label</span><span class="o">=</span><span class="s1">'真实值'</span><span class="p">)</span>
|
||
<a id="__codelineno-3-129" name="__codelineno-3-129" href="#__codelineno-3-129"></a><span class="n">axes</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">fill_between</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">200</span><span class="p">),</span> <span class="n">test_preds</span><span class="p">[</span><span class="n">idx</span><span class="p">],</span> <span class="n">alpha</span><span class="o">=</span><span class="mf">0.4</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s1">'#f39c12'</span><span class="p">,</span>
|
||
<a id="__codelineno-3-130" name="__codelineno-3-130" href="#__codelineno-3-130"></a> <span class="n">label</span><span class="o">=</span><span class="s1">'预测(阈值=0.5)'</span><span class="p">)</span>
|
||
<a id="__codelineno-3-131" name="__codelineno-3-131" href="#__codelineno-3-131"></a><span class="n">axes</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s1">'语音 / 静音'</span><span class="p">)</span>
|
||
<a id="__codelineno-3-132" name="__codelineno-3-132" href="#__codelineno-3-132"></a><span class="n">axes</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">set_xlabel</span><span class="p">(</span><span class="s1">'帧'</span><span class="p">)</span>
|
||
<a id="__codelineno-3-133" name="__codelineno-3-133" href="#__codelineno-3-133"></a><span class="n">axes</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">legend</span><span class="p">()</span>
|
||
<a id="__codelineno-3-134" name="__codelineno-3-134" href="#__codelineno-3-134"></a><span class="n">axes</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">.</span><span class="n">set_title</span><span class="p">(</span><span class="s1">'VAD 二值决策'</span><span class="p">)</span>
|
||
<a id="__codelineno-3-135" name="__codelineno-3-135" href="#__codelineno-3-135"></a>
|
||
<a id="__codelineno-3-136" name="__codelineno-3-136" href="#__codelineno-3-136"></a><span class="n">plt</span><span class="o">.</span><span class="n">tight_layout</span><span class="p">()</span>
|
||
<a id="__codelineno-3-137" name="__codelineno-3-137" href="#__codelineno-3-137"></a><span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
|
||
</code></pre></div>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
</article>
|
||
</div>
|
||
|
||
|
||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||
</div>
|
||
|
||
<button type="button" class="md-top md-icon" data-md-component="top" hidden>
|
||
|
||
  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8z"/></svg>
|
||
回到页面顶部
|
||
</button>
|
||
|
||
</main>
|
||
|
||
<footer class="md-footer">
|
||
|
||
|
||
|
||
<nav class="md-footer__inner md-grid" aria-label="页脚" >
|
||
|
||
|
||
<a href="../02.%20automatic%20speech%20recognition/" class="md-footer__link md-footer__link--prev" aria-label="上一页: 自动语音识别">
|
||
<div class="md-footer__button md-icon">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
|
||
</div>
|
||
<div class="md-footer__title">
|
||
<span class="md-footer__direction">
|
||
上一页
|
||
</span>
|
||
<div class="md-ellipsis">
|
||
自动语音识别
|
||
</div>
|
||
</div>
|
||
</a>
|
||
|
||
|
||
|
||
<a href="../04.%20speaker%20and%20audio%20analysis/" class="md-footer__link md-footer__link--next" aria-label="下一页: 说话人与音频分析">
|
||
<div class="md-footer__title">
|
||
<span class="md-footer__direction">
|
||
下一页
|
||
</span>
|
||
<div class="md-ellipsis">
|
||
说话人与音频分析
|
||
</div>
|
||
</div>
|
||
<div class="md-footer__button md-icon">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11z"/></svg>
|
||
</div>
|
||
</a>
|
||
|
||
</nav>
|
||
|
||
|
||
<div class="md-footer-meta md-typeset">
|
||
<div class="md-footer-meta__inner md-grid">
|
||
<div class="md-copyright">
|
||
|
||
|
||
Made with
|
||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||
Material for MkDocs
|
||
</a>
|
||
|
||
</div>
|
||
|
||
|
||
<div class="md-social">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
    <a href="https://github.com/flykhan/maths-cs-ai-compendium-zh" target="_blank" rel="noopener" title="github.com" class="md-social__link" aria-label="GitHub">
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M173.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6m-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3m44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9M252.8 8C114.1 8 8 113.3 8 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C436.2 457.8 504 362.9 504 252 504 113.3 391.5 8 252.8 8M105.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1m-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7m32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1m-11.4-14.7c-1.6 1-1.6 3.6 0 5.9s4.3 3.3 5.6 2.3c1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2"/></svg>
|
||
</a>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</div>
|
||
</footer>
|
||
|
||
</div>
|
||
<div class="md-dialog" data-md-component="dialog">
|
||
<div class="md-dialog__inner md-typeset"></div>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": ["navigation.tabs", "navigation.sections", "navigation.expand", "navigation.top", "navigation.footer", "search.suggest", "search.highlight", "content.code.copy", "toc.follow"], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "\u5df2\u590d\u5236", "clipboard.copy": "\u590d\u5236", "search.result.more.one": "\u5728\u8be5\u9875\u4e0a\u8fd8\u6709 1 \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.more.other": "\u5728\u8be5\u9875\u4e0a\u8fd8\u6709 # \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.none": "\u6ca1\u6709\u627e\u5230\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.one": "\u627e\u5230 1 \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.other": "# \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.placeholder": "\u952e\u5165\u4ee5\u5f00\u59cb\u641c\u7d22", "search.result.term.missing": "\u7f3a\u5c11", "select.version": "\u9009\u62e9\u5f53\u524d\u7248\u672c"}, "version": null}</script>
|
||
|
||
|
||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||
|
||
<script src="../../javascripts/mathjax.js"></script>
|
||
|
||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||
|
||
|
||
</body>
|
||
</html> |