<?xml version="1.0" encoding="UTF-8"?>
<!--
  Speaker diarisation explainer diagram.

  Layout, top to bottom, in a 700 x 280 user-unit canvas:
    1. title
    2. simulated audio waveform inside a light grey panel
    3. speaker timeline: coloured segments (A, B, C) plus one overlap region,
       with dashed markers at every speaker change
    4. time axis with an arrow head
    5. colour legend
    6. explanatory note box

  Translucent fills are written as an opaque hex colour plus fill-opacity
  rather than rgba(), because rgba() in a presentation attribute is not part
  of the SVG 1.1 colour syntax and some non-browser renderers may reject it.
-->
<svg xmlns="http://www.w3.org/2000/svg"
     width="700" height="280" viewBox="0 0 700 280"
     role="img" aria-labelledby="title-sd desc-sd"
     font-family="Arial, Helvetica, sans-serif">
  <!-- Accessible name and description; ids carry the same "-sd" suffix as the marker id. -->
  <title id="title-sd">Speaker Diarisation</title>
  <desc id="desc-sd">An audio signal waveform above a speaker timeline. In time order the timeline segments are: Speaker A, Speaker B, an overlap region, Speaker A, Speaker C, Speaker B, Speaker A.</desc>

  <defs>
    <!-- Arrow head for the time axis; refX=8 places the tip exactly on the line end. -->
    <marker id="arrow-sd" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <polygon points="0,0 8,3 0,6" fill="#333"/>
    </marker>
  </defs>

  <!-- Title -->
  <text x="350" y="22" text-anchor="middle" font-size="14" font-weight="bold" fill="#333">Speaker Diarisation</text>

  <!-- Waveform background -->
  <rect x="40" y="40" width="620" height="55" rx="4" fill="#666666" fill-opacity="0.05" stroke="#999" stroke-width="0.8"/>
  <text x="50" y="55" font-size="8" fill="#666">Audio Signal</text>

  <!-- Simulated waveform: a hand-placed zig-zag spanning x=50..650, the same extent as the timeline below. -->
  <polyline fill="none" stroke="#999" stroke-width="1"
            points="50,68 58,58 65,75 72,55 80,78 88,52 95,72 102,60 110,68 118,55 125,78 132,48 140,75 148,58 155,68 162,62 170,72 178,55 185,78 192,52 200,70 208,58 215,75 222,48 230,72 238,60 245,68 252,55 260,75 268,52 275,68 282,62 290,72 298,55 305,78 312,48 320,70 328,58 335,75 342,52 350,68 358,60 365,72 372,55 380,78 388,52 395,68 402,62 410,55 418,75 425,48 432,72 440,58 448,68 455,55 462,75 470,52 478,68 485,62 492,72 500,55 508,78 515,48 522,70 530,58 538,75 545,52 552,68 560,60 568,72 575,55 582,78 590,52 598,68 605,60 612,72 620,55 628,68 635,60 645,68 650,62"/>

  <!-- Speaker segment timeline -->
  <text x="50" y="115" font-size="9" font-weight="bold" fill="#333">Speaker Timeline</text>

  <!-- Speaker A segments (blue): x = 50..170, 310..400, 540..650 -->
  <rect x="50" y="122" width="120" height="28" rx="3" fill="#3498db" fill-opacity="0.2" stroke="#3498db" stroke-width="1.5"/>
  <text x="110" y="140" text-anchor="middle" font-size="9" fill="#3498db">Speaker A</text>

  <rect x="310" y="122" width="90" height="28" rx="3" fill="#3498db" fill-opacity="0.2" stroke="#3498db" stroke-width="1.5"/>
  <text x="355" y="140" text-anchor="middle" font-size="9" fill="#3498db">Speaker A</text>

  <rect x="540" y="122" width="110" height="28" rx="3" fill="#3498db" fill-opacity="0.2" stroke="#3498db" stroke-width="1.5"/>
  <text x="595" y="140" text-anchor="middle" font-size="9" fill="#3498db">Speaker A</text>

  <!-- Speaker B segments (red): x = 170..260, 450..540 -->
  <!-- The first segment ends at x=260, where the transition marker and the overlap
       region begin, so that every segment abuts its neighbour without spilling over. -->
  <rect x="170" y="122" width="90" height="28" rx="3" fill="#e74c3c" fill-opacity="0.2" stroke="#e74c3c" stroke-width="1.5"/>
  <text x="215" y="140" text-anchor="middle" font-size="9" fill="#e74c3c">Speaker B</text>

  <rect x="450" y="122" width="90" height="28" rx="3" fill="#e74c3c" fill-opacity="0.2" stroke="#e74c3c" stroke-width="1.5"/>
  <text x="495" y="140" text-anchor="middle" font-size="9" fill="#e74c3c">Speaker B</text>

  <!-- Speaker C segment (green): x = 400..450; label shortened to "C" to fit the narrow box -->
  <rect x="400" y="122" width="50" height="28" rx="3" fill="#27ae60" fill-opacity="0.2" stroke="#27ae60" stroke-width="1.5"/>
  <text x="425" y="140" text-anchor="middle" font-size="9" fill="#27ae60">C</text>

  <!-- Overlap region (purple): x = 260..310 -->
  <rect x="260" y="122" width="50" height="28" rx="3" fill="#9b59b6" fill-opacity="0.25" stroke="#9b59b6" stroke-width="1.5"/>
  <text x="285" y="140" text-anchor="middle" font-size="8" fill="#9b59b6">Overlap</text>

  <!-- Transition markers: one dashed vertical line at each interior segment boundary.
       The shared stroke styling lives on the group so each line only states its position. -->
  <g stroke="#666" stroke-width="0.7" stroke-dasharray="2,2">
    <line x1="170" y1="118" x2="170" y2="155"/>
    <line x1="260" y1="118" x2="260" y2="155"/>
    <line x1="310" y1="118" x2="310" y2="155"/>
    <line x1="400" y1="118" x2="400" y2="155"/>
    <line x1="450" y1="118" x2="450" y2="155"/>
    <line x1="540" y1="118" x2="540" y2="155"/>
  </g>

  <!-- Time axis -->
  <line x1="50" y1="160" x2="650" y2="160" stroke="#333" stroke-width="1" marker-end="url(#arrow-sd)"/>
  <text x="655" y="164" font-size="8" fill="#666">time</text>

  <!-- Legend: swatches reuse the exact fill and stroke of the timeline segments -->
  <rect x="100" y="175" width="14" height="14" rx="2" fill="#3498db" fill-opacity="0.2" stroke="#3498db" stroke-width="1"/>
  <text x="120" y="186" font-size="9" fill="#333">Speaker A</text>
  <rect x="200" y="175" width="14" height="14" rx="2" fill="#e74c3c" fill-opacity="0.2" stroke="#e74c3c" stroke-width="1"/>
  <text x="220" y="186" font-size="9" fill="#333">Speaker B</text>
  <rect x="300" y="175" width="14" height="14" rx="2" fill="#27ae60" fill-opacity="0.2" stroke="#27ae60" stroke-width="1"/>
  <text x="320" y="186" font-size="9" fill="#333">Speaker C</text>
  <rect x="400" y="175" width="14" height="14" rx="2" fill="#9b59b6" fill-opacity="0.25" stroke="#9b59b6" stroke-width="1"/>
  <text x="420" y="186" font-size="9" fill="#333">Overlap</text>

  <!-- Note box -->
  <rect x="30" y="210" width="640" height="55" rx="6" fill="#f5f5f5" stroke="#333" stroke-width="1"/>
  <text x="50" y="230" font-size="10" font-weight="bold" fill="#333">Diarisation: who spoke when?</text>
  <text x="50" y="248" font-size="9" fill="#666">Combines voice activity detection (VAD), speaker embedding extraction, and clustering to segment</text>
  <text x="50" y="260" font-size="9" fill="#666">audio by speaker identity. Overlap regions require special handling.</text>
</svg>