Commit 41699fe
examples : add embd_to_audio to tts-outetts.py
This commit contains a suggestion for adding the missing embd_to_audio function from tts.cpp to tts-outetts.py. This function uses numpy, which I was not sure is acceptable (only PyTorch was mentioned in the referenced PR). The README has also been updated with instructions for running the example with llama-server and the python script. Refs: ggml-org#10784 (comment)
1 parent bbf3e55 commit 41699fe
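For orientation, here is a minimal sketch of how the two helpers added in this commit are used. This is an editor's illustration rather than part of the diff: the embedding data and its `n_codes`/`n_embd` dimensions are placeholders, and `embd_to_audio`/`save_wav` are assumed to be in scope from the code added below.

```python
# Editor's sketch (not part of the commit): illustrative use of the new helpers.
# The embedding values and the n_codes/n_embd dimensions are placeholders.
import numpy as np

n_codes, n_embd = 90, 1282                            # e.g. the values printed by the example
embd = np.zeros(n_codes * n_embd, dtype=np.float32)   # flat spectrogram from the voice decoder

audio = embd_to_audio(embd, n_codes, n_embd)          # inverse STFT + windowed overlap-add
save_wav("output.wav", audio, 24000)                  # 16-bit PCM WAV at 24 kHz
```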

File tree

2 files changed: +162 -2 lines

examples/tts/README.md

+35
@@ -78,3 +78,38 @@ play the audio:
 $ aplay output.wav
 ```
 
+### Running the example with llama-server
+Running this example with `llama-server` is also possible and requires two
+server instances to be started. One will serve the LLM model and the other
+will serve the voice decoder model.
+
+The LLM model server can be started with the following command:
+```console
+$ ./build/bin/llama-server -m ./models/outetts-0.2-0.5B-q8_0.gguf --port 8020
+```
+And the voice decoder model server can be started using:
+```console
+./build/bin/llama-server -m ./models/wavtokenizer-large-75-f16.gguf --port 8021 --embeddings --pooling none
+```
+Then we can run the following python script to generate the audio.
+
+First create a virtual environment for python and install the required
+dependencies (this is only required to be done once):
+```console
+$ python3 -m venv venv
+$ source venv/bin/activate
+(venv) pip install requests numpy
+```
+
+And then run the script using:
+```console
+(venv) python ./examples/tts/tts-outetts.py http://localhost:8020 http://localhost:8021 "Hello world"
+spectrogram generated: n_codes: 90, n_embd: 1282
+converting to audio ...
+audio generated: 28800 samples
+audio written to file "output.wav"
+```
+And to play the audio we can again use aplay or any other media player:
+```console
+$ aplay output.wav
+```
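As background for the instructions above, the sketch below shows roughly how a Python client could drive the two `llama-server` instances. It is illustrative only and not part of this commit; the `/completion` and `/embeddings` endpoint paths, the request fields, and the response shapes are assumptions here, and the actual request handling lives in `tts-outetts.py`.

```python
# Editor's sketch (not part of the commit): talking to the two servers started above.
# Endpoint paths, request fields and response shapes are assumptions for illustration.
import requests

host_llm = "http://localhost:8020"   # serves the OuteTTS LLM model
host_dec = "http://localhost:8021"   # serves the WavTokenizer voice decoder (--embeddings)

# 1) Ask the LLM server to generate audio codes for the text prompt
#    (the real script builds a dedicated TTS prompt from the input text).
llm_json = requests.post(host_llm + "/completion", json={
    "prompt": "Hello world",
    "n_predict": 1024,
    "return_tokens": True,
}).json()
codes = llm_json.get("tokens", [])

# 2) Send the codes to the decoder server; with `--pooling none` it is assumed to
#    return one embedding vector per code, which embd_to_audio turns into samples.
dec_json = requests.post(host_dec + "/embeddings", json={"input": codes}).json()
embd = dec_json[0]["embedding"]      # assumed shape: n_codes x n_embd

print("received %d embeddings" % len(embd))
```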

examples/tts/tts-outetts.py

+127 -2
@@ -3,6 +3,122 @@
 #import struct
 import requests
 import re
+import struct
+import numpy as np
+from concurrent.futures import ThreadPoolExecutor
+
+
+def fill_hann_window(size, periodic=True):
+    if periodic:
+        return np.hanning(size + 1)[:-1]
+    return np.hanning(size)
+
+
+def irfft(n_fft, complex_input):
+    # Ensure proper shape for IFFT
+    return np.fft.irfft(complex_input, n=n_fft)
+
+
+def fold(buffer, n_out, n_win, n_hop, n_pad):
+    result = np.zeros(n_out)
+    n_frames = len(buffer) // n_win
+
+    for i in range(n_frames):
+        start = i * n_hop
+        end = start + n_win
+        result[start:end] += buffer[i * n_win:(i + 1) * n_win]
+
+    return result[n_pad:-n_pad] if n_pad > 0 else result
+
+
+def process_frame(args):
+    l, n_fft, ST, hann = args
+    frame = irfft(n_fft, ST[l])
+    frame = frame * hann
+    hann2 = hann * hann
+    return frame, hann2
+
+
+def embd_to_audio(embd, n_codes, n_embd, n_thread=4):
+    embd = np.asarray(embd, dtype=np.float32).reshape(n_codes, n_embd)
+
+    n_fft = 1280
+    n_hop = 320
+    n_win = 1280
+    n_pad = (n_win - n_hop) // 2
+    n_out = (n_codes - 1) * n_hop + n_win
+
+    hann = fill_hann_window(n_fft, True)
+
+    E = np.zeros((n_embd, n_codes), dtype=np.float32)
+    for l in range(n_codes):
+        for k in range(n_embd):
+            E[k, l] = embd[l, k]
+
+    half_embd = n_embd // 2
+    S = np.zeros((n_codes, half_embd + 1), dtype=np.complex64)
+
+    for k in range(half_embd):
+        for l in range(n_codes):
+            mag = E[k, l]
+            phi = E[k + half_embd, l]
+
+            mag = np.clip(np.exp(mag), 0, 1e2)
+            S[l, k] = mag * np.exp(1j * phi)
+
+    res = np.zeros(n_codes * n_fft)
+    hann2_buffer = np.zeros(n_codes * n_fft)
+
+    with ThreadPoolExecutor(max_workers=n_thread) as executor:
+        args = [(l, n_fft, S, hann) for l in range(n_codes)]
+        results = list(executor.map(process_frame, args))
+
+        for l, (frame, hann2) in enumerate(results):
+            res[l*n_fft:(l+1)*n_fft] = frame
+            hann2_buffer[l*n_fft:(l+1)*n_fft] = hann2
+
+    audio = fold(res, n_out, n_win, n_hop, n_pad)
+    env = fold(hann2_buffer, n_out, n_win, n_hop, n_pad)
+
+    mask = env > 1e-10
+    audio[mask] /= env[mask]
+
+    return audio
+
+
+def save_wav(filename, audio_data, sample_rate):
+    num_channels = 1
+    bits_per_sample = 16
+    bytes_per_sample = bits_per_sample // 8
+    data_size = len(audio_data) * bytes_per_sample
+    byte_rate = sample_rate * num_channels * bytes_per_sample
+    block_align = num_channels * bytes_per_sample
+    chunk_size = 36 + data_size # 36 = size of header minus first 8 bytes
+
+    header = struct.pack(
+        '<4sI4s4sIHHIIHH4sI',
+        b'RIFF',
+        chunk_size,
+        b'WAVE',
+        b'fmt ',
+        16, # fmt chunk size
+        1, # audio format (PCM)
+        num_channels,
+        sample_rate,
+        byte_rate,
+        block_align,
+        bits_per_sample,
+        b'data',
+        data_size
+    )
+
+    audio_data = np.clip(audio_data * 32767, -32768, 32767)
+    pcm_data = audio_data.astype(np.int16)
+
+    with open(filename, 'wb') as f:
+        f.write(header)
+        f.write(pcm_data.tobytes())
+
 
 def process_text(text: str):
     text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed
@@ -170,6 +286,15 @@ def process_text(text: str):
 print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd))
 
 # post-process the spectrogram to convert to audio
-# TODO: see the tts.cpp:embd_to_audio() and implement it in Python
 print('converting to audio ...')
-print('TODO: see the tts.cpp:embd_to_audio() and implement it in Python')
+audio = embd_to_audio(embd, n_codes, n_embd)
+print('audio generated: %d samples' % len(audio))
+
+filename = "output.wav"
+sample_rate = 24000 # sampling rate
+
+# zero out first 0.25 seconds
+audio[:24000 // 4] = 0.0
+
+save_wav(filename, audio, sample_rate)
+print('audio written to file "%s"' % filename)
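As a quick cross-check (editor's note, not part of the commit), the "28800 samples" reported in the README example output follows directly from the hop and window sizes used in `embd_to_audio`:

```python
# Editor's sketch: why the README example reports 28800 samples for 90 codes.
n_codes = 90                            # codes generated for "Hello world"
n_hop, n_win = 320, 1280                # hop and window size used by embd_to_audio
n_pad = (n_win - n_hop) // 2            # 480 samples trimmed from each end by fold()

n_out = (n_codes - 1) * n_hop + n_win   # 29760 samples before trimming
print(n_out - 2 * n_pad)                # 28800 samples, i.e. 1.2 s at 24000 Hz
```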
