
Commit 63d1dc7

Python: Text to audio (#9625)
### Motivation and Context

Addresses: #7433

### Description

Add a text-to-audio interface to the Python Semantic Kernel.

### Contribution Checklist

1. Text-to-audio client base.
2. OpenAI and Azure OpenAI implementations of the client.
3. Unit tests and integration tests.
4. Samples.

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄
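A minimal sketch of the new surface, assembled from the samples in this PR (the no-argument, environment-based construction of `AzureTextToAudio` and the `get_audio_content`/`.data` usage are taken from the samples; anything beyond that is an assumption, not a definitive API reference):

```python
import asyncio

from semantic_kernel.connectors.ai.open_ai import (
    AzureTextToAudio,
    OpenAITextToAudioExecutionSettings,
)


async def main() -> None:
    # With no arguments, the service reads its deployment name, endpoint,
    # and API key from the environment (see the workflow variables below).
    service = AzureTextToAudio()
    audio = await service.get_audio_content(
        "Hello from Semantic Kernel!",
        OpenAITextToAudioExecutionSettings(response_format="wav"),
    )
    print(f"received {len(audio.data)} bytes of wav audio")


asyncio.run(main())
```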
1 parent 5764c8c commit 63d1dc7

Showing 34 changed files with 1,019 additions and 36 deletions.

.github/workflows/python-integration-tests.yml (+14 −2)
```diff
@@ -64,13 +64,19 @@ jobs:
       AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME }} # azure-text-embedding-ada-002
       AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
       AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
-      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME }}
+      AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME }}
+      AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME }}
       AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
       AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+      AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT: ${{ secrets.AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT }}
       BING_API_KEY: ${{ secrets.BING_API_KEY }}
       OPENAI_CHAT_MODEL_ID: ${{ vars.OPENAI_CHAT_MODEL_ID }}
       OPENAI_TEXT_MODEL_ID: ${{ vars.OPENAI_TEXT_MODEL_ID }}
       OPENAI_EMBEDDING_MODEL_ID: ${{ vars.OPENAI_EMBEDDING_MODEL_ID }}
+      OPENAI_AUDIO_TO_TEXT_MODEL_ID: ${{ vars.OPENAI_AUDIO_TO_TEXT_MODEL_ID }}
+      OPENAI_TEXT_TO_AUDIO_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_AUDIO_MODEL_ID }}
       OPENAI_TEXT_TO_IMAGE_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_IMAGE_MODEL_ID }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       PINECONE_API_KEY: ${{ secrets.PINECONE__APIKEY }}
```
```diff
@@ -233,13 +239,19 @@
       AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME }} # azure-text-embedding-ada-002
       AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
       AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
-      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME }}
+      AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME }}
+      AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME }}
       AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
       AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+      AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT: ${{ secrets.AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT }}
       BING_API_KEY: ${{ secrets.BING_API_KEY }}
       OPENAI_CHAT_MODEL_ID: ${{ vars.OPENAI_CHAT_MODEL_ID }}
       OPENAI_TEXT_MODEL_ID: ${{ vars.OPENAI_TEXT_MODEL_ID }}
       OPENAI_EMBEDDING_MODEL_ID: ${{ vars.OPENAI_EMBEDDING_MODEL_ID }}
+      OPENAI_AUDIO_TO_TEXT_MODEL_ID: ${{ vars.OPENAI_AUDIO_TO_TEXT_MODEL_ID }}
+      OPENAI_TEXT_TO_AUDIO_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_AUDIO_MODEL_ID }}
       OPENAI_TEXT_TO_IMAGE_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_IMAGE_MODEL_ID }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       PINECONE_API_KEY: ${{ secrets.PINECONE__APIKEY }}
```
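The new `*_DEPLOYMENT_NAME`, `*_ENDPOINT`, and `*_MODEL_ID` variables mirror what the audio services read at construction time, since the samples in this PR construct them with no arguments. As a hedged sketch of that wiring (the keyword names below are assumptions modeled on the repo's other Azure OpenAI services; this diff does not confirm them):

```python
import os

from semantic_kernel.connectors.ai.open_ai import AzureTextToAudio

# Explicit construction for environments where the settings cannot be
# injected from the environment; keyword names are assumptions.
service = AzureTextToAudio(
    deployment_name=os.environ["AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME"],
    endpoint=os.environ["AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
)
```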

python/samples/concepts/audio_to_text/chat_with_audio_input.py renamed to python/samples/concepts/audio/01-chat_with_audio_input.py (+9 −6)
```diff
@@ -4,19 +4,22 @@
 import logging
 import os
 
-from samples.concepts.audio_to_text.audio_recorder import AudioRecorder
-from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
-from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
+from samples.concepts.audio.audio_recorder import AudioRecorder
+from semantic_kernel.connectors.ai.open_ai import (
+    AzureAudioToText,
+    AzureChatCompletion,
     OpenAIChatPromptExecutionSettings,
 )
-from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
-from semantic_kernel.contents import ChatHistory
-from semantic_kernel.contents.audio_content import AudioContent
+from semantic_kernel.contents import AudioContent, ChatHistory
 
 # This simple sample demonstrates how to use the AzureChatCompletion and AzureAudioToText services
 # to create a chat bot that can communicate with the user using audio input.
 # The user can engage in a long conversation with the chat bot by speaking to it.
 
+# Resources required for this sample:
+# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
+# 2. An Azure Speech to Text deployment (e.g. whisper).
+
 # Additional dependencies required for this sample:
 # - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
 # - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.
```
python/samples/concepts/audio/02-chat_with_audio_output.py (new file, +95)

```python
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging

from samples.concepts.audio.audio_player import AudioPlayer
from semantic_kernel.connectors.ai.open_ai import (
    AzureChatCompletion,
    AzureTextToAudio,
    OpenAIChatPromptExecutionSettings,
    OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.contents import ChatHistory

# This simple sample demonstrates how to use the AzureChatCompletion and AzureTextToAudio services
# to create a chat bot that can communicate with the user using audio output.
# The chatbot will engage in a conversation with the user and respond using audio output.

# Resources required for this sample:
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
# 2. An Azure Text to Speech deployment (e.g. tts).

# Additional dependencies required for this sample:
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.


logging.basicConfig(level=logging.WARNING)

system_message = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""


chat_service = AzureChatCompletion()
text_to_audio_service = AzureTextToAudio()

history = ChatHistory(system_message=system_message)
history.add_user_message("Hi there, who are you?")
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")


async def chat() -> bool:
    try:
        user_input = input("User:> ")
    except KeyboardInterrupt:
        print("\n\nExiting chat...")
        return False
    except EOFError:
        print("\n\nExiting chat...")
        return False

    if user_input == "exit":
        print("\n\nExiting chat...")
        return False

    history.add_user_message(user_input)

    # No need to stream the response since we can only pass the
    # response to the text to audio service as a whole
    response = await chat_service.get_chat_message_content(
        chat_history=history,
        settings=OpenAIChatPromptExecutionSettings(
            max_tokens=2000,
            temperature=0.7,
            top_p=0.8,
        ),
    )

    # Need to set the response format to wav since the audio player only supports wav files
    audio_content = await text_to_audio_service.get_audio_content(
        response.content, OpenAITextToAudioExecutionSettings(response_format="wav")
    )
    AudioPlayer(audio_content=audio_content).play()

    print(f"Mosscap:> {response.content}")

    history.add_message(response)

    return True


async def main() -> None:
    chatting = True
    while chatting:
        chatting = await chat()


if __name__ == "__main__":
    asyncio.run(main())
```
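When no audio output device is available, the `AudioPlayer` call in a sample like this can be swapped for a plain file write. A minimal sketch, assuming (as `AudioPlayer` further below does) that `audio_content.data` holds the raw wav bytes:

```python
from pathlib import Path

# Hypothetical fallback: persist the spoken response instead of playing it.
# audio_content comes from get_audio_content(...) as in the sample above.
Path("mosscap_response.wav").write_bytes(audio_content.data)
```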
python/samples/concepts/audio/ — new file (+112), combining the audio input and output samples

```python
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging
import os

from samples.concepts.audio.audio_player import AudioPlayer
from samples.concepts.audio.audio_recorder import AudioRecorder
from semantic_kernel.connectors.ai.open_ai import (
    AzureAudioToText,
    AzureChatCompletion,
    AzureTextToAudio,
    OpenAIChatPromptExecutionSettings,
    OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.contents import AudioContent, ChatHistory

# This simple sample demonstrates how to use the AzureChatCompletion, AzureTextToAudio, and AzureAudioToText
# services to create a chat bot that can communicate with the user using both audio input and output.
# The chatbot will engage in a conversation with the user by audio only.
# This sample combines the functionality of the samples/concepts/audio/01-chat_with_audio_input.py and
# samples/concepts/audio/02-chat_with_audio_output.py samples.

# Resources required for this sample:
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
# 2. An Azure Text to Speech deployment (e.g. tts).
# 3. An Azure Speech to Text deployment (e.g. whisper).

# Additional dependencies required for this sample:
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.


logging.basicConfig(level=logging.WARNING)
AUDIO_FILEPATH = os.path.join(os.path.dirname(__file__), "output.wav")


system_message = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""


chat_service = AzureChatCompletion()
text_to_audio_service = AzureTextToAudio()
audio_to_text_service = AzureAudioToText()

history = ChatHistory(system_message=system_message)
history.add_user_message("Hi there, who are you?")
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")


async def chat() -> bool:
    try:
        print("User:> ", end="", flush=True)
        with AudioRecorder(output_filepath=AUDIO_FILEPATH) as recorder:
            recorder.start_recording()

        user_input = await audio_to_text_service.get_text_content(AudioContent.from_audio_file(AUDIO_FILEPATH))
        print(user_input.text)
    except KeyboardInterrupt:
        print("\n\nExiting chat...")
        return False
    except EOFError:
        print("\n\nExiting chat...")
        return False

    if "exit" in user_input.text.lower():
        print("\n\nExiting chat...")
        return False

    history.add_user_message(user_input.text)

    # No need to stream the response since we can only pass the
    # response to the text to audio service as a whole
    response = await chat_service.get_chat_message_content(
        chat_history=history,
        settings=OpenAIChatPromptExecutionSettings(
            max_tokens=2000,
            temperature=0.7,
            top_p=0.8,
        ),
    )

    # Need to set the response format to wav since the audio player only supports wav files
    audio_content = await text_to_audio_service.get_audio_content(
        response.content, OpenAITextToAudioExecutionSettings(response_format="wav")
    )
    print("Mosscap:> ", end="", flush=True)
    AudioPlayer(audio_content=audio_content).play(text=response.content)

    history.add_message(response)

    return True


async def main() -> None:
    print(
        "Instruction: when it's your turn to speak, press the spacebar to start recording."
        " Release the spacebar to stop recording."
    )

    chatting = True
    while chatting:
        chatting = await chat()


if __name__ == "__main__":
    asyncio.run(main())
```
python/samples/concepts/audio/audio_player.py (new file, +99)

```python
# Copyright (c) Microsoft. All rights reserved.

import io
import logging
import wave
from typing import ClassVar

import pyaudio
from pydantic import BaseModel

from semantic_kernel.contents import AudioContent

logging.basicConfig(level=logging.WARNING)
logger: logging.Logger = logging.getLogger(__name__)


class AudioPlayer(BaseModel):
    """A class to play an audio file to the default audio output device."""

    # Audio replay parameters
    CHUNK: ClassVar[int] = 1024

    audio_content: AudioContent

    def play(self, text: str | None = None) -> None:
        """Play the audio content to the default audio output device.

        Args:
            text (str, optional): The text to display while playing the audio. Defaults to None.
        """
        audio_stream = io.BytesIO(self.audio_content.data)
        with wave.open(audio_stream, "rb") as wf:
            audio = pyaudio.PyAudio()
            stream = audio.open(
                format=audio.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True,
            )

            if text:
                # Simulate the output of text while playing the audio
                data_frames = []

                data = wf.readframes(self.CHUNK)
                while data:
                    data_frames.append(data)
                    data = wf.readframes(self.CHUNK)

                if len(data_frames) < len(text):
                    logger.warning(
                        "The audio is too short to play the entire text. "
                        "The text will be displayed without synchronization."
                    )
                    print(text)
                else:
                    for data_frame, text_frame in self._zip_text_and_audio(text, data_frames):
                        stream.write(data_frame)
                        print(text_frame, end="", flush=True)
                    print()
            else:
                data = wf.readframes(self.CHUNK)
                while data:
                    stream.write(data)
                    data = wf.readframes(self.CHUNK)

            stream.stop_stream()
            stream.close()
            audio.terminate()

    def _zip_text_and_audio(self, text: str, audio_frames: list) -> zip:
        """Zip the text and audio frames together so that they can be displayed in sync.

        This is done by evenly distributing empty strings between the characters and
        appending the remaining empty strings at the end.

        Args:
            text (str): The text to display while playing the audio.
            audio_frames (list): The audio frames to play.

        Returns:
            zip: The zipped text and audio frames.
        """
        text_frames = list(text)
        empty_string_count = len(audio_frames) - len(text_frames)
        # Clamp the spacing (and the count) to at least 1 so the modulo below
        # cannot divide by zero when the audio is much longer than the text.
        empty_string_spacing = max(1, len(text_frames) // max(empty_string_count, 1))

        modified_text_frames = []
        current_empty_string_count = 0
        for i, text_frame in enumerate(text_frames):
            modified_text_frames.append(text_frame)
            if current_empty_string_count < empty_string_count and i % empty_string_spacing == 0:
                modified_text_frames.append("")
                current_empty_string_count += 1

        if current_empty_string_count < empty_string_count:
            modified_text_frames.extend([""] * (empty_string_count - current_empty_string_count))

        return zip(audio_frames, modified_text_frames)
```

python/samples/concepts/audio_to_text/audio_recorder.py renamed to python/samples/concepts/audio/audio_recorder.py (+2 −3)
```diff
@@ -6,11 +6,10 @@
 
 import keyboard
 import pyaudio
+from pydantic import BaseModel
 
-from semantic_kernel.kernel_pydantic import KernelBaseModel
 
-
-class AudioRecorder(KernelBaseModel):
+class AudioRecorder(BaseModel):
     """A class to record audio from the microphone and save it to a WAV file.
 
     To start recording, press the spacebar. To stop recording, release the spacebar.
```
