Add support for reasoning models and token usage display #2448

Merged on Apr 2, 2025 (41 commits; changes shown from 37 commits)

Commits
f59979d  WIP  (mattgotteiner, Mar 22, 2025)
9ba2353  WIP  (mattgotteiner, Mar 23, 2025)
a2d6e31  ruff, black  (mattgotteiner, Mar 23, 2025)
ab27f5e  adding usage  (mattgotteiner, Mar 23, 2025)
80bcfb4  mypy  (mattgotteiner, Mar 23, 2025)
79b682b  ruff, black  (mattgotteiner, Mar 23, 2025)
28d5bfd  mypy, ruff, black, and update generate thought steps  (mattgotteiner, Mar 23, 2025)
4612ccc  fix comments, set answer thought tag on streaming approaches  (mattgotteiner, Mar 23, 2025)
aae8966  fixing frontend  (mattgotteiner, Mar 23, 2025)
84d3e94  fixing backend + frontend  (mattgotteiner, Mar 24, 2025)
cef2cea  token graph fixup  (mattgotteiner, Mar 24, 2025)
403e294  fix token usage for non-streaming response  (mattgotteiner, Mar 24, 2025)
a22d993  re-style token graph  (mattgotteiner, Mar 24, 2025)
1136185  updates  (mattgotteiner, Mar 24, 2025)
e429ac6  Merge branch 'Azure-Samples:main' into matt/reasoning  (mattgotteiner, Mar 25, 2025)
8420090  adddressing feedback  (mattgotteiner, Mar 27, 2025)
2d00599  merging  (mattgotteiner, Mar 27, 2025)
e5e462d  ruff, black  (mattgotteiner, Mar 27, 2025)
c332053  prettify  (mattgotteiner, Mar 27, 2025)
e68f4c3  fixing typing errors  (mattgotteiner, Mar 27, 2025)
848abc4  ruff, black  (mattgotteiner, Mar 27, 2025)
1354281  mypy, ruff  (mattgotteiner, Mar 28, 2025)
8c1a88f  prettier  (mattgotteiner, Mar 28, 2025)
0854f29  trying to fix test failures  (mattgotteiner, Mar 31, 2025)
88a8785  fixing test failures  (mattgotteiner, Mar 31, 2025)
9998c3c  fix streaming allowed setup  (mattgotteiner, Mar 31, 2025)
8f40ada  black  (mattgotteiner, Mar 31, 2025)
cc17995  try to fix e2e test  (mattgotteiner, Mar 31, 2025)
8347150  add tests; updating env vars  (mattgotteiner, Mar 31, 2025)
39bf02c  adding tests  (mattgotteiner, Apr 1, 2025)
d131248  rerecording  (mattgotteiner, Apr 1, 2025)
1f8e166  more recording  (mattgotteiner, Apr 1, 2025)
ca0ae46  add tests; ruff, black  (mattgotteiner, Apr 2, 2025)
41a3d2f  run prettier  (mattgotteiner, Apr 2, 2025)
8c98686  update docs  (mattgotteiner, Apr 2, 2025)
21e9958  fix test  (mattgotteiner, Apr 2, 2025)
8eef1a3  fix linter  (mattgotteiner, Apr 2, 2025)
04391b7  Fleshed out readme  (pamelafox, Apr 2, 2025)
fff9c59  Adding reasoning to deploy featureS  (pamelafox, Apr 2, 2025)
cdee1d5  Make changes from pamelas feedback  (pamelafox, Apr 2, 2025)
5d081cc  undangle that comma  (pamelafox, Apr 2, 2025)

Files changed

.azdo/pipelines/azure-dev.yml (1 addition, 0 deletions)
@@ -69,6 +69,7 @@ steps:
AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY: $(AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY)
AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION: $(AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION)
AZURE_OPENAI_CHATGPT_DEPLOYMENT_SKU: $(AZURE_OPENAI_CHATGPT_DEPLOYMENT_SKU)
AZURE_OPENAI_REASONING_EFFORT: $(AZURE_OPENAI_REASONING_EFFORT)
AZURE_OPENAI_EMB_MODEL_NAME: $(AZURE_OPENAI_EMB_MODEL_NAME)
AZURE_OPENAI_EMB_DEPLOYMENT: $(AZURE_OPENAI_EMB_DEPLOYMENT)
AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY: $(AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY)

.github/workflows/azure-dev.yml (1 addition, 0 deletions)
@@ -60,6 +60,7 @@ jobs:
AZURE_OPENAI_CHATGPT_DEPLOYMENT: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT }}
AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY }}
AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION }}
AZURE_OPENAI_REASONING_EFFORT: ${{ vars.AZURE_OPENAI_REASONING_EFFORT }}
AZURE_OPENAI_EMB_MODEL_NAME: ${{ vars.AZURE_OPENAI_EMB_MODEL_NAME }}
AZURE_OPENAI_EMB_DEPLOYMENT: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT }}
AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY }}
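
Both pipeline changes simply pass the new variable through to azd. For a local deployment, the same setting can be supplied on the azd environment like the other OpenAI knobs, for example (the value is illustrative; reasoning deployments accept efforts such as low, medium, or high):

azd env set AZURE_OPENAI_REASONING_EFFORT low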

README.md (1 addition, 0 deletions)
@@ -256,6 +256,7 @@ You can find extensive documentation in the [docs](docs/README.md) folder:
- [All features](docs/deploy_features.md)
- [Login and access control](docs/login_and_acl.md)
- [GPT-4 Turbo with Vision](docs/gpt4v.md)
- [Reasoning](docs/reasoning.md)
- [Private endpoints](docs/deploy_private.md)
- [Sharing deployment environments](docs/sharing_environments.md)
- [Local development](docs/localdev.md)

app/backend/app.py (29 additions, 0 deletions)
@@ -65,11 +65,13 @@
CONFIG_CHAT_HISTORY_COSMOS_ENABLED,
CONFIG_CHAT_VISION_APPROACH,
CONFIG_CREDENTIAL,
CONFIG_DEFAULT_REASONING_EFFORT,
CONFIG_GPT4V_DEPLOYED,
CONFIG_INGESTER,
CONFIG_LANGUAGE_PICKER_ENABLED,
CONFIG_OPENAI_CLIENT,
CONFIG_QUERY_REWRITING_ENABLED,
CONFIG_REASONING_EFFORT_ENABLED,
CONFIG_SEARCH_CLIENT,
CONFIG_SEMANTIC_RANKER_DEPLOYED,
CONFIG_SPEECH_INPUT_ENABLED,
@@ -79,6 +81,7 @@
CONFIG_SPEECH_SERVICE_LOCATION,
CONFIG_SPEECH_SERVICE_TOKEN,
CONFIG_SPEECH_SERVICE_VOICE,
CONFIG_STREAMING_ENABLED,
CONFIG_USER_BLOB_CONTAINER_CLIENT,
CONFIG_USER_UPLOAD_ENABLED,
CONFIG_VECTOR_SEARCH_ENABLED,
@@ -293,6 +296,9 @@ def config():
"showGPT4VOptions": current_app.config[CONFIG_GPT4V_DEPLOYED],
"showSemanticRankerOption": current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED],
"showQueryRewritingOption": current_app.config[CONFIG_QUERY_REWRITING_ENABLED],
"showReasoningEffortOption": current_app.config[CONFIG_REASONING_EFFORT_ENABLED],
"streamingEnabled": current_app.config[CONFIG_STREAMING_ENABLED],
"defaultReasoningEffort": current_app.config[CONFIG_DEFAULT_REASONING_EFFORT],
"showVectorOption": current_app.config[CONFIG_VECTOR_SEARCH_ENABLED],
"showUserUpload": current_app.config[CONFIG_USER_UPLOAD_ENABLED],
"showLanguagePicker": current_app.config[CONFIG_LANGUAGE_PICKER_ENABLED],
@@ -423,6 +429,7 @@ async def setup_clients():
OPENAI_CHATGPT_MODEL = os.environ["AZURE_OPENAI_CHATGPT_MODEL"]
OPENAI_EMB_MODEL = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-ada-002")
OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS") or 1536)
OPENAI_REASONING_EFFORT = os.getenv("AZURE_OPENAI_REASONING_EFFORT")
# Used with Azure OpenAI deployments
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_GPT4V_DEPLOYMENT = os.environ.get("AZURE_OPENAI_GPT4V_DEPLOYMENT")
@@ -640,6 +647,13 @@ async def setup_clients():
current_app.config[CONFIG_QUERY_REWRITING_ENABLED] = (
AZURE_SEARCH_QUERY_REWRITING == "true" and AZURE_SEARCH_SEMANTIC_RANKER != "disabled"
)
current_app.config[CONFIG_DEFAULT_REASONING_EFFORT] = OPENAI_REASONING_EFFORT
current_app.config[CONFIG_REASONING_EFFORT_ENABLED] = OPENAI_CHATGPT_MODEL in Approach.GPT_REASONING_MODELS
current_app.config[CONFIG_STREAMING_ENABLED] = (
bool(USE_GPT4V)
or OPENAI_CHATGPT_MODEL not in Approach.GPT_REASONING_MODELS
or Approach.GPT_REASONING_MODELS[OPENAI_CHATGPT_MODEL].streaming
)
current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] = os.getenv("USE_VECTORS", "").lower() != "false"
current_app.config[CONFIG_USER_UPLOAD_ENABLED] = bool(USE_USER_UPLOAD)
current_app.config[CONFIG_LANGUAGE_PICKER_ENABLED] = ENABLE_LANGUAGE_PICKER
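
The CONFIG_STREAMING_ENABLED gate above is worth reading in isolation: streaming stays available for GPT-4V setups and for any non-reasoning model, and is otherwise governed by per-model support, since o1 cannot stream. A minimal sketch of the same logic, using the GPT_REASONING_MODELS table defined in approach.py below:

from dataclasses import dataclass

@dataclass
class GPTReasoningModelSupport:
    streaming: bool

GPT_REASONING_MODELS = {
    "o1": GPTReasoningModelSupport(streaming=False),
    "o3-mini": GPTReasoningModelSupport(streaming=True),
}

def streaming_enabled(chatgpt_model: str, use_gpt4v: bool) -> bool:
    # GPT-4V never pairs with a reasoning model (enforced at startup), and
    # non-reasoning models always stream; otherwise defer to the table.
    return (
        use_gpt4v
        or chatgpt_model not in GPT_REASONING_MODELS
        or GPT_REASONING_MODELS[chatgpt_model].streaming
    )

assert streaming_enabled("gpt-4o-mini", use_gpt4v=False)  # not a reasoning model
assert not streaming_enabled("o1", use_gpt4v=False)       # o1 cannot stream
assert streaming_enabled("o3-mini", use_gpt4v=False)      # o3-mini can stream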
@@ -667,6 +681,7 @@ async def setup_clients():
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
query_speller=AZURE_SEARCH_QUERY_SPELLER,
prompt_manager=prompt_manager,
reasoning_effort=OPENAI_REASONING_EFFORT,
)

# ChatReadRetrieveReadApproach is used by /chat for multi-turn conversation
@@ -684,12 +699,26 @@ async def setup_clients():
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
query_speller=AZURE_SEARCH_QUERY_SPELLER,
prompt_manager=prompt_manager,
reasoning_effort=OPENAI_REASONING_EFFORT,
)

if USE_GPT4V:
current_app.logger.info("USE_GPT4V is true, setting up GPT4V approach")
if not AZURE_OPENAI_GPT4V_MODEL:
raise ValueError("AZURE_OPENAI_GPT4V_MODEL must be set when USE_GPT4V is true")
if any(
model in Approach.GPT_REASONING_MODELS
for model in [
OPENAI_CHATGPT_MODEL,
AZURE_OPENAI_GPT4V_MODEL,
AZURE_OPENAI_CHATGPT_DEPLOYMENT,
AZURE_OPENAI_GPT4V_DEPLOYMENT,
]
):
raise ValueError(
"AZURE_OPENAI_CHATGPT_MODEL and AZURE_OPENAI_GPT4V_MODEL must not be a reasoning model when USE_GPT4V is true"
)

token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")

current_app.config[CONFIG_ASK_VISION_APPROACH] = RetrieveThenReadVisionApproach(

app/backend/approaches/approach.py (142 additions, 2 deletions)
@@ -6,9 +6,11 @@
AsyncGenerator,
Awaitable,
Callable,
Dict,
List,
Optional,
TypedDict,
Union,
cast,
)
from urllib.parse import urljoin
@@ -21,8 +23,15 @@
VectorizedQuery,
VectorQuery,
)
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageParam
from openai import AsyncOpenAI, AsyncStream
from openai.types import CompletionUsage
from openai.types.chat import (
ChatCompletion,
ChatCompletionChunk,
ChatCompletionMessageParam,
ChatCompletionReasoningEffort,
ChatCompletionToolParam,
)

from approaches.promptmanager import PromptManager
from core.authentication import AuthenticationHelper
@@ -89,8 +98,59 @@ class ThoughtStep:
description: Optional[Any]
props: Optional[dict[str, Any]] = None

def update_token_usage(self, usage: CompletionUsage) -> None:
if self.props:
self.props["token_usage"] = TokenUsageProps.from_completion_usage(usage)


@dataclass
class DataPoints:
text: Optional[List[str]] = None
images: Optional[List] = None


@dataclass
class ExtraInfo:
data_points: DataPoints
thoughts: Optional[List[ThoughtStep]] = None
followup_questions: Optional[List[Any]] = None


@dataclass
class TokenUsageProps:
prompt_tokens: int
completion_tokens: int
reasoning_tokens: Optional[int]
total_tokens: int

@classmethod
def from_completion_usage(cls, usage: CompletionUsage) -> "TokenUsageProps":
return cls(
prompt_tokens=usage.prompt_tokens,
completion_tokens=usage.completion_tokens,
reasoning_tokens=(
usage.completion_tokens_details.reasoning_tokens if usage.completion_tokens_details else None
),
total_tokens=usage.total_tokens,
)
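
To make the conversion concrete, here is how a usage payload maps through from_completion_usage on the TokenUsageProps dataclass above (a sketch; token counts invented, import paths per the openai Python SDK):

from openai.types import CompletionUsage
from openai.types.completion_usage import CompletionTokensDetails

usage = CompletionUsage(
    prompt_tokens=1200,
    completion_tokens=450,
    total_tokens=1650,
    completion_tokens_details=CompletionTokensDetails(reasoning_tokens=320),
)
props = TokenUsageProps.from_completion_usage(usage)
assert props.reasoning_tokens == 320  # None when the API omits reasoning details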


# GPT reasoning models don't support the same set of parameters as other models
# https://learn.microsoft.com/azure/ai-services/openai/how-to/reasoning
@dataclass
class GPTReasoningModelSupport:
streaming: bool


class Approach(ABC):
# Mapping of GPT reasoning models to the features they support
GPT_REASONING_MODELS = {
"o1": GPTReasoningModelSupport(streaming=False),
"o3-mini": GPTReasoningModelSupport(streaming=True),
}
# Set a higher token limit for GPT reasoning models
RESPONSE_DEFAULT_TOKEN_LIMIT = 1024
RESPONSE_REASONING_DEFAULT_TOKEN_LIMIT = 8192

def __init__(
self,
@@ -106,6 +166,8 @@ def __init__(
vision_endpoint: str,
vision_token_provider: Callable[[], Awaitable[str]],
prompt_manager: PromptManager,
reasoning_effort: Optional[str] = None,
include_token_usage: Optional[bool] = None,
):
self.search_client = search_client
self.openai_client = openai_client
@@ -119,6 +181,8 @@ def __init__(
self.vision_endpoint = vision_endpoint
self.vision_token_provider = vision_token_provider
self.prompt_manager = prompt_manager
self.reasoning_effort = reasoning_effort
self.include_token_usage = include_token_usage

def build_filter(self, overrides: dict[str, Any], auth_claims: dict[str, Any]) -> Optional[str]:
include_category = overrides.get("include_category")
@@ -281,6 +345,82 @@ def get_system_prompt_variables(self, override_prompt: Optional[str]) -> dict[st
else:
return {"override_prompt": override_prompt}

def get_response_token_limit(self, model: str) -> int:
if model in self.GPT_REASONING_MODELS:
return self.RESPONSE_REASONING_DEFAULT_TOKEN_LIMIT

return self.RESPONSE_DEFAULT_TOKEN_LIMIT

def create_chat_completion(
self,
chatgpt_deployment: Optional[str],
chatgpt_model: str,
messages: list[ChatCompletionMessageParam],
overrides: dict[str, Any],
should_stream: bool = False,
response_token_limit: Optional[int] = None,
tools: Optional[List[ChatCompletionToolParam]] = None,
temperature: Optional[float] = None,
n: Optional[int] = None,
reasoning_effort: Optional[ChatCompletionReasoningEffort] = None,
) -> Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]:
response_token_limit = response_token_limit or self.get_response_token_limit(chatgpt_model)
if chatgpt_model in self.GPT_REASONING_MODELS:
params: Dict[str, Any] = {
# max_tokens is not supported
"max_completion_tokens": response_token_limit
}

# Adjust parameters for reasoning models
supported_features = self.GPT_REASONING_MODELS[chatgpt_model]
if supported_features.streaming and should_stream:
params["stream"] = True
params["stream_options"] = {"include_usage": True}
params["reasoning_effort"] = reasoning_effort or overrides.get("reasoning_effort") or self.reasoning_effort

else:
# Include parameters that may not be supported for reasoning models
params = {
"max_tokens": response_token_limit,
"temperature": temperature or overrides.get("temperature", 0.3),
}
if should_stream:
params["stream"] = True
params["stream_options"] = {"include_usage": True}

params["tools"] = tools

# Azure OpenAI takes the deployment name as the model name
return self.openai_client.chat.completions.create(
model=chatgpt_deployment if chatgpt_deployment else chatgpt_model,
messages=messages,
seed=overrides.get("seed", None),
n=n or 1,
**params,
)
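
A hypothetical call site shows what the helper buys its callers: no branching on model family, since the method itself picks max_completion_tokens versus max_tokens and temperature, injects reasoning_effort, and requests usage data on streams (sketch only; self stands in for any Approach subclass with the usual attributes):

# Illustrative caller inside an approach; attribute names are assumptions.
chat_coroutine = self.create_chat_completion(
    chatgpt_deployment=self.chatgpt_deployment,
    chatgpt_model=self.chatgpt_model,  # e.g. "o3-mini" or "gpt-4o-mini"
    messages=messages,
    overrides=overrides,  # may carry "reasoning_effort" and "temperature"
    should_stream=True,   # honored only if the model supports streaming
)
response = await chat_coroutine  # ChatCompletion or AsyncStream[ChatCompletionChunk]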

def create_generate_thought_step(
self,
title: str,
messages: List[ChatCompletionMessageParam],
overrides: dict[str, Any],
model: str,
deployment: Optional[str],
usage: Optional[CompletionUsage] = None,
reasoning_effort: Optional[ChatCompletionReasoningEffort] = None,
) -> ThoughtStep:
properties: Dict[str, Any] = {"model": model}
if deployment:
properties["deployment"] = deployment
# Only add reasoning_effort setting if the model supports it
if model in self.GPT_REASONING_MODELS:
properties["reasoning_effort"] = reasoning_effort or overrides.get(
"reasoning_effort", self.reasoning_effort
)
if usage:
properties["token_usage"] = TokenUsageProps.from_completion_usage(usage)
return ThoughtStep(title, messages, properties)

async def run(
self,
messages: list[ChatCompletionMessageParam],

app/backend/approaches/chatapproach.py (32 additions, 7 deletions)
@@ -1,19 +1,29 @@
import json
import re
from abc import ABC, abstractmethod
from typing import Any, AsyncGenerator, Optional
from typing import Any, AsyncGenerator, Awaitable, Optional, Union, cast

from openai.types.chat import ChatCompletion, ChatCompletionMessageParam
from openai import AsyncStream
from openai.types.chat import (
ChatCompletion,
ChatCompletionChunk,
ChatCompletionMessageParam,
)

from approaches.approach import Approach
from approaches.approach import (
Approach,
ExtraInfo,
)


class ChatApproach(Approach, ABC):

NO_RESPONSE = "0"

@abstractmethod
async def run_until_final_call(self, messages, overrides, auth_claims, should_stream) -> tuple:
async def run_until_final_call(
self, messages, overrides, auth_claims, should_stream
) -> tuple[ExtraInfo, Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]]:
pass

def get_search_query(self, chat_completion: ChatCompletion, user_query: str):
@@ -49,12 +59,15 @@ async def run_without_streaming(
extra_info, chat_coroutine = await self.run_until_final_call(
messages, overrides, auth_claims, should_stream=False
)
chat_completion_response: ChatCompletion = await chat_coroutine
chat_completion_response: ChatCompletion = await cast(Awaitable[ChatCompletion], chat_coroutine)
content = chat_completion_response.choices[0].message.content
role = chat_completion_response.choices[0].message.role
if overrides.get("suggest_followup_questions"):
content, followup_questions = self.extract_followup_questions(content)
extra_info["followup_questions"] = followup_questions
extra_info.followup_questions = followup_questions
# Assume last thought is for generating answer
if self.include_token_usage and extra_info.thoughts and chat_completion_response.usage:
extra_info.thoughts[-1].update_token_usage(chat_completion_response.usage)
chat_app_response = {
"message": {"content": content, "role": role},
"context": extra_info,
@@ -72,6 +85,7 @@ async def run_with_streaming(
extra_info, chat_coroutine = await self.run_until_final_call(
messages, overrides, auth_claims, should_stream=True
)
chat_coroutine = cast(Awaitable[AsyncStream[ChatCompletionChunk]], chat_coroutine)
yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state}

followup_questions_started = False
@@ -80,6 +94,7 @@
# "2023-07-01-preview" API version has a bug where first response has empty choices
event = event_chunk.model_dump() # Convert pydantic model to dict
if event["choices"]:
# No usage during streaming
completion = {
"delta": {
"content": event["choices"][0]["delta"].get("content"),
@@ -100,9 +115,19 @@
followup_content += content
else:
yield completion
else:
# Final chunk at end of streaming should contain usage
# https://cookbook.openai.com/examples/how_to_stream_completions#4-how-to-get-token-usage-data-for-streamed-chat-completion-response
if event_chunk.usage and extra_info.thoughts and self.include_token_usage:
extra_info.thoughts[-1].update_token_usage(event_chunk.usage)
yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state}

if followup_content:
_, followup_questions = self.extract_followup_questions(followup_content)
yield {"delta": {"role": "assistant"}, "context": {"followup_questions": followup_questions}}
yield {
"delta": {"role": "assistant"},
"context": {"context": extra_info, "followup_questions": followup_questions},
}
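
The final-chunk branch above leans on the OpenAI streaming contract: when stream_options={"include_usage": True} is set, the last chunk arrives with an empty choices list and a populated usage field. A standalone sketch of that consumption pattern (model name and client setup are illustrative):

import asyncio
from openai import AsyncOpenAI

async def main() -> None:
    client = AsyncOpenAI()  # assumes OPENAI_API_KEY is set in the environment
    stream = await client.chat.completions.create(
        model="o3-mini",
        messages=[{"role": "user", "content": "hello"}],
        stream=True,
        stream_options={"include_usage": True},
    )
    async for chunk in stream:
        if chunk.choices:
            print(chunk.choices[0].delta.content or "", end="")
        elif chunk.usage:
            # Final chunk: no choices, only the aggregate token counts.
            print(f"\ntokens: prompt={chunk.usage.prompt_tokens},"
                  f" completion={chunk.usage.completion_tokens}")

asyncio.run(main())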

async def run(
self,