Commit 8105c5c
feat: Make suggest next questions configurable (#275)
Co-authored-by: Marcus Schiesser <[email protected]>
1 parent c16deed commit 8105c5c

File tree: 12 files changed (+182, -320 lines)


.changeset/cyan-buttons-clean.md

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+Add env config for next questions feature

helpers/env-variables.ts

Lines changed: 23 additions & 26 deletions

@@ -487,33 +487,30 @@ It\\'s cute animal.
 };
 
 const getTemplateEnvs = (template?: TemplateType): EnvVar[] => {
-  if (template === "multiagent") {
-    return [
-      {
-        name: "MESSAGE_QUEUE_PORT",
-      },
-      {
-        name: "CONTROL_PLANE_PORT",
-      },
-      {
-        name: "HUMAN_CONSUMER_PORT",
-      },
-      {
-        name: "AGENT_QUERY_ENGINE_PORT",
-        value: "8003",
-      },
-      {
-        name: "AGENT_QUERY_ENGINE_DESCRIPTION",
-        value: "Query information from the provided data",
-      },
-      {
-        name: "AGENT_DUMMY_PORT",
-        value: "8004",
-      },
-    ];
-  } else {
-    return [];
+  const nextQuestionEnvs: EnvVar[] = [
+    {
+      name: "NEXT_QUESTION_PROMPT",
+      description: `Customize prompt to generate the next question suggestions based on the conversation history.
+Disable this prompt to disable the next question suggestions feature.`,
+      value: `"You're a helpful assistant! Your task is to suggest the next question that user might ask.
+Here is the conversation history
+---------------------
+{conversation}
+---------------------
+Given the conversation history, please give me 3 questions that you might ask next!
+Your answer should be wrapped in three sticks which follows the following format:
+\`\`\`
+<question 1>
+<question 2>
+<question 3>
+\`\`\`"`,
+    },
+  ];
+
+  if (template === "multiagent" || template === "streaming") {
+    return nextQuestionEnvs;
   }
+  return [];
 };
 
 const getObservabilityEnvs = (
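
For orientation, here is a minimal sketch (not part of the diff) of how the generated Python backend treats this variable, mirroring the suggestion service added later in this commit: an unset or empty NEXT_QUESTION_PROMPT disables the feature, otherwise the value is used as a prompt template with a single {conversation} placeholder.

import os

from llama_index.core.prompts import PromptTemplate

# Illustrative sketch only: mirrors the behavior of the suggestion service.
prompt = os.getenv("NEXT_QUESTION_PROMPT", None)
if not prompt:
    print("Next question suggestions are disabled")
else:
    template = PromptTemplate(prompt)
    # {conversation} is the only placeholder the template is expected to contain.
    print(template.format(conversation="User: Hi\nAssistant: Hello! How can I help?"))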

helpers/python.ts

Lines changed: 7 additions & 0 deletions

@@ -395,6 +395,13 @@ export const installPythonTemplate = async ({
     cwd: path.join(compPath, "settings", "python"),
   });
 
+  // Copy services
+  if (template == "streaming" || template == "multiagent") {
+    await copy("**", path.join(root, "app", "api", "services"), {
+      cwd: path.join(compPath, "services", "python"),
+    });
+  }
+
   if (template === "streaming") {
     // For the streaming template only:
     // Select and copy engine code based on data sources and tools

templates/components/llamaindex/typescript/streaming/suggestion.ts

Lines changed: 8 additions & 20 deletions

@@ -1,32 +1,20 @@
 import { ChatMessage, Settings } from "llamaindex";
 
-const NEXT_QUESTION_PROMPT_TEMPLATE = `You're a helpful assistant! Your task is to suggest the next question that user might ask.
-Here is the conversation history
----------------------
-$conversation
----------------------
-Given the conversation history, please give me $number_of_questions questions that you might ask next!
-Your answer should be wrapped in three sticks which follows the following format:
-\`\`\`
-<question 1>
-<question 2>\`\`\`
-`;
-const N_QUESTIONS_TO_GENERATE = 3;
-
-export async function generateNextQuestions(
-  conversation: ChatMessage[],
-  numberOfQuestions: number = N_QUESTIONS_TO_GENERATE,
-) {
+export async function generateNextQuestions(conversation: ChatMessage[]) {
   const llm = Settings.llm;
+  const NEXT_QUESTION_PROMPT = process.env.NEXT_QUESTION_PROMPT;
+  if (!NEXT_QUESTION_PROMPT) {
+    return [];
+  }
 
   // Format conversation
   const conversationText = conversation
     .map((message) => `${message.role}: ${message.content}`)
     .join("\n");
-  const message = NEXT_QUESTION_PROMPT_TEMPLATE.replace(
-    "$conversation",
+  const message = NEXT_QUESTION_PROMPT.replace(
+    "{conversation}",
     conversationText,
-  ).replace("$number_of_questions", numberOfQuestions.toString());
+  );
 
   try {
     const response = await llm.complete({ prompt: message });
Lines changed: 78 additions & 0 deletions

@@ -0,0 +1,78 @@
+import logging
+import os
+import re
+from typing import List, Optional
+
+from app.api.routers.models import Message
+from llama_index.core.prompts import PromptTemplate
+from llama_index.core.settings import Settings
+
+logger = logging.getLogger("uvicorn")
+
+
+class NextQuestionSuggestion:
+    """
+    Suggest the next questions that user might ask based on the conversation history
+    Disable this feature by removing the NEXT_QUESTION_PROMPT environment variable
+    """
+
+    @classmethod
+    def get_configured_prompt(cls) -> Optional[str]:
+        prompt = os.getenv("NEXT_QUESTION_PROMPT", None)
+        if not prompt:
+            return None
+        return PromptTemplate(prompt)
+
+    @classmethod
+    async def suggest_next_questions_all_messages(
+        cls,
+        messages: List[Message],
+    ) -> Optional[List[str]]:
+        """
+        Suggest the next questions that user might ask based on the conversation history
+        Return None if suggestion is disabled or there is an error
+        """
+        prompt_template = cls.get_configured_prompt()
+        if not prompt_template:
+            return None
+
+        try:
+            # Reduce the cost by only using the last two messages
+            last_user_message = None
+            last_assistant_message = None
+            for message in reversed(messages):
+                if message.role == "user":
+                    last_user_message = f"User: {message.content}"
+                elif message.role == "assistant":
+                    last_assistant_message = f"Assistant: {message.content}"
+                if last_user_message and last_assistant_message:
+                    break
+            conversation: str = f"{last_user_message}\n{last_assistant_message}"
+
+            # Call the LLM and parse questions from the output
+            prompt = prompt_template.format(conversation=conversation)
+            output = await Settings.llm.acomplete(prompt)
+            questions = cls._extract_questions(output.text)
+
+            return questions
+        except Exception as e:
+            logger.error(f"Error when generating next question: {e}")
+            return None
+
+    @classmethod
+    def _extract_questions(cls, text: str) -> List[str]:
+        content_match = re.search(r"```(.*?)```", text, re.DOTALL)
+        content = content_match.group(1) if content_match else ""
+        return content.strip().split("\n")
+
+    @classmethod
+    async def suggest_next_questions(
+        cls,
+        chat_history: List[Message],
+        response: str,
+    ) -> List[str]:
+        """
+        Suggest the next questions that user might ask based on the chat history and the last response
+        """
+        messages = chat_history + [Message(role="assistant", content=response)]
+        return await cls.suggest_next_questions_all_messages(messages)
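
A rough usage sketch for the new suggestion service (illustrative only; it assumes NEXT_QUESTION_PROMPT is set and Settings.llm is configured, as it is in the generated app):

import asyncio

from app.api.routers.models import Message
from app.api.services.suggestion import NextQuestionSuggestion


async def main():
    chat_history = [Message(role="user", content="What does this project do?")]
    last_response = "It scaffolds a LlamaIndex chat application."
    # Returns None when the feature is disabled or the LLM call fails,
    # otherwise the questions parsed from the ```-wrapped block in the output.
    questions = await NextQuestionSuggestion.suggest_next_questions(
        chat_history, last_response
    )
    print(questions)


asyncio.run(main())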

templates/types/multiagent/fastapi/app/api/routers/vercel_response.py

Lines changed: 38 additions & 17 deletions

@@ -1,15 +1,15 @@
-from asyncio import Task
 import json
 import logging
-from typing import AsyncGenerator
+from asyncio import Task
+from typing import AsyncGenerator, List
 
 from aiostream import stream
+from app.agents.single import AgentRunEvent, AgentRunResult
+from app.api.routers.models import ChatData, Message
+from app.api.services.suggestion import NextQuestionSuggestion
 from fastapi import Request
 from fastapi.responses import StreamingResponse
 
-from app.api.routers.models import ChatData
-from app.agents.single import AgentRunEvent, AgentRunResult
-
 logger = logging.getLogger("uvicorn")
 
 
@@ -57,26 +57,35 @@ async def content_generator(
         # Yield the text response
         async def _chat_response_generator():
             result = await task
+            final_response = ""
 
             if isinstance(result, AgentRunResult):
                 for token in result.response.message.content:
-                    yield VercelStreamResponse.convert_text(token)
+                    final_response += token
+                    yield cls.convert_text(token)
 
             if isinstance(result, AsyncGenerator):
                 async for token in result:
-                    yield VercelStreamResponse.convert_text(token.delta)
+                    final_response += token.delta
+                    yield cls.convert_text(token.delta)
+
+            # Generate next questions if next question prompt is configured
+            question_data = await cls._generate_next_questions(
+                chat_data.messages, final_response
+            )
+            if question_data:
+                yield cls.convert_data(question_data)
 
-            # TODO: stream NextQuestionSuggestion
             # TODO: stream sources
 
         # Yield the events from the event handler
         async def _event_generator():
            async for event in events():
-                event_response = _event_to_response(event)
+                event_response = cls._event_to_response(event)
                if verbose:
                    logger.debug(event_response)
                if event_response is not None:
-                    yield VercelStreamResponse.convert_data(event_response)
+                    yield cls.convert_data(event_response)
 
         combine = stream.merge(_chat_response_generator(), _event_generator())
 
@@ -85,16 +94,28 @@ async def _event_generator():
             if not is_stream_started:
                 is_stream_started = True
                 # Stream a blank message to start the stream
-                yield VercelStreamResponse.convert_text("")
+                yield cls.convert_text("")
 
             async for output in streamer:
                 yield output
                 if await request.is_disconnected():
                     break
 
-
-def _event_to_response(event: AgentRunEvent) -> dict:
-    return {
-        "type": "agent",
-        "data": {"agent": event.name, "text": event.msg},
-    }
+    @staticmethod
+    def _event_to_response(event: AgentRunEvent) -> dict:
+        return {
+            "type": "agent",
+            "data": {"agent": event.name, "text": event.msg},
+        }
+
+    @staticmethod
+    async def _generate_next_questions(chat_history: List[Message], response: str):
+        questions = await NextQuestionSuggestion.suggest_next_questions(
+            chat_history, response
+        )
+        if questions:
+            return {
+                "type": "suggested_questions",
+                "data": questions,
+            }
+        return None
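
When questions are generated, one extra data part is appended to the stream after the text tokens. Its shape, as built by _generate_next_questions above, is roughly the following (the question strings are made up for illustration):

# Example payload yielded via cls.convert_data(...) after the text stream.
{
    "type": "suggested_questions",
    "data": [
        "Can I disable the next question suggestions?",
        "How do I customize NEXT_QUESTION_PROMPT?",
        "Which templates support this feature?",
    ],
}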

templates/types/multiagent/fastapi/app/api/services/suggestion.py

Lines changed: 0 additions & 60 deletions
This file was deleted.
