feat: Make suggest next questions configurable #275


Merged (10 commits, Sep 9, 2024). Showing changes from 8 commits.
5 changes: 5 additions & 0 deletions .changeset/cyan-buttons-clean.md
@@ -0,0 +1,5 @@
---
"create-llama": patch
---

Add env config for next questions feature
49 changes: 23 additions & 26 deletions helpers/env-variables.ts
@@ -487,33 +487,30 @@ It\\'s cute animal.
 };
 
 const getTemplateEnvs = (template?: TemplateType): EnvVar[] => {
-  if (template === "multiagent") {
-    return [
-      {
-        name: "MESSAGE_QUEUE_PORT",
-      },
-      {
-        name: "CONTROL_PLANE_PORT",
-      },
-      {
-        name: "HUMAN_CONSUMER_PORT",
-      },
-      {
-        name: "AGENT_QUERY_ENGINE_PORT",
-        value: "8003",
-      },
-      {
-        name: "AGENT_QUERY_ENGINE_DESCRIPTION",
-        value: "Query information from the provided data",
-      },
-      {
-        name: "AGENT_DUMMY_PORT",
-        value: "8004",
-      },
-    ];
-  } else {
-    return [];
+  const nextQuestionEnvs: EnvVar[] = [
+    {
+      name: "NEXT_QUESTION_PROMPT",
+      description: `Customize prompt to generate the next question suggestions based on the conversation history.
+Disable this prompt to disable the next question suggestions feature.`,
+      value: `"You're a helpful assistant! Your task is to suggest the next question that user might ask.
+Here is the conversation history
+---------------------
+$conversation
+---------------------
+Given the conversation history, please give me 3 questions that you might ask next!
+Your answer should be wrapped in three sticks which follows the following format:
+\`\`\`
+<question 1>
+<question 2>
+<question 3>
+\`\`\`"`,
+    },
+  ];
+
+  if (template === "multiagent" || template === "streaming") {
+    return nextQuestionEnvs;
   }
+  return [];
 };
 
 const getObservabilityEnvs = (
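
The NEXT_QUESTION_PROMPT variable is both the prompt text and the feature flag: the generated .env ships the default prompt above, and removing or emptying the variable turns the suggestions off. Below is a minimal standalone sketch of that gate, mirroring the TypeScript helper changed further down in this diff (the Python service instead wraps the same variable in a PromptTemplate); the build_suggestion_prompt name is an illustration, not part of the PR.

```python
# Sketch only: standalone illustration of the NEXT_QUESTION_PROMPT gate.
# `build_suggestion_prompt` is a hypothetical helper, not part of this PR.
import os
from typing import List, Optional, Tuple


def build_suggestion_prompt(conversation: List[Tuple[str, str]]) -> Optional[str]:
    """Return the filled-in prompt, or None when the feature is disabled."""
    template = os.getenv("NEXT_QUESTION_PROMPT")
    if not template:
        # Unset or empty variable disables next-question suggestions.
        return None
    history = "\n".join(f"{role}: {content}" for role, content in conversation)
    # The generated default uses a $conversation placeholder.
    return template.replace("$conversation", history)


if __name__ == "__main__":
    prompt = build_suggestion_prompt(
        [("user", "What is create-llama?"), ("assistant", "A CLI that scaffolds LlamaIndex apps.")]
    )
    print(prompt or "Next-question suggestions are disabled")
```
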
7 changes: 7 additions & 0 deletions helpers/python.ts
@@ -395,6 +395,13 @@ export const installPythonTemplate = async ({
    cwd: path.join(compPath, "settings", "python"),
  });

  // Copy services
  if (template == "streaming" || template == "multiagent") {
    await copy("**", path.join(root, "app", "api", "services"), {
      cwd: path.join(compPath, "services", "python"),
    });
  }

  if (template === "streaming") {
    // For the streaming template only:
    // Select and copy engine code based on data sources and tools
26 changes: 7 additions & 19 deletions templates/components/llamaindex/typescript/streaming/suggestion.ts
@@ -1,32 +1,20 @@
 import { ChatMessage, Settings } from "llamaindex";
 
-const NEXT_QUESTION_PROMPT_TEMPLATE = `You're a helpful assistant! Your task is to suggest the next question that user might ask.
-Here is the conversation history
----------------------
-$conversation
----------------------
-Given the conversation history, please give me $number_of_questions questions that you might ask next!
-Your answer should be wrapped in three sticks which follows the following format:
-\`\`\`
-<question 1>
-<question 2>\`\`\`
-`;
-const N_QUESTIONS_TO_GENERATE = 3;
-
-export async function generateNextQuestions(
-  conversation: ChatMessage[],
-  numberOfQuestions: number = N_QUESTIONS_TO_GENERATE,
-) {
+export async function generateNextQuestions(conversation: ChatMessage[]) {
   const llm = Settings.llm;
+  const NEXT_QUESTION_PROMPT = process.env.NEXT_QUESTION_PROMPT;
+  if (!NEXT_QUESTION_PROMPT) {
+    return [];
+  }
 
   // Format conversation
   const conversationText = conversation
     .map((message) => `${message.role}: ${message.content}`)
     .join("\n");
-  const message = NEXT_QUESTION_PROMPT_TEMPLATE.replace(
+  const message = NEXT_QUESTION_PROMPT.replace(
     "$conversation",
     conversationText,
-  ).replace("$number_of_questions", numberOfQuestions.toString());
+  );
 
   try {
     const response = await llm.complete({ prompt: message });
File renamed without changes.
66 changes: 66 additions & 0 deletions templates/components/services/python/suggestion.py
@@ -0,0 +1,66 @@
import logging
import os
import re
from typing import List, Optional

from app.api.routers.models import Message
from llama_index.core.prompts import PromptTemplate
from llama_index.core.settings import Settings

logger = logging.getLogger("uvicorn")


class NextQuestionSuggestion:
    """
    Suggest the next questions that user might ask based on the conversation history
    Disable this feature by removing the NEXT_QUESTION_PROMPT environment variable
    """

    @classmethod
    def get_configured_prompt(cls) -> Optional[str]:
        prompt = os.getenv("NEXT_QUESTION_PROMPT", None)
        if not prompt:
            return None
        return PromptTemplate(prompt)

    @classmethod
    async def suggest_next_questions(
        cls,
        messages: List[Message],
    ) -> Optional[List[str]]:
        """
        Suggest the next questions that user might ask based on the conversation history
        Return None if suggestion is disabled or there is an error
        """
        prompt_template = cls.get_configured_prompt()
        if not prompt_template:
            return None

        try:
            # Reduce the cost by only using the last two messages
            last_user_message = None
            last_assistant_message = None
            for message in reversed(messages):
                if message.role == "user":
                    last_user_message = f"User: {message.content}"
                elif message.role == "assistant":
                    last_assistant_message = f"Assistant: {message.content}"
                if last_user_message and last_assistant_message:
                    break
            conversation: str = f"{last_user_message}\n{last_assistant_message}"

            # Call the LLM and parse questions from the output
            prompt = prompt_template.format(conversation=conversation)
            output = await Settings.llm.acomplete(prompt)
            questions = cls._extract_questions(output.text)

            return questions
        except Exception as e:
            logger.error(f"Error when generating next question: {e}")
            return None

    @classmethod
    def _extract_questions(cls, text: str) -> List[str]:
        content_match = re.search(r"```(.*?)```", text, re.DOTALL)
        content = content_match.group(1) if content_match else ""
        return content.strip().split("\n")
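
A usage sketch for the service above, assuming it runs inside a generated FastAPI backend (so app.api.routers.models.Message is importable and Settings.llm is configured) and that NEXT_QUESTION_PROMPT is set; the demo function is illustrative only.

```python
# Sketch: exercising NextQuestionSuggestion outside the streaming handlers.
import asyncio

from app.api.routers.models import Message
from app.api.services.suggestion import NextQuestionSuggestion


async def demo() -> None:
    history = [
        Message(role="user", content="How do I index my own documents?"),
        Message(role="assistant", content="Put them in the data folder and run the generate script."),
    ]
    questions = await NextQuestionSuggestion.suggest_next_questions(history)
    # None when NEXT_QUESTION_PROMPT is unset (feature disabled) or on error;
    # otherwise a list of question strings parsed from the fenced block.
    print(questions)


if __name__ == "__main__":
    asyncio.run(demo())
```
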
@@ -1,15 +1,15 @@
-from asyncio import Task
 import json
 import logging
+from asyncio import Task
 from typing import AsyncGenerator
 
 from aiostream import stream
+from app.agents.single import AgentRunEvent, AgentRunResult
+from app.api.routers.models import ChatData, Message
+from app.api.services.suggestion import NextQuestionSuggestion
 from fastapi import Request
 from fastapi.responses import StreamingResponse
 
-from app.api.routers.models import ChatData
-from app.agents.single import AgentRunEvent, AgentRunResult
-
 logger = logging.getLogger("uvicorn")
 

@@ -57,16 +57,32 @@ async def content_generator(
     # Yield the text response
     async def _chat_response_generator():
         result = await task
+        final_response = ""
 
         if isinstance(result, AgentRunResult):
             for token in result.response.message.content:
                 yield VercelStreamResponse.convert_text(token)
 
         if isinstance(result, AsyncGenerator):
             async for token in result:
+                final_response += token.delta
                 yield VercelStreamResponse.convert_text(token.delta)
 
-        # TODO: stream NextQuestionSuggestion
+        # Generate next questions if next question prompt is configured
+        if NextQuestionSuggestion.get_configured_prompt() is not None:
+            conversation = chat_data.messages + [
+                Message(role="assistant", content=final_response)
+            ]
+            questions = await NextQuestionSuggestion.suggest_next_questions(
+                conversation
+            )
+            if questions:
+                yield VercelStreamResponse.convert_data(
+                    {
+                        "type": "suggested_questions",
+                        "data": questions,
+                    }
+                )
         # TODO: stream sources
 
         # Yield the events from the event handler
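
The handler above (the multiagent variant, judging by the app.agents.single import) now accumulates streamed deltas into final_response so the full assistant turn can be appended to the conversation, and it only calls the suggestion service when the prompt is configured. The extra stream item is a plain dict; a small sketch of how a consumer might dispatch on it (handle_stream_data is an assumed name, not part of the PR):

```python
# Sketch: dispatching on the "suggested_questions" payload emitted above.
from typing import Any, Dict, List


def handle_stream_data(event: Dict[str, Any]) -> None:
    if event.get("type") == "suggested_questions":
        questions: List[str] = event["data"]
        for question in questions:
            print(f"Suggested follow-up: {question}")


handle_stream_data(
    {
        "type": "suggested_questions",
        "data": ["Which agents are available?", "How do I add a new tool?"],
    }
)
```
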
@@ -56,20 +56,21 @@ async def _chat_response_generator():
             final_response += token
             yield VercelStreamResponse.convert_text(token)
 
-        # Generate questions that user might interested to
-        conversation = chat_data.messages + [
-            Message(role="assistant", content=final_response)
-        ]
-        questions = await NextQuestionSuggestion.suggest_next_questions(
-            conversation
-        )
-        if len(questions) > 0:
-            yield VercelStreamResponse.convert_data(
-                {
-                    "type": "suggested_questions",
-                    "data": questions,
-                }
-            )
+        # Generate next questions if next question prompt is configured
+        if NextQuestionSuggestion.get_configured_prompt() is not None:
+            conversation = chat_data.messages + [
+                Message(role="assistant", content=final_response)
+            ]
+            questions = await NextQuestionSuggestion.suggest_next_questions(
+                conversation
+            )
+            if questions:
+                yield VercelStreamResponse.convert_data(
+                    {
+                        "type": "suggested_questions",
+                        "data": questions,
+                    }
+                )
 
         # the text_generator is the leading stream, once it's finished, also finish the event stream
         event_handler.is_done = True
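
The streaming template gets the same guard: the suggestion step used to run unconditionally and is now skipped entirely when NEXT_QUESTION_PROMPT is unset. A pytest-style sketch of the on/off behaviour and the output parser, assuming the generated backend package (app.api.services) is importable:

```python
# Sketch: pytest-style checks for the NEXT_QUESTION_PROMPT gate and the parser.
from app.api.services.suggestion import NextQuestionSuggestion


def test_disabled_without_prompt(monkeypatch):
    monkeypatch.delenv("NEXT_QUESTION_PROMPT", raising=False)
    assert NextQuestionSuggestion.get_configured_prompt() is None


def test_enabled_with_prompt(monkeypatch):
    monkeypatch.setenv("NEXT_QUESTION_PROMPT", "Suggest questions for: $conversation")
    assert NextQuestionSuggestion.get_configured_prompt() is not None


def test_extract_questions_parses_fenced_block():
    text = "```\nQ1\nQ2\nQ3\n```"
    assert NextQuestionSuggestion._extract_questions(text) == ["Q1", "Q2", "Q3"]
```
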
60 changes: 0 additions & 60 deletions templates/types/streaming/fastapi/app/api/services/suggestion.py

This file was deleted.

2 changes: 1 addition & 1 deletion templates/types/streaming/fastapi/pyproject.toml
@@ -14,8 +14,8 @@ fastapi = "^0.109.1"
 uvicorn = { extras = ["standard"], version = "^0.23.2" }
 python-dotenv = "^1.0.0"
 aiostream = "^0.5.2"
-llama-index = "0.11.6"
 cachetools = "^5.3.3"
+llama-index = "0.11.6"
 
 [build-system]
 requires = ["poetry-core"]