
Ragas Evaluation Metrics #787


Merged: 34 commits, Oct 10, 2024

Commits
42cc09d
added Multi modes selection
kartikpersistent Oct 1, 2024
9cae794
ragas eval
vasanthasaikalluri Oct 1, 2024
a6c1a68
added response
vasanthasaikalluri Oct 1, 2024
13ef4d2
multimodes state management
kartikpersistent Oct 1, 2024
2bd2aa3
fix: state handling of chat details of the default mode
kartikpersistent Oct 1, 2024
e860223
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
kartikpersistent Oct 1, 2024
1aa340b
Added the ChatModeSwitch Component
kartikpersistent Oct 1, 2024
8e3464a
modes switch state management
kartikpersistent Oct 1, 2024
5250c11
added the chatmodes switch in both view
kartikpersistent Oct 3, 2024
90e8540
removed the copied text
kartikpersistent Oct 3, 2024
b543210
Handled the error scenario
kartikpersistent Oct 3, 2024
8a3c046
fix: speech issue between modes
kartikpersistent Oct 3, 2024
839536e
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
kartikpersistent Oct 3, 2024
3d7f931
ragas evaluation metric show
kaustubh-darekar Oct 3, 2024
72f607c
Merge branch 'Multiple-chat-modes-selection' of https://github.com/ne…
kartikpersistent Oct 3, 2024
cd70cf8
Output return type changed
kaustubh-darekar Oct 3, 2024
6e29edb
Merge branch 'ragas_integration' of https://github.com/neo4j-labs/llm…
kaustubh-darekar Oct 3, 2024
132ea13
fix: Handled activespeech speech and othermessage modes switch
kartikpersistent Oct 3, 2024
7ce6925
used requestanimationframe instead of setTimeOut
kartikpersistent Oct 3, 2024
65e1b97
removed the commented code
kartikpersistent Oct 3, 2024
8524888
Merge branch 'Multiple-chat-modes-selection' of https://github.com/ne…
kartikpersistent Oct 4, 2024
d5fd219
Merge branch 'ragas_integration' of https://github.com/neo4j-labs/llm…
kartikpersistent Oct 4, 2024
00e7eb0
Added ragas to requirements
a-s-poorna Oct 4, 2024
3cff860
Integrated the metric api
kartikpersistent Oct 4, 2024
b62593e
Merge branch 'ragas_integration' of https://github.com/neo4j-labs/llm…
kartikpersistent Oct 4, 2024
c1456cf
ragas response updated, llm list updated
kaustubh-darekar Oct 4, 2024
0e47dd9
resolved syntax error in score
kaustubh-darekar Oct 4, 2024
c91bc45
Added the Metrics Table
kartikpersistent Oct 7, 2024
030474b
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
kartikpersistent Oct 7, 2024
6565d7b
fix: Long text UI Issue
kartikpersistent Oct 8, 2024
a034bf2
code optimization for evaluation
kaustubh-darekar Oct 9, 2024
66dc13e
added the download button for downloading the info
kartikpersistent Oct 9, 2024
97696b0
key name change
kartikpersistent Oct 10, 2024
09f9b60
Optimized the downloadClickHandler
kartikpersistent Oct 10, 2024
1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -179,4 +179,5 @@
PyMuPDF==1.24.5
pypandoc==1.13
graphdatascience==1.10
Secweb==1.11.0
ragas==0.1.14

20 changes: 19 additions & 1 deletion backend/score.py
@@ -33,6 +33,8 @@
from Secweb.XContentTypeOptions import XContentTypeOptions
from Secweb.XFrameOptions import XFrame

from src.ragas_eval import *

logger = CustomLogger()
CHUNK_DIR = os.path.join(os.path.dirname(__file__), "chunks")
MERGED_DIR = os.path.join(os.path.dirname(__file__), "merged_files")
@@ -711,7 +713,23 @@ async def retry_processing(uri=Form(), userName=Form(), password=Form(), databas
logging.exception(f'{error_message}')
return create_api_response(job_status, message=message, error=error_message)
finally:
gc.collect()
gc.collect()

@app.post('/metric')
async def calculate_metric(question=Form(),context=Form(),answer=Form(),model=Form()):
try:
result = await asyncio.to_thread(get_ragas_metrics,question,context,answer,model)
if result is None:
return create_api_response('Failed', message='Failed to calculate metrics.',error="Ragas evaluation returned null")
return create_api_response('Success',data=result,message="Metrics calculated successfully")
except Exception as e:
job_status = "Failed"
message="Error while calculating evaluation metrics"
error_message = str(e)
logging.exception(f'{error_message}')
return create_api_response(job_status, message=message, error=error_message)
finally:
gc.collect()

if __name__ == "__main__":
uvicorn.run(app)
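
For reference, the new /metric endpoint takes form-encoded question, context, answer, and model fields and returns the Ragas scores in the response data. Below is a minimal client-side sketch, assuming a local backend on port 8000 and that create_api_response produces a JSON body with status, data, and message keys; the sample question, context, answer, and model name are illustrative.

import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment; adjust to your backend

payload = {
    "question": "What is Neo4j?",
    "context": "Neo4j is a native graph database that stores data as nodes and relationships.",
    "answer": "Neo4j is a graph database.",
    "model": "openai-gpt-4o",  # must be a model name handled by get_ragas_llm
}

# The endpoint reads FastAPI Form() fields, so send form data rather than JSON.
response = requests.post(f"{BASE_URL}/metric", data=payload)
result = response.json()

if result.get("status") == "Success":
    print(result["data"])  # e.g. {"faithfulness": ..., "answer_relevancy": ..., "context_utilization": ...}
else:
    print(result.get("message"), result.get("error"))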
43 changes: 34 additions & 9 deletions backend/src/QA_integration.py
@@ -22,7 +22,8 @@
from langchain_text_splitters import TokenTextSplitter
from langchain_core.messages import HumanMessage, AIMessage
from langchain.chains import GraphCypherQAChain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.callbacks import StdOutCallbackHandler, BaseCallbackHandler

# LangChain chat models
from langchain_openai import ChatOpenAI, AzureChatOpenAI
@@ -38,13 +39,12 @@
from src.shared.common_fn import load_embedding_model
from src.shared.constants import *
from src.graphDB_dataAccess import graphDBdataAccess
from src.ragas_eval import get_ragas_metrics
load_dotenv()

EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')
EMBEDDING_FUNCTION , _ = load_embedding_model(EMBEDDING_MODEL)



class SessionChatHistory:
history_dict = {}

@@ -58,6 +58,17 @@ def get_chat_history(cls, session_id):
logging.info(f"Retrieved existing ChatMessageHistory Local for session ID: {session_id}")
return cls.history_dict[session_id]

class CustomCallback(BaseCallbackHandler):

def __init__(self):
self.transformed_question = None

def on_llm_end(
self,response, **kwargs: Any
) -> None:
logging.info("question transformed")
self.transformed_question = response.generations[0][0].text.strip()

def get_history_by_session_id(session_id):
try:
return SessionChatHistory.get_chat_history(session_id)
@@ -250,21 +261,25 @@ def process_documents(docs, question, messages, llm, model,chat_mode_settings):
logging.error(f"Error processing documents: {e}")
raise

return content, result, total_tokens
return content, result, total_tokens, formatted_docs

def retrieve_documents(doc_retriever, messages):

start_time = time.time()
try:
docs = doc_retriever.invoke({"messages": messages})
handler = CustomCallback()
docs = doc_retriever.invoke({"messages": messages},{"callbacks":[handler]})
transformed_question = handler.transformed_question
if transformed_question:
logging.info(f"Transformed question : {transformed_question}")
doc_retrieval_time = time.time() - start_time
logging.info(f"Documents retrieved in {doc_retrieval_time:.2f} seconds")

except Exception as e:
logging.error(f"Error retrieving documents: {e}")
raise

return docs
return docs,transformed_question

def create_document_retriever_chain(llm, retriever):
try:
@@ -408,14 +423,19 @@ def process_chat_response(messages, history, question, model, graph, document_na
try:
llm, doc_retriever, model_version = setup_chat(model, graph, document_names, chat_mode_settings)

docs = retrieve_documents(doc_retriever, messages)
docs,transformed_question = retrieve_documents(doc_retriever, messages)

if docs:
content, result, total_tokens = process_documents(docs, question, messages, llm, model, chat_mode_settings)
content, result, total_tokens,formatted_docs = process_documents(docs, question, messages, llm, model, chat_mode_settings)
else:
content = "I couldn't find any relevant documents to answer your question."
result = {"sources": list(), "nodedetails": list(), "entities": list()}
total_tokens = 0
formatted_docs = ""

question = transformed_question if transformed_question else question
# metrics = get_ragas_metrics(question,formatted_docs,content)
# print(metrics)

ai_response = AIMessage(content=content)
messages.append(ai_response)
@@ -424,19 +444,22 @@ def process_chat_response(messages, history, question, model, graph, document_na
summarization_thread.start()
logging.info("Summarization thread started.")
# summarize_and_log(history, messages, llm)

metric_details = {"question":question,"contexts":formatted_docs,"answer":content}
return {
"session_id": "",
"message": content,
"info": {
# "metrics" : metrics,
"sources": result["sources"],
"model": model_version,
"nodedetails": result["nodedetails"],
"total_tokens": total_tokens,
"response_time": 0,
"mode": chat_mode_settings["mode"],
"entities": result["entities"],
"metric_details": metric_details,
},

"user": "chatbot"
}

@@ -446,13 +469,15 @@ def process_chat_response(messages, history, question, model, graph, document_na
"session_id": "",
"message": "Something went wrong",
"info": {
"metrics" : [],
"sources": [],
"nodedetails": [],
"total_tokens": 0,
"response_time": 0,
"error": f"{type(e).__name__}: {str(e)}",
"mode": chat_mode_settings["mode"],
"entities": [],
"metric_details": {},
},
"user": "chatbot"
}
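The retrieve_documents change above relies on a LangChain callback to recover the question after the retriever chain's LLM has rewritten it. A stripped-down sketch of that pattern follows; the handler class name and the commented usage lines are illustrative.

from langchain_core.callbacks import BaseCallbackHandler

class QuestionCaptureHandler(BaseCallbackHandler):
    """Stores the text of the last LLM generation, e.g. a rewritten question."""

    def __init__(self):
        self.transformed_question = None

    def on_llm_end(self, response, **kwargs):
        # response.generations is a list (one entry per prompt) of candidate
        # Generation objects; keep the first candidate of the first prompt.
        self.transformed_question = response.generations[0][0].text.strip()

# Usage sketch: pass the handler via the config argument of invoke().
# handler = QuestionCaptureHandler()
# docs = doc_retriever.invoke({"messages": messages}, {"callbacks": [handler]})
# print(handler.transformed_question)
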
150 changes: 150 additions & 0 deletions backend/src/ragas_eval.py
@@ -0,0 +1,150 @@
import os
import logging
import time
from typing import Dict, Tuple, Optional
import boto3
import google.auth
from datasets import Dataset
from dotenv import load_dotenv
from langchain_anthropic import ChatAnthropic
from langchain_aws import ChatBedrock
from langchain_community.chat_models import ChatOllama
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
from langchain_fireworks import ChatFireworks
from langchain_google_vertexai import (
ChatVertexAI,
HarmBlockThreshold,
HarmCategory,
)
from langchain_groq import ChatGroq
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from ragas import evaluate
from ragas.metrics import answer_relevancy, context_utilization, faithfulness
from src.shared.common_fn import load_embedding_model

load_dotenv()

# Constants for clarity and maintainability
RAGAS_MODEL_VERSIONS = {
"openai-gpt-3.5": "gpt-3.5-turbo-16k",
"gemini-1.0-pro": "gemini-1.0-pro-001",
"gemini-1.5-pro": "gemini-1.5-pro-002",
"gemini-1.5-flash": "gemini-1.5-flash-002",
"openai-gpt-4": "gpt-4-turbo-2024-04-09",
"openai-gpt-4o-mini": "gpt-4o-mini-2024-07-18",
"openai-gpt-4o": "gpt-4o-mini-2024-07-18",
"diffbot": "gpt-4-turbo-2024-04-09",
"groq-llama3": "groq_llama3_70b",
}

EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
EMBEDDING_FUNCTION, _ = load_embedding_model(EMBEDDING_MODEL)


def get_ragas_llm(model: str) -> Tuple[object, str]:
"""Retrieves the specified language model. Improved error handling and structure."""
env_key = f"LLM_MODEL_CONFIG_{model}"
env_value = os.environ.get(env_key)
logging.info(f"Loading model configuration: {env_key}")
try:
if "gemini" in model:
credentials, project_id = google.auth.default()
model_name = RAGAS_MODEL_VERSIONS[model]
llm = ChatVertexAI(
model_name=model_name,
credentials=credentials,
project=project_id,
temperature=0,
safety_settings={
#setting safety to NONE for all categories. Consider reviewing this for production systems
HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
},
)
elif "openai" in model:
model_name = RAGAS_MODEL_VERSIONS[model]
llm = ChatOpenAI(
api_key=os.environ.get("OPENAI_API_KEY"), model=model_name, temperature=0
)

elif "azure" in model:
model_name, api_endpoint, api_key, api_version = env_value.split(",")
llm = AzureChatOpenAI(
api_key=api_key,
azure_endpoint=api_endpoint,
azure_deployment=model_name,
api_version=api_version,
temperature=0,
)
elif "anthropic" in model:
model_name, api_key = env_value.split(",")
llm = ChatAnthropic(api_key=api_key, model=model_name, temperature=0)
elif "fireworks" in model:
model_name, api_key = env_value.split(",")
llm = ChatFireworks(api_key=api_key, model=model_name)
elif "groq" in model:
model_name, base_url, api_key = env_value.split(",")
llm = ChatGroq(api_key=api_key, model_name=model_name, temperature=0)
elif "bedrock" in model:
model_name, aws_access_key, aws_secret_key, region_name = env_value.split(",")
bedrock_client = boto3.client(
service_name="bedrock-runtime",
region_name=region_name,
aws_access_key_id=aws_access_key,
aws_secret_access_key=aws_secret_key,
)
llm = ChatBedrock(
client=bedrock_client, model_id=model_name, model_kwargs=dict(temperature=0)
)
elif "ollama" in model:
model_name, base_url = env_value.split(",")
llm = ChatOllama(base_url=base_url, model=model_name)
elif "diffbot" in model:
llm = DiffbotGraphTransformer(
diffbot_api_key=os.environ.get("DIFFBOT_API_KEY"),
extract_types=["entities", "facts"],
)
else:
raise ValueError(f"Unsupported model: {model}")

logging.info(f"Model loaded - Model Version: {model}")
return llm, model_name
except (ValueError, KeyError) as e:
logging.error(f"Error loading LLM: {e}")
raise


def get_ragas_metrics(
question: str, context: str, answer: str, model: str
) -> Optional[Dict[str, float]]:
"""Calculates RAGAS metrics."""
try:
start_time = time.time()
dataset = Dataset.from_dict(
{"question": [question], "answer": [answer], "contexts": [[context]]}
)
logging.info("Dataset created successfully.")

llm, model_name = get_ragas_llm(model=model)
logging.info(f"Evaluating with model: {model_name}")

score = evaluate(
dataset=dataset,
metrics=[faithfulness, answer_relevancy, context_utilization],
llm=llm,
embeddings=EMBEDDING_FUNCTION,
)

score_dict = (
score.to_pandas()[["faithfulness", "answer_relevancy", "context_utilization"]]
.round(4)
.to_dict(orient="records")[0]
)
end_time = time.time()
logging.info(f"Evaluation completed in: {end_time - start_time:.2f} seconds")
return score_dict
except Exception as e:
logging.exception(f"Error during metrics evaluation: {e}")
return None
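
To exercise get_ragas_metrics directly, outside the /metric endpoint, something like the following should work, assuming EMBEDDING_MODEL and the relevant API key (for example OPENAI_API_KEY) are set in the environment; the sample strings and model name are illustrative.

from src.ragas_eval import get_ragas_metrics

# Illustrative inputs; context should be the retrieved text the answer was grounded on.
metrics = get_ragas_metrics(
    question="What is Neo4j?",
    context="Neo4j is a native graph database that stores data as nodes and relationships.",
    answer="Neo4j is a graph database.",
    model="openai-gpt-4o",
)

# Expect a dict with rounded faithfulness, answer_relevancy, and context_utilization
# scores, or None if evaluation failed.
print(metrics)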