Ragas Evaluation Metrics #787

Merged 34 commits on Oct 10, 2024
Changes shown below are from 30 of the 34 commits.

Commits (34)
42cc09d
added multi-mode selection
kartikpersistent Oct 1, 2024
9cae794
ragas eval
vasanthasaikalluri Oct 1, 2024
a6c1a68
added response
vasanthasaikalluri Oct 1, 2024
13ef4d2
multimodes state management
kartikpersistent Oct 1, 2024
2bd2aa3
fix: state handling of chat details of the default mode
kartikpersistent Oct 1, 2024
e860223
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
kartikpersistent Oct 1, 2024
1aa340b
Added the ChatModeSwitch Component
kartikpersistent Oct 1, 2024
8e3464a
modes switch state management
kartikpersistent Oct 1, 2024
5250c11
added the chat modes switch in both views
kartikpersistent Oct 3, 2024
90e8540
removed the copied text
kartikpersistent Oct 3, 2024
b543210
Handled the error scenario
kartikpersistent Oct 3, 2024
8a3c046
fix: speech issue between modes
kartikpersistent Oct 3, 2024
839536e
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
kartikpersistent Oct 3, 2024
3d7f931
show ragas evaluation metrics
kaustubh-darekar Oct 3, 2024
72f607c
Merge branch 'Multiple-chat-modes-selection' of https://github.com/ne…
kartikpersistent Oct 3, 2024
cd70cf8
Output return type changed
kaustubh-darekar Oct 3, 2024
6e29edb
Merge branch 'ragas_integration' of https://github.com/neo4j-labs/llm…
kaustubh-darekar Oct 3, 2024
132ea13
fix: handled active speech and other-message modes switch
kartikpersistent Oct 3, 2024
7ce6925
used requestAnimationFrame instead of setTimeout
kartikpersistent Oct 3, 2024
65e1b97
removed the commented code
kartikpersistent Oct 3, 2024
8524888
Merge branch 'Multiple-chat-modes-selection' of https://github.com/ne…
kartikpersistent Oct 4, 2024
d5fd219
Merge branch 'ragas_integration' of https://github.com/neo4j-labs/llm…
kartikpersistent Oct 4, 2024
00e7eb0
Added ragas to requirements
a-s-poorna Oct 4, 2024
3cff860
Integrated the metric API
kartikpersistent Oct 4, 2024
b62593e
Merge branch 'ragas_integration' of https://github.com/neo4j-labs/llm…
kartikpersistent Oct 4, 2024
c1456cf
ragas response updated, llm list updated
kaustubh-darekar Oct 4, 2024
0e47dd9
resolved syntax error in score
kaustubh-darekar Oct 4, 2024
c91bc45
Added the Metrics Table
kartikpersistent Oct 7, 2024
030474b
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
kartikpersistent Oct 7, 2024
6565d7b
fix: Long text UI Issue
kartikpersistent Oct 8, 2024
a034bf2
code optimization for evaluation
kaustubh-darekar Oct 9, 2024
66dc13e
added the download button for downloading the info
kartikpersistent Oct 9, 2024
97696b0
key name change
kartikpersistent Oct 10, 2024
09f9b60
Optimized the downloadClickHandler
kartikpersistent Oct 10, 2024
1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -179,4 +179,5 @@ PyMuPDF==1.24.5
pypandoc==1.13
graphdatascience==1.10
Secweb==1.11.0
ragas==0.1.14

20 changes: 19 additions & 1 deletion backend/score.py
@@ -33,6 +33,8 @@
from Secweb.XContentTypeOptions import XContentTypeOptions
from Secweb.XFrameOptions import XFrame

from src.ragas_eval import *

logger = CustomLogger()
CHUNK_DIR = os.path.join(os.path.dirname(__file__), "chunks")
MERGED_DIR = os.path.join(os.path.dirname(__file__), "merged_files")
@@ -711,7 +713,23 @@ async def retry_processing(uri=Form(), userName=Form(), password=Form(), databas
        logging.exception(f'{error_message}')
        return create_api_response(job_status, message=message, error=error_message)
    finally:
        gc.collect()

@app.post('/metric')
async def calculate_metric(question=Form(), context=Form(), answer=Form(), model=Form()):
    try:
        result = await asyncio.to_thread(get_ragas_metrics, question, context, answer, model)
        if result is None:
            return create_api_response('Failed', message='Failed to calculate metrics.', error='Ragas evaluation returned null')
        return create_api_response('Success', data=result, message='Ragas evaluation metrics calculated successfully')
    except Exception as e:
        job_status = "Failed"
        message = "Error while calculating evaluation metrics"
        error_message = str(e)
        logging.exception(f'{error_message}')
        return create_api_response(job_status, message=message, error=error_message)
    finally:
        gc.collect()

if __name__ == "__main__":
    uvicorn.run(app)
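
For reference, a minimal client-side sketch of calling the new /metric endpoint (hypothetical values; assumes the backend is running locally on uvicorn's default port 8000):

import requests

# Hypothetical payload; the endpoint reads form fields,
# so send them via `data=` rather than `json=`.
payload = {
    "question": "What is Neo4j?",
    "context": "Neo4j is a graph database management system.",
    "answer": "Neo4j is a graph database.",
    "model": "openai-gpt-4o-mini",
}

response = requests.post("http://localhost:8000/metric", data=payload)
# On success, the `data` field carries the Ragas scores, e.g.
# {"faithfulness": 1.0, "answer_relevancy": 0.98, "context_utilization": 1.0}
print(response.json())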
43 changes: 34 additions & 9 deletions backend/src/QA_integration.py
@@ -22,7 +22,8 @@
from langchain_text_splitters import TokenTextSplitter
from langchain_core.messages import HumanMessage, AIMessage
from langchain.chains import GraphCypherQAChain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.callbacks import StdOutCallbackHandler, BaseCallbackHandler

# LangChain chat models
from langchain_openai import ChatOpenAI, AzureChatOpenAI
@@ -38,13 +39,12 @@
from src.shared.common_fn import load_embedding_model
from src.shared.constants import *
from src.graphDB_dataAccess import graphDBdataAccess
from src.ragas_eval import get_ragas_metrics
load_dotenv()

EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')
EMBEDDING_FUNCTION, _ = load_embedding_model(EMBEDDING_MODEL)

class SessionChatHistory:
    history_dict = {}

@@ -58,6 +58,17 @@ def get_chat_history(cls, session_id):
        logging.info(f"Retrieved existing ChatMessageHistory Local for session ID: {session_id}")
        return cls.history_dict[session_id]

class CustomCallback(BaseCallbackHandler):
    """Captures the rewritten standalone question produced by the query-transforming LLM."""

    def __init__(self):
        self.transformed_question = None

    def on_llm_end(self, response, **kwargs: Any) -> None:
        logging.info("question transformed")
        self.transformed_question = response.generations[0][0].text.strip()

def get_history_by_session_id(session_id):
    try:
        return SessionChatHistory.get_chat_history(session_id)
@@ -250,21 +261,25 @@ def process_documents(docs, question, messages, llm, model, chat_mode_settings):
        logging.error(f"Error processing documents: {e}")
        raise

    return content, result, total_tokens, formatted_docs

def retrieve_documents(doc_retriever, messages):

    start_time = time.time()
    try:
        handler = CustomCallback()
        docs = doc_retriever.invoke({"messages": messages}, {"callbacks": [handler]})
        transformed_question = handler.transformed_question
        if transformed_question:
            logging.info(f"Transformed question: {transformed_question}")
        doc_retrieval_time = time.time() - start_time
        logging.info(f"Documents retrieved in {doc_retrieval_time:.2f} seconds")

    except Exception as e:
        logging.error(f"Error retrieving documents: {e}")
        raise

    return docs, transformed_question

def create_document_retriever_chain(llm, retriever):
    try:
@@ -408,14 +423,19 @@ def process_chat_response(messages, history, question, model, graph, document_na
    try:
        llm, doc_retriever, model_version = setup_chat(model, graph, document_names, chat_mode_settings)

        docs, transformed_question = retrieve_documents(doc_retriever, messages)

        if docs:
            content, result, total_tokens, formatted_docs = process_documents(docs, question, messages, llm, model, chat_mode_settings)
        else:
            content = "I couldn't find any relevant documents to answer your question."
            result = {"sources": list(), "nodedetails": list(), "entities": list()}
            total_tokens = 0
            formatted_docs = ""

        question = transformed_question if transformed_question else question
        # metrics = get_ragas_metrics(question, formatted_docs, content)
        # print(metrics)

        ai_response = AIMessage(content=content)
        messages.append(ai_response)
@@ -424,19 +444,22 @@ def process_chat_response(messages, history, question, model, graph, document_na
        summarization_thread.start()
        logging.info("Summarization thread started.")
        # summarize_and_log(history, messages, llm)

        metric_details = {"question": question, "contexts": formatted_docs, "answer": content}
        return {
            "session_id": "",
            "message": content,
            "info": {
                # "metrics": metrics,
                "sources": result["sources"],
                "model": model_version,
                "nodedetails": result["nodedetails"],
                "total_tokens": total_tokens,
                "response_time": 0,
                "mode": chat_mode_settings["mode"],
                "entities": result["entities"],
                "metric_details": metric_details,
            },
            "user": "chatbot"
        }

@@ -446,13 +469,15 @@ def process_chat_response(messages, history, question, model, graph, document_na
        "session_id": "",
        "message": "Something went wrong",
        "info": {
            "metrics": [],
            "sources": [],
            "nodedetails": [],
            "total_tokens": 0,
            "response_time": 0,
            "error": f"{type(e).__name__}: {str(e)}",
            "mode": chat_mode_settings["mode"],
            "entities": [],
            "metric_details": {},
        },
        "user": "chatbot"
    }
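The CustomCallback introduced above hooks BaseCallbackHandler.on_llm_end to capture the rewritten standalone question produced by the document-retriever chain, so evaluation can use the transformed question rather than the raw user input. A minimal, self-contained sketch of the same pattern, using LangChain's FakeListLLM purely for illustration:

from langchain_community.llms.fake import FakeListLLM
from langchain_core.callbacks import BaseCallbackHandler

class CaptureHandler(BaseCallbackHandler):
    """Stores the text of the first generation once the LLM call finishes."""

    def __init__(self):
        self.captured = None

    def on_llm_end(self, response, **kwargs):
        self.captured = response.generations[0][0].text.strip()

llm = FakeListLLM(responses=["rewritten standalone question"])
handler = CaptureHandler()
llm.invoke("original question", config={"callbacks": [handler]})
print(handler.captured)  # -> "rewritten standalone question"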
172 changes: 172 additions & 0 deletions backend/src/ragas_eval.py
@@ -0,0 +1,172 @@
import logging
import os

import boto3
import google.auth
from datasets import Dataset
from dotenv import load_dotenv
from langchain_anthropic import ChatAnthropic
from langchain_aws import ChatBedrock
from langchain_community.chat_models import ChatOllama
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
from langchain_fireworks import ChatFireworks
from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    context_utilization,
    faithfulness,
)

from src.shared.common_fn import load_embedding_model

load_dotenv()

RAGAS_MODEL_VERSIONS = {
    "openai-gpt-3.5": "gpt-3.5-turbo-16k",
    "gemini-1.0-pro": "gemini-1.0-pro-001",
    "gemini-1.5-pro": "gemini-1.5-pro-002",
    "gemini-1.5-flash": "gemini-1.5-flash-002",
    "openai-gpt-4": "gpt-4-turbo-2024-04-09",
    "openai-gpt-4o-mini": "gpt-4o-mini-2024-07-18",
    "openai-gpt-4o": "gpt-4o-mini-2024-07-18",
    "diffbot": "gpt-4-turbo-2024-04-09",
    "groq-llama3": "groq_llama3_70b",
}

EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')
EMBEDDING_FUNCTION, _ = load_embedding_model(EMBEDDING_MODEL)


def get_ragas_llm(model: str):
    """Retrieve the specified language model based on the model name."""
    env_key = "LLM_MODEL_CONFIG_" + model
    env_value = os.environ.get(env_key)
    logging.info("Model: {}".format(env_key))
    if "gemini" in model:
        credentials, project_id = google.auth.default()
        model_name = RAGAS_MODEL_VERSIONS[model]
        llm = ChatVertexAI(
            model_name=model_name,
            credentials=credentials,
            project=project_id,
            temperature=0,
            safety_settings={
                HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            },
        )
    elif "openai" in model:
        model_name = RAGAS_MODEL_VERSIONS[model]
        llm = ChatOpenAI(
            api_key=os.environ.get("OPENAI_API_KEY"),
            model=model_name,
            temperature=0,
        )

    elif "azure" in model:
        model_name, api_endpoint, api_key, api_version = env_value.split(",")
        llm = AzureChatOpenAI(
            api_key=api_key,
            azure_endpoint=api_endpoint,
            azure_deployment=model_name,  # takes precedence over the model parameter
            api_version=api_version,
            temperature=0,
            max_tokens=None,
            timeout=None,
        )

    elif "anthropic" in model:
        model_name, api_key = env_value.split(",")
        llm = ChatAnthropic(
            api_key=api_key, model=model_name, temperature=0, timeout=None
        )

    elif "fireworks" in model:
        model_name, api_key = env_value.split(",")
        llm = ChatFireworks(api_key=api_key, model=model_name)

    elif "groq" in model:
        model_name, base_url, api_key = env_value.split(",")
        llm = ChatGroq(api_key=api_key, model_name=model_name, temperature=0)

    elif "bedrock" in model:
        model_name, aws_access_key, aws_secret_key, region_name = env_value.split(",")
        bedrock_client = boto3.client(
            service_name="bedrock-runtime",
            region_name=region_name,
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
        )

        llm = ChatBedrock(
            client=bedrock_client, model_id=model_name, model_kwargs=dict(temperature=0)
        )

    elif "ollama" in model:
        model_name, base_url = env_value.split(",")
        llm = ChatOllama(base_url=base_url, model=model_name)

    elif "diffbot" in model:
        model_name = "diffbot"
        llm = DiffbotGraphTransformer(
            diffbot_api_key=os.environ.get("DIFFBOT_API_KEY"),
            extract_types=["entities", "facts"],
        )

    else:
        model_name, api_endpoint, api_key = env_value.split(",")
        llm = ChatOpenAI(
            api_key=api_key,
            base_url=api_endpoint,
            model=model_name,
            temperature=0,
        )

    logging.info(f"Model created - Model Version: {model}")
    return llm, model_name
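
The comma-separated configs above are read from LLM_MODEL_CONFIG_<model> environment variables. Illustrative .env entries matching the split(",") order of the corresponding branches (the key suffixes and all values here are placeholders, not the project's actual settings):

LLM_MODEL_CONFIG_azure_ai_gpt_4o="gpt-4o,https://<resource>.openai.azure.com/,<api-key>,2024-02-01"
LLM_MODEL_CONFIG_groq_llama3="llama3-70b-8192,https://api.groq.com,<api-key>"
LLM_MODEL_CONFIG_ollama_llama3="llama3,http://localhost:11434"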

def get_ragas_metrics(question, context, answer, model):
    """
    Calculates metrics (faithfulness, answer relevancy, context utilization)
    for a given user question, retrieved context, and generated answer.

    Args:
        question (str): The question asked by the user.
        context (str): The relevant context retrieved by the chatbot.
        answer (str): The answer generated by the chatbot.
        model (str): The name of the language model used.

    Returns:
        dict: A dictionary containing the metrics, or None if an error occurs.
    """
    try:
        dataset = {
            'question': [question],
            'answer': [answer],
            'contexts': [[context]],
        }
        dataset = Dataset.from_dict(dataset)
        logging.info("Dataset created successfully.")

        llm, model_name = get_ragas_llm(model=model)
        logging.info(f"Evaluating with model: {model_name}")
        embeddings = EMBEDDING_FUNCTION

        score = evaluate(dataset=dataset, metrics=[faithfulness, answer_relevancy, context_utilization], llm=llm, embeddings=embeddings)

        score_dict = score.to_pandas()[['faithfulness', 'answer_relevancy', 'context_utilization']].round(4).to_dict(orient="index")[0]
        logging.info("Evaluation completed successfully.")
        return score_dict

    except Exception as e:
        logging.error(f"An error occurred during metrics evaluation: {e}")
        return None
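
A hypothetical direct invocation, e.g. from a test or notebook (requires the matching API key and EMBEDDING_MODEL to be set in the environment):

from src.ragas_eval import get_ragas_metrics

metrics = get_ragas_metrics(
    question="What is Neo4j?",
    context="Neo4j is a graph database management system developed by Neo4j, Inc.",
    answer="Neo4j is a graph database.",
    model="openai-gpt-4o-mini",
)
# Returns a dict like {'faithfulness': 1.0, 'answer_relevancy': 0.97, 'context_utilization': 1.0},
# or None if evaluation failed (the error is logged).
print(metrics)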