Skip to content

Global search fulltext #767

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
76547be
added global search+vector+fulltext mode
vasanthasaikalluri Sep 23, 2024
0db3df5
added community details in chunk entities
vasanthasaikalluri Sep 23, 2024
ffe7a68
added node ids
vasanthasaikalluri Sep 24, 2024
88b5cd9
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
prakriti-solankey Sep 24, 2024
0330f49
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
prakriti-solankey Sep 24, 2024
46168e4
updated vector graph query
vasanthasaikalluri Sep 24, 2024
7ae1d6c
added entities and modified chat response
vasanthasaikalluri Sep 25, 2024
c100e9d
added params
vasanthasaikalluri Sep 25, 2024
574d69d
Merge branch 'global_search_fulltext' of https://github.com/neo4j-lab…
prakriti-solankey Sep 25, 2024
2afc3ae
api response changes
prakriti-solankey Sep 25, 2024
492f564
added chunk entity query
vasanthasaikalluri Sep 25, 2024
87874bd
modifies query
vasanthasaikalluri Sep 25, 2024
d6fb439
payload changes
prakriti-solankey Sep 25, 2024
da71b89
added nodetails properties
vasanthasaikalluri Sep 25, 2024
86d2b8e
payload new changes
prakriti-solankey Sep 25, 2024
fdd9909
communities check
prakriti-solankey Sep 26, 2024
a1518a3
communities selection check
prakriti-solankey Sep 26, 2024
99531b6
Communities bug solutions (#770)
prakriti-solankey Sep 26, 2024
af483e8
Merge branch 'DEV' into global_search_fulltext
prakriti-solankey Sep 26, 2024
f378e18
readonly fixed on refresh
prakriti-solankey Sep 26, 2024
48c1a27
clear chat history
prakriti-solankey Sep 27, 2024
8063a62
selectedFiles check for Chatbot
prakriti-solankey Sep 27, 2024
c583166
clear history
prakriti-solankey Sep 27, 2024
18b5968
Merge branch 'DEV' into global_search_fulltext
prakriti-solankey Sep 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions backend/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,8 @@ async def post_processing(uri=Form(), userName=Form(), password=Form(), database
await asyncio.to_thread(create_entity_embedding, graph)
json_obj = {'api_name': 'post_processing/create_entity_embedding', 'db_url': uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logging.info(f'Entity Embeddings created')

if "create_communities" in tasks:
if "enable_communities" in tasks:
model = "openai-gpt-4o"
await asyncio.to_thread(create_communities, uri, userName, password, database,model)
josn_obj = {'api_name': 'post_processing/create_communities', 'db_url': uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
Expand Down Expand Up @@ -321,10 +321,9 @@ async def chat_bot(uri=Form(),model=Form(None),userName=Form(), password=Form(),
gc.collect()

@app.post("/chunk_entities")
async def chunk_entities(uri=Form(),userName=Form(), password=Form(), database=Form(), chunk_ids=Form(None),is_entity=Form()):
async def chunk_entities(uri=Form(),userName=Form(), password=Form(), database=Form(), nodedetails=Form(None),entities=Form(),mode=Form()):
try:
logging.info(f"URI: {uri}, Username: {userName}, chunk_ids: {chunk_ids}")
result = await asyncio.to_thread(get_entities_from_chunkids,uri=uri, username=userName, password=password, database=database,chunk_ids=chunk_ids,is_entity=json.loads(is_entity.lower()))
result = await asyncio.to_thread(get_entities_from_chunkids,uri=uri, username=userName, password=password, database=database,nodedetails=nodedetails,entities=entities,mode=mode)
json_obj = {'api_name':'chunk_entities','db_url':uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj, "INFO")
return create_api_response('Success',data=result)
Expand Down
86 changes: 49 additions & 37 deletions backend/src/QA_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ def get_sources_and_chunks(sources_used, docs):
result = {
'sources': sources_used,
'chunkdetails': chunkdetails_list,
"entities" : list()
}
return result

Expand Down Expand Up @@ -182,16 +181,19 @@ def format_documents(documents, model):
sorted_documents = sorted(documents, key=lambda doc: doc.state.get("query_similarity_score", 0), reverse=True)
sorted_documents = sorted_documents[:prompt_token_cutoff]

formatted_docs = []
formatted_docs = list()
sources = set()
lc_entities = {'entities':list()}
entities = dict()
global_communities = list()


for doc in sorted_documents:
try:
source = doc.metadata.get('source', "unknown")
sources.add(source)

lc_entities = doc.metadata if 'entities'in doc.metadata.keys() else lc_entities
entities = doc.metadata['entities'] if 'entities'in doc.metadata.keys() else entities
global_communities = doc.metadata["communitydetails"] if 'communitydetails'in doc.metadata.keys() else global_communities

formatted_doc = (
"Document start\n"
Expand All @@ -204,13 +206,13 @@ def format_documents(documents, model):
except Exception as e:
logging.error(f"Error formatting document: {e}")

return "\n\n".join(formatted_docs), sources,lc_entities
return "\n\n".join(formatted_docs), sources,entities,global_communities

def process_documents(docs, question, messages, llm, model,chat_mode_settings):
start_time = time.time()

try:
formatted_docs, sources,lc_entities = format_documents(docs, model)
formatted_docs, sources, entitydetails, communities = format_documents(docs, model)

rag_chain = get_rag_chain(llm=llm)

Expand All @@ -219,12 +221,25 @@ def process_documents(docs, question, messages, llm, model,chat_mode_settings):
"context": formatted_docs,
"input": question
})
if chat_mode_settings["mode"] == "entity search+vector":
result = {'sources': list(),
'chunkdetails': list()}
result.update(lc_entities)

result = {'sources': list(), 'nodedetails': dict(), 'entities': dict()}
node_details = {"chunkdetails":list(),"entitydetails":list(),"communitydetails":list()}
entities = {'entityids':list(),"relationshipids":list()}

if chat_mode_settings["mode"] == CHAT_ENTITY_VECTOR_MODE:
node_details["entitydetails"] = entitydetails

elif chat_mode_settings["mode"] == CHAT_GLOBAL_VECTOR_FULLTEXT_MODE:
node_details["communitydetails"] = communities
else:
result = get_sources_and_chunks(sources, docs)
sources_and_chunks = get_sources_and_chunks(sources, docs)
result['sources'] = sources_and_chunks['sources']
node_details["chunkdetails"] = sources_and_chunks["chunkdetails"]
entities.update(entitydetails)

result["nodedetails"] = node_details
result["entities"] = entities

content = ai_response.content
total_tokens = get_total_tokens(ai_response, llm)

Expand Down Expand Up @@ -295,10 +310,13 @@ def create_document_retriever_chain(llm, retriever):

def initialize_neo4j_vector(graph, chat_mode_settings):
try:
mode = chat_mode_settings.get('mode', 'undefined')
retrieval_query = chat_mode_settings.get("retrieval_query")
index_name = chat_mode_settings.get("index_name")
keyword_index = chat_mode_settings.get("keyword_index", "")
node_label = chat_mode_settings.get("node_label")
embedding_node_property = chat_mode_settings.get("embedding_node_property")
text_node_properties = chat_mode_settings.get("text_node_properties")


if not retrieval_query or not index_name:
raise ValueError("Required settings 'retrieval_query' or 'index_name' are missing.")
Expand All @@ -310,28 +328,21 @@ def initialize_neo4j_vector(graph, chat_mode_settings):
retrieval_query=retrieval_query,
graph=graph,
search_type="hybrid",
node_label="Chunk",
embedding_node_property="embedding",
text_node_properties=["text"],
node_label=node_label,
embedding_node_property=embedding_node_property,
text_node_properties=text_node_properties,
keyword_index_name=keyword_index
)
logging.info(f"Successfully retrieved Neo4jVector Fulltext index '{index_name}' and keyword index '{keyword_index}'")
elif mode == "entity search+vector":
neo_db = Neo4jVector.from_existing_index(
embedding=EMBEDDING_FUNCTION,
index_name=index_name,
retrieval_query=retrieval_query,
graph=graph
)
else:
neo_db = Neo4jVector.from_existing_graph(
embedding=EMBEDDING_FUNCTION,
index_name=index_name,
retrieval_query=retrieval_query,
graph=graph,
node_label="Chunk",
embedding_node_property="embedding",
text_node_properties=["text"]
node_label=node_label,
embedding_node_property=embedding_node_property,
text_node_properties=text_node_properties
)
logging.info(f"Successfully retrieved Neo4jVector index '{index_name}'")
except Exception as e:
Expand Down Expand Up @@ -359,12 +370,12 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}")
return retriever

def get_neo4j_retriever(graph, document_names,chat_mode_settings, search_k=CHAT_SEARCH_KWARG_K, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD):
def get_neo4j_retriever(graph, document_names,chat_mode_settings, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD):
try:

neo_db = initialize_neo4j_vector(graph, chat_mode_settings)
document_names= list(map(str.strip, json.loads(document_names)))
search_k = LOCAL_COMMUNITY_TOP_K if chat_mode_settings["mode"] == "entity search+vector" else CHAT_SEARCH_KWARG_K
search_k = chat_mode_settings["top_k"]
retriever = create_retriever(neo_db, document_names,chat_mode_settings, search_k, score_threshold)
return retriever
except Exception as e:
Expand Down Expand Up @@ -397,12 +408,13 @@ def process_chat_response(messages, history, question, model, graph, document_na
try:
llm, doc_retriever, model_version = setup_chat(model, graph, document_names, chat_mode_settings)

docs = retrieve_documents(doc_retriever, messages)
docs = retrieve_documents(doc_retriever, messages)

if docs:
content, result, total_tokens = process_documents(docs, question, messages, llm, model, chat_mode_settings)
else:
content = "I couldn't find any relevant documents to answer your question."
result = {"sources": [], "chunkdetails": [], "entities": []}
result = {"sources": list(), "nodedetails": list(), "entities": list()}
total_tokens = 0

ai_response = AIMessage(content=content)
Expand All @@ -412,18 +424,18 @@ def process_chat_response(messages, history, question, model, graph, document_na
summarization_thread.start()
logging.info("Summarization thread started.")
# summarize_and_log(history, messages, llm)

return {
"session_id": "",
"message": content,
"info": {
"sources": result["sources"],
"model": model_version,
"chunkdetails": result["chunkdetails"],
"nodedetails": result["nodedetails"],
"total_tokens": total_tokens,
"response_time": 0,
"mode": chat_mode_settings["mode"],
"entities": result["entities"]
"entities": result["entities"],
},
"user": "chatbot"
}
Expand All @@ -435,12 +447,12 @@ def process_chat_response(messages, history, question, model, graph, document_na
"message": "Something went wrong",
"info": {
"sources": [],
"chunkdetails": [],
"nodedetails": [],
"total_tokens": 0,
"response_time": 0,
"error": f"{type(e).__name__}: {str(e)}",
"mode": chat_mode_settings["mode"],
"entities": []
"entities": [],
},
"user": "chatbot"
}
Expand Down Expand Up @@ -593,7 +605,7 @@ def create_neo4j_chat_message_history(graph, session_id, write_access=True):
raise

def get_chat_mode_settings(mode,settings_map=CHAT_MODE_CONFIG_MAP):
default_settings = settings_map["default"]
default_settings = settings_map[CHAT_DEFAULT_MODE]
try:
chat_mode_settings = settings_map.get(mode, default_settings)
chat_mode_settings["mode"] = mode
Expand All @@ -615,7 +627,7 @@ def QA_RAG(graph,model, question, document_names, session_id, mode, write_access
user_question = HumanMessage(content=question)
messages.append(user_question)

if mode == "graph":
if mode == CHAT_GRAPH_MODE:
result = process_graph_response(model, graph, question, messages, history)
else:
chat_mode_settings = get_chat_mode_settings(mode=mode)
Expand Down
104 changes: 60 additions & 44 deletions backend/src/chunkid_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,16 @@ def process_chunk_data(chunk_data):
except Exception as e:
logging.error(f"chunkid_entities module: An error occurred while extracting the Chunk text from records: {e}")

def process_chunkids(driver, chunk_ids):
def process_chunkids(driver, chunk_ids, entities):
"""
Processes chunk IDs to retrieve chunk data.
"""
try:
logging.info(f"Starting graph query process for chunk ids: {chunk_ids}")
chunk_ids_list = chunk_ids.split(",")

records, summary, keys = driver.execute_query(CHUNK_QUERY, chunksIds=chunk_ids_list)
records, summary, keys = driver.execute_query(CHUNK_QUERY, chunksIds=chunk_ids,entityIds=entities["entityids"], relationshipIds=entities["relationshipids"])
result = process_records(records)
result["nodes"].extend(records[0]["nodes"])
result["nodes"] = remove_duplicate_nodes(result["nodes"])
logging.info(f"Nodes and relationships are processed")

result["chunk_data"] = process_chunk_data(records)
Expand Down Expand Up @@ -118,79 +118,95 @@ def remove_duplicate_nodes(nodes,property="element_id"):

return unique_nodes

def process_entityids(driver, entity_ids):
    """
    Processes entity IDs to retrieve local community data.

    Parameters:
        driver: Graph database driver used to execute the query.
        entity_ids (list): Ids of the entity nodes to expand into their
            local community context.

    Returns:
        dict: Processed nodes/relationships plus 'chunk_data' and
            'community_data' lists (both empty when the query returns no
            records).

    Raises:
        Exception: Re-raises any error from query execution or processing.
    """
    try:
        logging.info(f"Starting graph query process for entity ids: {entity_ids}")
        # Assemble the full query: prefix + parameterised local-community
        # search body + suffix. The format() call injects the top-k limits.
        query_body = LOCAL_COMMUNITY_SEARCH_QUERY.format(
            topChunks=LOCAL_COMMUNITY_TOP_CHUNKS,
            topCommunities=LOCAL_COMMUNITY_TOP_COMMUNITIES,
            topOutsideRels=LOCAL_COMMUNITY_TOP_OUTSIDE_RELS
        )
        query = LOCAL_COMMUNITY_DETAILS_QUERY_PREFIX + query_body + LOCAL_COMMUNITY_DETAILS_QUERY_SUFFIX

        records, summary, keys = driver.execute_query(query, entityIds=entity_ids)

        result = process_records(records)
        if records:
            # All nodes/chunks/communities come back on the first record.
            result["nodes"].extend(records[0]["nodes"])
            result["nodes"] = remove_duplicate_nodes(result["nodes"])
            logging.info(f"Nodes and relationships are processed")
            result["chunk_data"] = records[0]["chunks"]
            result["community_data"] = records[0]["communities"]
        else:
            result["chunk_data"] = list()
            result["community_data"] = list()
        logging.info(f"Query process completed successfully for entity ids: {entity_ids}")
        return result
    except Exception as e:
        logging.error(f"chunkid_entities module: Error processing entity ids: {entity_ids}. Error: {e}")
        raise

def process_communityids(driver, community_ids):
    """
    Processes community IDs to retrieve community data.

    Parameters:
        driver: Graph database driver used to execute the query.
        community_ids (list): Ids of the communities to look up.

    Returns:
        dict: 'nodes', 'relationships' and 'chunk_data' are always empty
            lists; 'community_data' holds the matched communities (empty
            when the query returns no records).

    Raises:
        Exception: Re-raises any error from query execution.
    """
    try:
        logging.info(f"Starting graph query process for community ids: {community_ids}")
        query = GLOBAL_COMMUNITY_DETAILS_QUERY
        records, summary, keys = driver.execute_query(query, communityids=community_ids)

        # Community search returns no chunk/node graph data, only communities.
        result = {"nodes": [], "relationships": [], "chunk_data": []}
        result["community_data"] = records[0]["communities"] if records else []

        logging.info(f"Query process completed successfully for community ids: {community_ids}")
        return result
    except Exception as e:
        logging.error(f"chunkid_entities module: Error processing community ids: {community_ids}. Error: {e}")
        raise

Returns:
dict: A dictionary with 'nodes' and 'relationships' keys containing processed data, or an error message.
"""
def get_entities_from_chunkids(uri, username, password, database, nodedetails, entities, mode):
    """
    Retrieve and process nodes and relationships from the graph database,
    dispatching on the chat mode.

    Parameters:
        uri (str): The URI of the graph database.
        username (str): The username for database authentication.
        password (str): The password for database authentication.
        database (str): The database name.
        nodedetails (str): JSON string that may contain 'chunkdetails',
            'entitydetails' and 'communitydetails' lists; each item is
            expected to carry an 'id' key.
        entities (str): JSON string with 'entityids' and 'relationshipids'
            lists, forwarded to the chunk query.
        mode (str): Chat mode; selects which id set is processed.

    Returns:
        dict: Processed graph data from the matching process_* helper, or a
            default empty response when the relevant ids are missing.

    Raises:
        Exception: Wraps any underlying error with a module-level message.
    """
    try:
        driver = get_graphDB_driver(uri, username, password, database)
        default_response = {"nodes": list(), "relationships": list(), "chunk_data": list(), "community_data": list()}

        nodedetails = json.loads(nodedetails)
        entities = json.loads(entities)

        if mode == CHAT_GLOBAL_VECTOR_FULLTEXT_MODE:
            if "communitydetails" in nodedetails and nodedetails["communitydetails"]:
                community_ids = [item["id"] for item in nodedetails["communitydetails"]]
                logging.info(f"chunkid_entities module: Starting for community ids: {community_ids}")
                return process_communityids(driver, community_ids)
            else:
                logging.info("chunkid_entities module: No community ids are passed")
                return default_response

        elif mode == CHAT_ENTITY_VECTOR_MODE:
            if "entitydetails" in nodedetails and nodedetails["entitydetails"]:
                entity_ids = [item["id"] for item in nodedetails["entitydetails"]]
                logging.info(f"chunkid_entities module: Starting for entity ids: {entity_ids}")
                return process_entityids(driver, entity_ids)
            else:
                logging.info("chunkid_entities module: No entity ids are passed")
                return default_response

        else:
            # Default (chunk-based) modes: expand the chunks referenced in
            # the chat answer together with the selected entity/relationship ids.
            if "chunkdetails" in nodedetails and nodedetails["chunkdetails"]:
                chunk_ids = [item["id"] for item in nodedetails["chunkdetails"]]
                logging.info(f"chunkid_entities module: Starting for chunk ids: {chunk_ids}")
                return process_chunkids(driver, chunk_ids, entities)
            else:
                logging.info("chunkid_entities module: No chunk ids are passed")
                return default_response

    except Exception as e:
        logging.error(f"chunkid_entities module: An error occurred in get_entities_from_chunkids. Error: {str(e)}")
        raise Exception(f"chunkid_entities module: An error occurred in get_entities_from_chunkids. Please check the logs for more details.") from e

Loading