Skip to content

Global search fulltext #767

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
76547be
added global search+vector+fulltext mode
vasanthasaikalluri Sep 23, 2024
0db3df5
added community details in chunk entities
vasanthasaikalluri Sep 23, 2024
ffe7a68
added node ids
vasanthasaikalluri Sep 24, 2024
88b5cd9
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
prakriti-solankey Sep 24, 2024
0330f49
Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder…
prakriti-solankey Sep 24, 2024
46168e4
updated vector graph query
vasanthasaikalluri Sep 24, 2024
7ae1d6c
added entities and modified chat response
vasanthasaikalluri Sep 25, 2024
c100e9d
added params
vasanthasaikalluri Sep 25, 2024
574d69d
Merge branch 'global_search_fulltext' of https://github.com/neo4j-lab…
prakriti-solankey Sep 25, 2024
2afc3ae
api response changes
prakriti-solankey Sep 25, 2024
492f564
added chunk entity query
vasanthasaikalluri Sep 25, 2024
87874bd
modifies query
vasanthasaikalluri Sep 25, 2024
d6fb439
payload changes
prakriti-solankey Sep 25, 2024
da71b89
added nodetails properties
vasanthasaikalluri Sep 25, 2024
86d2b8e
payload new changes
prakriti-solankey Sep 25, 2024
fdd9909
communities check
prakriti-solankey Sep 26, 2024
a1518a3
communities selection check
prakriti-solankey Sep 26, 2024
99531b6
Communities bug solutions (#770)
prakriti-solankey Sep 26, 2024
af483e8
Merge branch 'DEV' into global_search_fulltext
prakriti-solankey Sep 26, 2024
f378e18
readonly fixed on refresh
prakriti-solankey Sep 26, 2024
48c1a27
clear chat history
prakriti-solankey Sep 27, 2024
8063a62
selectedFiles check for Chatbot
prakriti-solankey Sep 27, 2024
c583166
clear history
prakriti-solankey Sep 27, 2024
18b5968
Merge branch 'DEV' into global_search_fulltext
prakriti-solankey Sep 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions backend/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,8 @@ async def post_processing(uri=Form(), userName=Form(), password=Form(), database
await asyncio.to_thread(create_entity_embedding, graph)
json_obj = {'api_name': 'post_processing/create_entity_embedding', 'db_url': uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logging.info(f'Entity Embeddings created')

if "create_communities" in tasks:
if "enable_communities" in tasks:
model = "openai-gpt-4o"
await asyncio.to_thread(create_communities, uri, userName, password, database,model)
josn_obj = {'api_name': 'post_processing/create_communities', 'db_url': uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
Expand Down Expand Up @@ -321,10 +321,9 @@ async def chat_bot(uri=Form(),model=Form(None),userName=Form(), password=Form(),
gc.collect()

@app.post("/chunk_entities")
async def chunk_entities(uri=Form(),userName=Form(), password=Form(), database=Form(), chunk_ids=Form(None),is_entity=Form()):
async def chunk_entities(uri=Form(),userName=Form(), password=Form(), database=Form(), nodedetails=Form(None),entities=Form(),mode=Form()):
try:
logging.info(f"URI: {uri}, Username: {userName}, chunk_ids: {chunk_ids}")
result = await asyncio.to_thread(get_entities_from_chunkids,uri=uri, username=userName, password=password, database=database,chunk_ids=chunk_ids,is_entity=json.loads(is_entity.lower()))
result = await asyncio.to_thread(get_entities_from_chunkids,uri=uri, username=userName, password=password, database=database,nodedetails=nodedetails,entities=entities,mode=mode)
json_obj = {'api_name':'chunk_entities','db_url':uri, 'logging_time': formatted_time(datetime.now(timezone.utc))}
logger.log_struct(json_obj, "INFO")
return create_api_response('Success',data=result)
Expand Down
86 changes: 49 additions & 37 deletions backend/src/QA_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ def get_sources_and_chunks(sources_used, docs):
result = {
'sources': sources_used,
'chunkdetails': chunkdetails_list,
"entities" : list()
}
return result

Expand Down Expand Up @@ -182,16 +181,19 @@ def format_documents(documents, model):
sorted_documents = sorted(documents, key=lambda doc: doc.state.get("query_similarity_score", 0), reverse=True)
sorted_documents = sorted_documents[:prompt_token_cutoff]

formatted_docs = []
formatted_docs = list()
sources = set()
lc_entities = {'entities':list()}
entities = dict()
global_communities = list()


for doc in sorted_documents:
try:
source = doc.metadata.get('source', "unknown")
sources.add(source)

lc_entities = doc.metadata if 'entities'in doc.metadata.keys() else lc_entities
entities = doc.metadata['entities'] if 'entities'in doc.metadata.keys() else entities
global_communities = doc.metadata["communitydetails"] if 'communitydetails'in doc.metadata.keys() else global_communities

formatted_doc = (
"Document start\n"
Expand All @@ -204,13 +206,13 @@ def format_documents(documents, model):
except Exception as e:
logging.error(f"Error formatting document: {e}")

return "\n\n".join(formatted_docs), sources,lc_entities
return "\n\n".join(formatted_docs), sources,entities,global_communities

def process_documents(docs, question, messages, llm, model,chat_mode_settings):
start_time = time.time()

try:
formatted_docs, sources,lc_entities = format_documents(docs, model)
formatted_docs, sources, entitydetails, communities = format_documents(docs, model)

rag_chain = get_rag_chain(llm=llm)

Expand All @@ -219,12 +221,25 @@ def process_documents(docs, question, messages, llm, model,chat_mode_settings):
"context": formatted_docs,
"input": question
})
if chat_mode_settings["mode"] == "entity search+vector":
result = {'sources': list(),
'chunkdetails': list()}
result.update(lc_entities)

result = {'sources': list(), 'nodedetails': dict(), 'entities': dict()}
node_details = {"chunkdetails":list(),"entitydetails":list(),"communitydetails":list()}
entities = {'entityids':list(),"relationshipids":list()}

if chat_mode_settings["mode"] == CHAT_ENTITY_VECTOR_MODE:
node_details["entitydetails"] = entitydetails

elif chat_mode_settings["mode"] == CHAT_GLOBAL_VECTOR_FULLTEXT_MODE:
node_details["communitydetails"] = communities
else:
result = get_sources_and_chunks(sources, docs)
sources_and_chunks = get_sources_and_chunks(sources, docs)
result['sources'] = sources_and_chunks['sources']
node_details["chunkdetails"] = sources_and_chunks["chunkdetails"]
entities.update(entitydetails)

result["nodedetails"] = node_details
result["entities"] = entities

content = ai_response.content
total_tokens = get_total_tokens(ai_response, llm)

Expand Down Expand Up @@ -295,10 +310,13 @@ def create_document_retriever_chain(llm, retriever):

def initialize_neo4j_vector(graph, chat_mode_settings):
try:
mode = chat_mode_settings.get('mode', 'undefined')
retrieval_query = chat_mode_settings.get("retrieval_query")
index_name = chat_mode_settings.get("index_name")
keyword_index = chat_mode_settings.get("keyword_index", "")
node_label = chat_mode_settings.get("node_label")
embedding_node_property = chat_mode_settings.get("embedding_node_property")
text_node_properties = chat_mode_settings.get("text_node_properties")


if not retrieval_query or not index_name:
raise ValueError("Required settings 'retrieval_query' or 'index_name' are missing.")
Expand All @@ -310,28 +328,21 @@ def initialize_neo4j_vector(graph, chat_mode_settings):
retrieval_query=retrieval_query,
graph=graph,
search_type="hybrid",
node_label="Chunk",
embedding_node_property="embedding",
text_node_properties=["text"],
node_label=node_label,
embedding_node_property=embedding_node_property,
text_node_properties=text_node_properties,
keyword_index_name=keyword_index
)
logging.info(f"Successfully retrieved Neo4jVector Fulltext index '{index_name}' and keyword index '{keyword_index}'")
elif mode == "entity search+vector":
neo_db = Neo4jVector.from_existing_index(
embedding=EMBEDDING_FUNCTION,
index_name=index_name,
retrieval_query=retrieval_query,
graph=graph
)
else:
neo_db = Neo4jVector.from_existing_graph(
embedding=EMBEDDING_FUNCTION,
index_name=index_name,
retrieval_query=retrieval_query,
graph=graph,
node_label="Chunk",
embedding_node_property="embedding",
text_node_properties=["text"]
node_label=node_label,
embedding_node_property=embedding_node_property,
text_node_properties=text_node_properties
)
logging.info(f"Successfully retrieved Neo4jVector index '{index_name}'")
except Exception as e:
Expand Down Expand Up @@ -359,12 +370,12 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_
logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}")
return retriever

def get_neo4j_retriever(graph, document_names,chat_mode_settings, search_k=CHAT_SEARCH_KWARG_K, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD):
def get_neo4j_retriever(graph, document_names,chat_mode_settings, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD):
try:

neo_db = initialize_neo4j_vector(graph, chat_mode_settings)
document_names= list(map(str.strip, json.loads(document_names)))
search_k = LOCAL_COMMUNITY_TOP_K if chat_mode_settings["mode"] == "entity search+vector" else CHAT_SEARCH_KWARG_K
search_k = chat_mode_settings["top_k"]
retriever = create_retriever(neo_db, document_names,chat_mode_settings, search_k, score_threshold)
return retriever
except Exception as e:
Expand Down Expand Up @@ -397,12 +408,13 @@ def process_chat_response(messages, history, question, model, graph, document_na
try:
llm, doc_retriever, model_version = setup_chat(model, graph, document_names, chat_mode_settings)

docs = retrieve_documents(doc_retriever, messages)
docs = retrieve_documents(doc_retriever, messages)

if docs:
content, result, total_tokens = process_documents(docs, question, messages, llm, model, chat_mode_settings)
else:
content = "I couldn't find any relevant documents to answer your question."
result = {"sources": [], "chunkdetails": [], "entities": []}
result = {"sources": list(), "nodedetails": list(), "entities": list()}
total_tokens = 0

ai_response = AIMessage(content=content)
Expand All @@ -412,18 +424,18 @@ def process_chat_response(messages, history, question, model, graph, document_na
summarization_thread.start()
logging.info("Summarization thread started.")
# summarize_and_log(history, messages, llm)

return {
"session_id": "",
"message": content,
"info": {
"sources": result["sources"],
"model": model_version,
"chunkdetails": result["chunkdetails"],
"nodedetails": result["nodedetails"],
"total_tokens": total_tokens,
"response_time": 0,
"mode": chat_mode_settings["mode"],
"entities": result["entities"]
"entities": result["entities"],
},
"user": "chatbot"
}
Expand All @@ -435,12 +447,12 @@ def process_chat_response(messages, history, question, model, graph, document_na
"message": "Something went wrong",
"info": {
"sources": [],
"chunkdetails": [],
"nodedetails": [],
"total_tokens": 0,
"response_time": 0,
"error": f"{type(e).__name__}: {str(e)}",
"mode": chat_mode_settings["mode"],
"entities": []
"entities": [],
},
"user": "chatbot"
}
Expand Down Expand Up @@ -593,7 +605,7 @@ def create_neo4j_chat_message_history(graph, session_id, write_access=True):
raise

def get_chat_mode_settings(mode,settings_map=CHAT_MODE_CONFIG_MAP):
default_settings = settings_map["default"]
default_settings = settings_map[CHAT_DEFAULT_MODE]
try:
chat_mode_settings = settings_map.get(mode, default_settings)
chat_mode_settings["mode"] = mode
Expand All @@ -615,7 +627,7 @@ def QA_RAG(graph,model, question, document_names, session_id, mode, write_access
user_question = HumanMessage(content=question)
messages.append(user_question)

if mode == "graph":
if mode == CHAT_GRAPH_MODE:
result = process_graph_response(model, graph, question, messages, history)
else:
chat_mode_settings = get_chat_mode_settings(mode=mode)
Expand Down
104 changes: 60 additions & 44 deletions backend/src/chunkid_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,16 @@ def process_chunk_data(chunk_data):
except Exception as e:
logging.error(f"chunkid_entities module: An error occurred while extracting the Chunk text from records: {e}")

def process_chunkids(driver, chunk_ids):
def process_chunkids(driver, chunk_ids, entities):
"""
Processes chunk IDs to retrieve chunk data.
"""
try:
logging.info(f"Starting graph query process for chunk ids: {chunk_ids}")
chunk_ids_list = chunk_ids.split(",")

records, summary, keys = driver.execute_query(CHUNK_QUERY, chunksIds=chunk_ids_list)
records, summary, keys = driver.execute_query(CHUNK_QUERY, chunksIds=chunk_ids,entityIds=entities["entityids"], relationshipIds=entities["relationshipids"])
result = process_records(records)
result["nodes"].extend(records[0]["nodes"])
result["nodes"] = remove_duplicate_nodes(result["nodes"])
logging.info(f"Nodes and relationships are processed")

result["chunk_data"] = process_chunk_data(records)
Expand Down Expand Up @@ -118,79 +118,95 @@ def remove_duplicate_nodes(nodes,property="element_id"):

return unique_nodes

def process_entityids(driver, entity_ids):
    """
    Processes entity IDs to retrieve local community data.

    Parameters:
        driver: Graph database driver used to execute the query.
        entity_ids (list): Ids of the entity nodes to expand into their
            local community context.

    Returns:
        dict: Processed nodes/relationships plus 'chunk_data' and
            'community_data' lists (both empty when the query returns no
            records).

    Raises:
        Exception: Re-raises any error from query execution or processing.
    """
    try:
        logging.info(f"Starting graph query process for entity ids: {entity_ids}")
        # Assemble the full query: prefix + parameterised local-community
        # search body + suffix. The format() call injects the top-k limits.
        query_body = LOCAL_COMMUNITY_SEARCH_QUERY.format(
            topChunks=LOCAL_COMMUNITY_TOP_CHUNKS,
            topCommunities=LOCAL_COMMUNITY_TOP_COMMUNITIES,
            topOutsideRels=LOCAL_COMMUNITY_TOP_OUTSIDE_RELS
        )
        query = LOCAL_COMMUNITY_DETAILS_QUERY_PREFIX + query_body + LOCAL_COMMUNITY_DETAILS_QUERY_SUFFIX

        records, summary, keys = driver.execute_query(query, entityIds=entity_ids)

        result = process_records(records)
        if records:
            # All nodes/chunks/communities come back on the first record.
            result["nodes"].extend(records[0]["nodes"])
            result["nodes"] = remove_duplicate_nodes(result["nodes"])
            logging.info(f"Nodes and relationships are processed")
            result["chunk_data"] = records[0]["chunks"]
            result["community_data"] = records[0]["communities"]
        else:
            result["chunk_data"] = list()
            result["community_data"] = list()
        logging.info(f"Query process completed successfully for entity ids: {entity_ids}")
        return result
    except Exception as e:
        logging.error(f"chunkid_entities module: Error processing entity ids: {entity_ids}. Error: {e}")
        raise

def process_communityids(driver, community_ids):
    """
    Processes community IDs to retrieve community data.

    Parameters:
        driver: Graph database driver used to execute the query.
        community_ids (list): Ids of the communities to look up.

    Returns:
        dict: 'nodes', 'relationships' and 'chunk_data' are always empty
            lists; 'community_data' holds the matched communities (empty
            when the query returns no records).

    Raises:
        Exception: Re-raises any error from query execution.
    """
    try:
        logging.info(f"Starting graph query process for community ids: {community_ids}")
        query = GLOBAL_COMMUNITY_DETAILS_QUERY
        records, summary, keys = driver.execute_query(query, communityids=community_ids)

        # Community search returns no chunk/node graph data, only communities.
        result = {"nodes": [], "relationships": [], "chunk_data": []}
        result["community_data"] = records[0]["communities"] if records else []

        logging.info(f"Query process completed successfully for community ids: {community_ids}")
        return result
    except Exception as e:
        logging.error(f"chunkid_entities module: Error processing community ids: {community_ids}. Error: {e}")
        raise

Returns:
dict: A dictionary with 'nodes' and 'relationships' keys containing processed data, or an error message.
"""
def get_entities_from_chunkids(uri, username, password, database, nodedetails, entities, mode):
    """
    Retrieve and process nodes and relationships from the graph database,
    dispatching on the chat mode.

    Parameters:
        uri (str): The URI of the graph database.
        username (str): The username for database authentication.
        password (str): The password for database authentication.
        database (str): The database name.
        nodedetails (str): JSON string that may contain 'chunkdetails',
            'entitydetails' and 'communitydetails' lists; each item is
            expected to carry an 'id' key.
        entities (str): JSON string with 'entityids' and 'relationshipids'
            lists, forwarded to the chunk query.
        mode (str): Chat mode; selects which id set is processed.

    Returns:
        dict: Processed graph data from the matching process_* helper, or a
            default empty response when the relevant ids are missing.

    Raises:
        Exception: Wraps any underlying error with a module-level message.
    """
    try:
        driver = get_graphDB_driver(uri, username, password, database)
        default_response = {"nodes": list(), "relationships": list(), "chunk_data": list(), "community_data": list()}

        nodedetails = json.loads(nodedetails)
        entities = json.loads(entities)

        if mode == CHAT_GLOBAL_VECTOR_FULLTEXT_MODE:
            if "communitydetails" in nodedetails and nodedetails["communitydetails"]:
                community_ids = [item["id"] for item in nodedetails["communitydetails"]]
                logging.info(f"chunkid_entities module: Starting for community ids: {community_ids}")
                return process_communityids(driver, community_ids)
            else:
                logging.info("chunkid_entities module: No community ids are passed")
                return default_response

        elif mode == CHAT_ENTITY_VECTOR_MODE:
            if "entitydetails" in nodedetails and nodedetails["entitydetails"]:
                entity_ids = [item["id"] for item in nodedetails["entitydetails"]]
                logging.info(f"chunkid_entities module: Starting for entity ids: {entity_ids}")
                return process_entityids(driver, entity_ids)
            else:
                logging.info("chunkid_entities module: No entity ids are passed")
                return default_response

        else:
            # Default (chunk-based) modes: expand the chunks referenced in
            # the chat answer together with the selected entity/relationship ids.
            if "chunkdetails" in nodedetails and nodedetails["chunkdetails"]:
                chunk_ids = [item["id"] for item in nodedetails["chunkdetails"]]
                logging.info(f"chunkid_entities module: Starting for chunk ids: {chunk_ids}")
                return process_chunkids(driver, chunk_ids, entities)
            else:
                logging.info("chunkid_entities module: No chunk ids are passed")
                return default_response

    except Exception as e:
        logging.error(f"chunkid_entities module: An error occurred in get_entities_from_chunkids. Error: {str(e)}")
        raise Exception(f"chunkid_entities module: An error occurred in get_entities_from_chunkids. Please check the logs for more details.") from e

Loading