Graph consolidation prompt updated (#1013)

kaustubh-darekar · web-flow · commit 12ce5e8d3b6b · 2025-01-20T20:27:32.000+05:30
diff --git a/backend/src/graphDB_dataAccess.py b/backend/src/graphDB_dataAccess.py
@@ -535,4 +535,30 @@ def update_node_relationship_count(self,document_name):
                     "nodeCount" : nodeCount,
                     "relationshipCount" : relationshipCount
                     }
-        return response
+        return response
+    
+    def get_nodelabels_relationships(self):
+        """Return ``(node_labels, relationship_types)`` minus the names excluded in the queries; ``([], [])`` on failure."""
+        node_query = """
+                    CALL db.labels() YIELD label
+                    WITH label
+                    WHERE NOT label IN ['Document', 'Chunk', '_Bloom_Perspective_', '__Community__', '__Entity__']
+                    CALL apoc.cypher.run("MATCH (n:`" + label + "`) RETURN count(n) AS count",{}) YIELD value
+                    WHERE value.count > 0
+                    RETURN label order by label
+                    """
+        relation_query = """
+                CALL db.relationshipTypes() yield relationshipType
+                WHERE NOT relationshipType  IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK','SIMILAR','IN_COMMUNITY','PARENT_COMMUNITY'] 
+                return relationshipType order by relationshipType
+                """
+        try:
+            node_result = self.execute_query(node_query)
+            node_labels = [record["label"] for record in node_result]
+            relationship_result = self.execute_query(relation_query)
+            relationship_types = [record["relationshipType"] for record in relationship_result]
+            return node_labels,relationship_types
+        except Exception as e:
+            print(f"Error in getting node labels/relationship types from db: {e}")
+            # Callers unpack two values, so the failure result must be a pair too (a bare [] raises ValueError).
+            return [], []
diff --git a/backend/src/post_processing.py b/backend/src/post_processing.py
@@ -8,7 +8,9 @@
 from langchain_core.prompts import ChatPromptTemplate
 from src.shared.constants import GRAPH_CLEANUP_PROMPT
 from src.llm import get_llm
-from src.main import get_labels_and_relationtypes
+from src.graphDB_dataAccess import graphDBdataAccess
+import time 
+
 
 DROP_INDEX_QUERY = "DROP INDEX entities IF EXISTS;"
 LABELS_QUERY = "CALL db.labels()"
@@ -195,58 +197,35 @@ def update_embeddings(rows, graph):
     return graph.query(query,params={'rows':rows})          
 
 def graph_schema_consolidation(graph):
-    nodes_and_relations = get_labels_and_relationtypes(graph)
-    logging.info(f"nodes_and_relations in existing graph : {nodes_and_relations}")
-    node_labels = []
-    relation_labels = []
-    
-    node_labels.extend(nodes_and_relations[0]['labels'])
-    relation_labels.extend(nodes_and_relations[0]['relationshipTypes'])
-    
-    exclude_node_labels = ['Document','Chunk','_Bloom_Perspective_', '__Community__', '__Entity__']
-    exclude_relationship_labels = ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK','SIMILAR','IN_COMMUNITY','PARENT_COMMUNITY']
-
-    node_labels = [i for i in node_labels if i not in exclude_node_labels ]
-    relation_labels = [i for i in relation_labels if i not in exclude_relationship_labels]
-
+    graphDb_data_Access = graphDBdataAccess(graph)
+    node_labels,relation_labels = graphDb_data_Access.get_nodelabels_relationships()
     parser = JsonOutputParser()
-    prompt = ChatPromptTemplate(messages=[("system",GRAPH_CLEANUP_PROMPT),("human", "{input}")],
-                                            partial_variables={"format_instructions": parser.get_format_instructions()})
-    
-    graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL",'openai_gpt_4o')
+    prompt = ChatPromptTemplate(
+        messages=[("system", GRAPH_CLEANUP_PROMPT), ("human", "{input}")],
+        partial_variables={"format_instructions": parser.get_format_instructions()}
+    )
+    graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL", 'openai_gpt_4o')
     llm, _ = get_llm(graph_cleanup_model)
     chain = prompt | llm | parser
-    nodes_dict = chain.invoke({'input':node_labels})
-    relation_dict = chain.invoke({'input':relation_labels})  
-    
-    node_match = {}
-    relation_match = {}
-    for new_label , values in nodes_dict.items() :
-        for old_label in values:
-            if new_label != old_label:
-                node_match[old_label]=new_label
-            
-    for new_label , values in relation_dict.items() :
-        for old_label in values:
-            if new_label != old_label:
-                relation_match[old_label]=new_label 
-
-    logging.info(f"updated node labels : {node_match}")
-    logging.info(f"Reduced node counts from {len(node_labels)} to {len(node_match.items())}")   
-    logging.info(f"updated relationship labels : {relation_match}") 
-    logging.info(f"Reduced relationship counts from {len(relation_labels)} to {len(relation_match.items())}")
 
-    # Update node labels in graph
-    for old_label, new_label in node_match.items():
-        query = f"""
-                MATCH (n:`{old_label}`)
-                SET n:`{new_label}`
-                REMOVE n:`{old_label}`
-                """
-        graph.query(query)
+    nodes_relations_input = {'nodes': node_labels, 'relationships': relation_labels}
+    mappings = chain.invoke({'input': nodes_relations_input})
+    node_mapping = {old: new for new, old_list in mappings['nodes'].items() for old in old_list if new != old}
+    relation_mapping = {old: new for new, old_list in mappings['relationships'].items() for old in old_list if new != old}
+
+    logging.info(f"Node Labels: Total = {len(node_labels)}, Reduced to = {len(set(node_mapping.values()))} (from {len(node_mapping)})")
+    logging.info(f"Relationship Types: Total = {len(relation_labels)}, Reduced to = {len(set(relation_mapping.values()))} (from {len(relation_mapping)})")
+
+    if node_mapping:
+        for old_label, new_label in node_mapping.items():
+            query = f"""
+                    MATCH (n:`{old_label}`)
+                    SET n:`{new_label}`
+                    REMOVE n:`{old_label}`
+                    """
+            graph.query(query)
     
-    # Update relation types in graph
-    for old_label, new_label in relation_match.items():
+    for old_label, new_label in relation_mapping.items():
         query = f"""
                 MATCH (n)-[r:`{old_label}`]->(m)
                 CREATE (n)-[r2:`{new_label}`]->(m)
diff --git a/backend/src/shared/constants.py b/backend/src/shared/constants.py
@@ -831,27 +831,62 @@
 DELETE_ENTITIES_AND_START_FROM_BEGINNING = "delete_entities_and_start_from_beginning"
 START_FROM_LAST_PROCESSED_POSITION = "start_from_last_processed_position"                                                    
 
-GRAPH_CLEANUP_PROMPT = """Please consolidate the following list of types into a smaller set of more general, semantically 
-related types. The consolidated types must be drawn from the original list; do not introduce new types.  
-Return a JSON object representing the mapping of original types to consolidated types. Every key is the consolidated type
-and value is list of the original types that were merged into the consolidated type. Prioritize using the most generic and 
-repeated term when merging. If a type doesn't merge with any other type, it should still be included in the output, 
-mapped to itself.
-
-**Input:** A list of strings representing the types to be consolidated. These types may represent either node 
-labels or relationship labels Your algorithm should do appropriate groupings based on semantic similarity.
-
-Example 1:
-Input: 
-[ "Person", "Human", "People", "Company", "Organization", "Product"]
-Output :
-[Person": ["Person", "Human", "People"], Organization": ["Company", "Organization"], Product": ["Product"]]
-
-Example 2:
-Input :
-["CREATED_FOR", "CREATED_TO", "CREATED", "PLACE", "LOCATION", "VENUE"]
+GRAPH_CLEANUP_PROMPT = """
+You are tasked with organizing a list of types into semantic categories based on their meanings, including synonyms or morphological similarities. The input will include two separate lists: one for **Node Labels** and one for **Relationship Types**. Follow these rules strictly:
+### 1. Input Format
+The input will include two keys:
+- `nodes`: A list of node labels.
+- `relationships`: A list of relationship types.
+### 2. Grouping Rules
+- Group similar items into **semantic categories** based on their meaning or morphological similarities.
+- The name of each category must be chosen from the types in the input list (node labels or relationship types). **Do not create or infer new names for categories**.
+- Items that cannot be grouped must remain in their own category.
+### 3. Naming Rules
+- The category name must reflect the grouped items and must be an existing type in the input list.
+- Use a widely applicable type as the category name.
+- **Do not introduce new names or types** under any circumstances.
+### 4. Output Rules
+- Return the output as a JSON object with two keys:
+ - `nodes`: A dictionary where each key represents a category name for nodes, and its value is a list of original node labels in that category.
+ - `relationships`: A dictionary where each key represents a category name for relationships, and its value is a list of original relationship types in that category.
+- Every key and value must come from the provided input lists.
+### 5. Examples
+#### Example 1:
+Input:
+{{
+ "nodes": ["Person", "Human", "People", "Company", "Organization", "Product"],
+ "relationships": ["CREATED_FOR", "CREATED_TO", "CREATED", "PUBLISHED","PUBLISHED_BY", "PUBLISHED_IN", "PUBLISHED_ON"]
+}}
+Output in JSON:
+{{
+ "nodes": {{
+   "Person": ["Person", "Human", "People"],
+   "Organization": ["Company", "Organization"],
+   "Product": ["Product"]
+ }},
+ "relationships": {{
+   "CREATED": ["CREATED_FOR", "CREATED_TO", "CREATED"],
+   "PUBLISHED": ["PUBLISHED", "PUBLISHED_BY", "PUBLISHED_IN", "PUBLISHED_ON"]
+ }}
+}}
+#### Example 2: Avoid redundant or incorrect grouping
+Input:
+{{
+ "nodes": ["Process", "Process_Step", "Step", "Procedure", "Method", "Natural Process", "Step"],
+ "relationships": ["USED", "USED_FOR", "USED_BY", "USED_WITH", "USED_IN"]
+}}
 Output:
-["CREATED": ["CREATED_FOR", "CREATED_TO", "CREATED"],"PLACE": ["PLACE", "LOCATION", "VENUE"]]
+{{
+ "nodes": {{
+   "Process": ["Process", "Process_Step", "Step", "Procedure", "Method", "Natural Process"]
+ }},
+ "relationships": {{
+   "USED": ["USED", "USED_FOR", "USED_BY", "USED_WITH", "USED_IN"]
+ }}
+}}
+### 6. Key Rule
+If any item cannot be grouped, it must remain in its own category using its original name. Do not repeat values or create incorrect mappings.
+Use these rules to group and name categories accurately without introducing errors or new types.
+"""  # NOTE: every example category name must also appear in that example's input list, per rules 2-4 above.
 
 ADDITIONAL_INSTRUCTIONS = """Your goal is to identify and categorize entities while ensuring that specific data