Skip to content

Commit 12ce5e8

Browse files
Graph consolidation prompt updated (#1013)
1 parent d29f688 commit 12ce5e8

File tree

3 files changed

+109
-69
lines changed

3 files changed

+109
-69
lines changed

Diff for: backend/src/graphDB_dataAccess.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -535,4 +535,30 @@ def update_node_relationship_count(self,document_name):
535535
"nodeCount" : nodeCount,
536536
"relationshipCount" : relationshipCount
537537
}
538-
return response
538+
return response
539+
540+
def get_nodelabels_relationships(self):
    """Fetch the node labels and relationship types used for schema consolidation.

    Two Cypher queries are run through ``self.execute_query``:

    * labels from ``db.labels()``, minus the labels excluded in the query,
      keeping only labels for which at least one node exists;
    * relationship types from ``db.relationshipTypes()``, minus the types
      excluded in the query.

    Returns:
        A ``(node_labels, relationship_types)`` pair of lists, each ordered
        by the query. If anything raises while querying, the error is
        printed and ``([], [])`` is returned, so callers that unpack two
        values keep working on the failure path as well.
    """
    node_query = """
    CALL db.labels() YIELD label
    WITH label
    WHERE NOT label IN ['Document', 'Chunk', '_Bloom_Perspective_', '__Community__', '__Entity__']
    CALL apoc.cypher.run("MATCH (n:`" + label + "`) RETURN count(n) AS count",{}) YIELD value
    WHERE value.count > 0
    RETURN label order by label
    """

    relation_query = """
    CALL db.relationshipTypes() yield relationshipType
    WHERE NOT relationshipType IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK','SIMILAR','IN_COMMUNITY','PARENT_COMMUNITY']
    return relationshipType order by relationshipType
    """

    try:
        node_result = self.execute_query(node_query)
        node_labels = [record["label"] for record in node_result]
        relationship_result = self.execute_query(relation_query)
        relationship_types = [record["relationshipType"] for record in relationship_result]
        return node_labels, relationship_types
    except Exception as e:
        print(f"Error in getting node labels/relationship types from db: {e}")
        # Keep the same two-element shape as the success path; a bare []
        # would make `a, b = get_nodelabels_relationships()` raise ValueError.
        return [], []

Diff for: backend/src/post_processing.py

+27-48
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
from langchain_core.prompts import ChatPromptTemplate
99
from src.shared.constants import GRAPH_CLEANUP_PROMPT
1010
from src.llm import get_llm
11-
from src.main import get_labels_and_relationtypes
11+
from src.graphDB_dataAccess import graphDBdataAccess
12+
import time
13+
1214

1315
DROP_INDEX_QUERY = "DROP INDEX entities IF EXISTS;"
1416
LABELS_QUERY = "CALL db.labels()"
@@ -195,58 +197,35 @@ def update_embeddings(rows, graph):
195197
return graph.query(query,params={'rows':rows})
196198

197199
def graph_schema_consolidation(graph):
198-
nodes_and_relations = get_labels_and_relationtypes(graph)
199-
logging.info(f"nodes_and_relations in existing graph : {nodes_and_relations}")
200-
node_labels = []
201-
relation_labels = []
202-
203-
node_labels.extend(nodes_and_relations[0]['labels'])
204-
relation_labels.extend(nodes_and_relations[0]['relationshipTypes'])
205-
206-
exclude_node_labels = ['Document','Chunk','_Bloom_Perspective_', '__Community__', '__Entity__']
207-
exclude_relationship_labels = ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK','SIMILAR','IN_COMMUNITY','PARENT_COMMUNITY']
208-
209-
node_labels = [i for i in node_labels if i not in exclude_node_labels ]
210-
relation_labels = [i for i in relation_labels if i not in exclude_relationship_labels]
211-
200+
graphDb_data_Access = graphDBdataAccess(graph)
201+
node_labels,relation_labels = graphDb_data_Access.get_nodelabels_relationships()
212202
parser = JsonOutputParser()
213-
prompt = ChatPromptTemplate(messages=[("system",GRAPH_CLEANUP_PROMPT),("human", "{input}")],
214-
partial_variables={"format_instructions": parser.get_format_instructions()})
215-
216-
graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL",'openai_gpt_4o')
203+
prompt = ChatPromptTemplate(
204+
messages=[("system", GRAPH_CLEANUP_PROMPT), ("human", "{input}")],
205+
partial_variables={"format_instructions": parser.get_format_instructions()}
206+
)
207+
graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL", 'openai_gpt_4o')
217208
llm, _ = get_llm(graph_cleanup_model)
218209
chain = prompt | llm | parser
219-
nodes_dict = chain.invoke({'input':node_labels})
220-
relation_dict = chain.invoke({'input':relation_labels})
221-
222-
node_match = {}
223-
relation_match = {}
224-
for new_label , values in nodes_dict.items() :
225-
for old_label in values:
226-
if new_label != old_label:
227-
node_match[old_label]=new_label
228-
229-
for new_label , values in relation_dict.items() :
230-
for old_label in values:
231-
if new_label != old_label:
232-
relation_match[old_label]=new_label
233-
234-
logging.info(f"updated node labels : {node_match}")
235-
logging.info(f"Reduced node counts from {len(node_labels)} to {len(node_match.items())}")
236-
logging.info(f"updated relationship labels : {relation_match}")
237-
logging.info(f"Reduced relationship counts from {len(relation_labels)} to {len(relation_match.items())}")
238210

239-
# Update node labels in graph
240-
for old_label, new_label in node_match.items():
241-
query = f"""
242-
MATCH (n:`{old_label}`)
243-
SET n:`{new_label}`
244-
REMOVE n:`{old_label}`
245-
"""
246-
graph.query(query)
211+
nodes_relations_input = {'nodes': node_labels, 'relationships': relation_labels}
212+
mappings = chain.invoke({'input': nodes_relations_input})
213+
node_mapping = {old: new for new, old_list in mappings['nodes'].items() for old in old_list if new != old}
214+
relation_mapping = {old: new for new, old_list in mappings['relationships'].items() for old in old_list if new != old}
215+
216+
logging.info(f"Node Labels: Total = {len(node_labels)}, Reduced to = {len(set(node_mapping.values()))} (from {len(node_mapping)})")
217+
logging.info(f"Relationship Types: Total = {len(relation_labels)}, Reduced to = {len(set(relation_mapping.values()))} (from {len(relation_mapping)})")
218+
219+
if node_mapping:
220+
for old_label, new_label in node_mapping.items():
221+
query = f"""
222+
MATCH (n:`{old_label}`)
223+
SET n:`{new_label}`
224+
REMOVE n:`{old_label}`
225+
"""
226+
graph.query(query)
247227

248-
# Update relation types in graph
249-
for old_label, new_label in relation_match.items():
228+
for old_label, new_label in relation_mapping.items():
250229
query = f"""
251230
MATCH (n)-[r:`{old_label}`]->(m)
252231
CREATE (n)-[r2:`{new_label}`]->(m)

Diff for: backend/src/shared/constants.py

+55-20
Original file line numberDiff line numberDiff line change
@@ -831,27 +831,62 @@
831831
DELETE_ENTITIES_AND_START_FROM_BEGINNING = "delete_entities_and_start_from_beginning"
832832
START_FROM_LAST_PROCESSED_POSITION = "start_from_last_processed_position"
833833

834-
GRAPH_CLEANUP_PROMPT = """Please consolidate the following list of types into a smaller set of more general, semantically
835-
related types. The consolidated types must be drawn from the original list; do not introduce new types.
836-
Return a JSON object representing the mapping of original types to consolidated types. Every key is the consolidated type
837-
and value is list of the original types that were merged into the consolidated type. Prioritize using the most generic and
838-
repeated term when merging. If a type doesn't merge with any other type, it should still be included in the output,
839-
mapped to itself.
840-
841-
**Input:** A list of strings representing the types to be consolidated. These types may represent either node
842-
labels or relationship labels Your algorithm should do appropriate groupings based on semantic similarity.
843-
844-
Example 1:
845-
Input:
846-
[ "Person", "Human", "People", "Company", "Organization", "Product"]
847-
Output :
848-
[Person": ["Person", "Human", "People"], Organization": ["Company", "Organization"], Product": ["Product"]]
849-
850-
Example 2:
851-
Input :
852-
["CREATED_FOR", "CREATED_TO", "CREATED", "PLACE", "LOCATION", "VENUE"]
834+
GRAPH_CLEANUP_PROMPT = """
835+
You are tasked with organizing a list of types into semantic categories based on their meanings, including synonyms or morphological similarities. The input will include two separate lists: one for **Node Labels** and one for **Relationship Types**. Follow these rules strictly:
836+
### 1. Input Format
837+
The input will include two keys:
838+
- `nodes`: A list of node labels.
839+
- `relationships`: A list of relationship types.
840+
### 2. Grouping Rules
841+
- Group similar items into **semantic categories** based on their meaning or morphological similarities.
842+
- The name of each category must be chosen from the types in the input list (node labels or relationship types). **Do not create or infer new names for categories**.
843+
- Items that cannot be grouped must remain in their own category.
844+
### 3. Naming Rules
845+
- The category name must reflect the grouped items and must be an existing type in the input list.
846+
- Use a widely applicable type as the category name.
847+
- **Do not introduce new names or types** under any circumstances.
848+
### 4. Output Rules
849+
- Return the output as a JSON object with two keys:
850+
- `nodes`: A dictionary where each key represents a category name for nodes, and its value is a list of original node labels in that category.
851+
- `relationships`: A dictionary where each key represents a category name for relationships, and its value is a list of original relationship types in that category.
852+
- Every key and value must come from the provided input lists.
853+
### 5. Examples
854+
#### Example 1:
855+
Input:
856+
{{
857+
"nodes": ["Person", "Human", "People", "Company", "Organization", "Product"],
858+
"relationships": ["CREATED_FOR", "CREATED_TO", "CREATED", "PUBLISHED","PUBLISHED_BY", "PUBLISHED_IN", "PUBLISHED_ON"]
859+
}}
860+
Output in JSON:
861+
{{
862+
"nodes": {{
863+
"Person": ["Person", "Human", "People"],
864+
"Organization": ["Company", "Organization"],
865+
"Product": ["Product"]
866+
}},
867+
"relationships": {{
868+
"CREATED": ["CREATED_FOR", "CREATED_TO", "CREATED"],
869+
"PUBLISHED": ["PUBLISHED", "PUBLISHED_BY", "PUBLISHED_IN", "PUBLISHED_ON"]
870+
}}
871+
}}
872+
#### Example 2: Avoid redundant or incorrect grouping
873+
Input:
874+
{{
875+
"nodes": ["Process", "Process_Step", "Step", "Procedure", "Method", "Natural Process", "Step"],
876+
"relationships": ["USED_FOR", "USED_BY", "USED_WITH", "USED_IN"]
877+
}}
853878
Output:
854-
["CREATED": ["CREATED_FOR", "CREATED_TO", "CREATED"],"PLACE": ["PLACE", "LOCATION", "VENUE"]]
879+
{{
880+
"nodes": {{
881+
"Process": ["Process", "Process_Step", "Step", "Procedure", "Method", "Natural Process"]
882+
}},
883+
"relationships": {{
884+
"USED_FOR": ["USED_FOR", "USED_BY", "USED_WITH", "USED_IN"]
885+
}}
886+
}}
887+
### 6. Key Rule
888+
If any item cannot be grouped, it must remain in its own category using its original name. Do not repeat values or create incorrect mappings.
889+
Use these rules to group and name categories accurately without introducing errors or new types.
855890
"""
856891

857892
ADDITIONAL_INSTRUCTIONS = """Your goal is to identify and categorize entities while ensuring that specific data

0 commit comments

Comments
 (0)