Skip to content

Graph consolidation changes #1013

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion backend/src/graphDB_dataAccess.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,4 +535,30 @@ def update_node_relationship_count(self,document_name):
"nodeCount" : nodeCount,
"relationshipCount" : relationshipCount
}
return response
return response

def get_nodelabels_relationships(self):
    """Fetch user-defined node labels and relationship types from the graph.

    Internal/system labels (Document, Chunk, community and Bloom markers) and
    the pipeline's own relationship types are excluded; only labels that have
    at least one node are kept.

    Returns:
        tuple[list[str], list[str]]: (node_labels, relationship_types), each
        sorted alphabetically by the query's ORDER BY. On any failure an
        empty pair ([], []) is returned so callers can still unpack safely.
    """
    # Counts per label via apoc.cypher.run so labels with zero nodes
    # (e.g. left behind after deletes) are filtered out.
    node_query = """
        CALL db.labels() YIELD label
        WITH label
        WHERE NOT label IN ['Document', 'Chunk', '_Bloom_Perspective_', '__Community__', '__Entity__']
        CALL apoc.cypher.run("MATCH (n:`" + label + "`) RETURN count(n) AS count",{}) YIELD value
        WHERE value.count > 0
        RETURN label order by label
        """

    relation_query = """
        CALL db.relationshipTypes() yield relationshipType
        WHERE NOT relationshipType IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK','SIMILAR','IN_COMMUNITY','PARENT_COMMUNITY']
        return relationshipType order by relationshipType
        """

    try:
        node_result = self.execute_query(node_query)
        node_labels = [record["label"] for record in node_result]
        relationship_result = self.execute_query(relation_query)
        relationship_types = [record["relationshipType"] for record in relationship_result]
        return node_labels, relationship_types
    except Exception as e:
        # Bug fix: previously returned a single empty list here, which broke
        # callers that unpack two values (`labels, rels = ...`). Return an
        # empty pair so the error path matches the success path's shape.
        print(f"Error in getting node labels/relationship types from db: {e}")
        return [], []
75 changes: 27 additions & 48 deletions backend/src/post_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
from langchain_core.prompts import ChatPromptTemplate
from src.shared.constants import GRAPH_CLEANUP_PROMPT
from src.llm import get_llm
from src.main import get_labels_and_relationtypes
from src.graphDB_dataAccess import graphDBdataAccess
import time


# Cypher used by the post-processing index rebuild.
DROP_INDEX_QUERY = "DROP INDEX entities IF EXISTS;"  # drops the 'entities' index if it exists (no-op otherwise)
LABELS_QUERY = "CALL db.labels()"  # lists every node label currently in the database
Expand Down Expand Up @@ -195,58 +197,35 @@ def update_embeddings(rows, graph):
return graph.query(query,params={'rows':rows})

def graph_schema_consolidation(graph):
nodes_and_relations = get_labels_and_relationtypes(graph)
logging.info(f"nodes_and_relations in existing graph : {nodes_and_relations}")
node_labels = []
relation_labels = []

node_labels.extend(nodes_and_relations[0]['labels'])
relation_labels.extend(nodes_and_relations[0]['relationshipTypes'])

exclude_node_labels = ['Document','Chunk','_Bloom_Perspective_', '__Community__', '__Entity__']
exclude_relationship_labels = ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK','SIMILAR','IN_COMMUNITY','PARENT_COMMUNITY']

node_labels = [i for i in node_labels if i not in exclude_node_labels ]
relation_labels = [i for i in relation_labels if i not in exclude_relationship_labels]

graphDb_data_Access = graphDBdataAccess(graph)
node_labels,relation_labels = graphDb_data_Access.get_nodelabels_relationships()
parser = JsonOutputParser()
prompt = ChatPromptTemplate(messages=[("system",GRAPH_CLEANUP_PROMPT),("human", "{input}")],
partial_variables={"format_instructions": parser.get_format_instructions()})

graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL",'openai_gpt_4o')
prompt = ChatPromptTemplate(
messages=[("system", GRAPH_CLEANUP_PROMPT), ("human", "{input}")],
partial_variables={"format_instructions": parser.get_format_instructions()}
)
graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL", 'openai_gpt_4o')
llm, _ = get_llm(graph_cleanup_model)
chain = prompt | llm | parser
nodes_dict = chain.invoke({'input':node_labels})
relation_dict = chain.invoke({'input':relation_labels})

node_match = {}
relation_match = {}
for new_label , values in nodes_dict.items() :
for old_label in values:
if new_label != old_label:
node_match[old_label]=new_label

for new_label , values in relation_dict.items() :
for old_label in values:
if new_label != old_label:
relation_match[old_label]=new_label

logging.info(f"updated node labels : {node_match}")
logging.info(f"Reduced node counts from {len(node_labels)} to {len(node_match.items())}")
logging.info(f"updated relationship labels : {relation_match}")
logging.info(f"Reduced relationship counts from {len(relation_labels)} to {len(relation_match.items())}")

# Update node labels in graph
for old_label, new_label in node_match.items():
query = f"""
MATCH (n:`{old_label}`)
SET n:`{new_label}`
REMOVE n:`{old_label}`
"""
graph.query(query)
nodes_relations_input = {'nodes': node_labels, 'relationships': relation_labels}
mappings = chain.invoke({'input': nodes_relations_input})
node_mapping = {old: new for new, old_list in mappings['nodes'].items() for old in old_list if new != old}
relation_mapping = {old: new for new, old_list in mappings['relationships'].items() for old in old_list if new != old}

logging.info(f"Node Labels: Total = {len(node_labels)}, Reduced to = {len(set(node_mapping.values()))} (from {len(node_mapping)})")
logging.info(f"Relationship Types: Total = {len(relation_labels)}, Reduced to = {len(set(relation_mapping.values()))} (from {len(relation_mapping)})")

if node_mapping:
for old_label, new_label in node_mapping.items():
query = f"""
MATCH (n:`{old_label}`)
SET n:`{new_label}`
REMOVE n:`{old_label}`
"""
graph.query(query)

# Update relation types in graph
for old_label, new_label in relation_match.items():
for old_label, new_label in relation_mapping.items():
query = f"""
MATCH (n)-[r:`{old_label}`]->(m)
CREATE (n)-[r2:`{new_label}`]->(m)
Expand Down
75 changes: 55 additions & 20 deletions backend/src/shared/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,27 +831,62 @@
# Reprocessing-mode identifiers; the string values are an external contract
# (presumably matched against an API/request parameter — verify callers
# before renaming).
DELETE_ENTITIES_AND_START_FROM_BEGINNING = "delete_entities_and_start_from_beginning"  # discard prior extraction, redo from scratch
START_FROM_LAST_PROCESSED_POSITION = "start_from_last_processed_position"  # resume where processing previously stopped

# System prompt for the LLM-driven schema-consolidation step: the model maps
# similar node labels / relationship types onto one canonical existing name.
# NOTE: literal braces are doubled ({{ }}) because this string is rendered by
# a LangChain ChatPromptTemplate, which treats single braces as variables.
# Fix: removed a duplicated earlier version of this prompt and a stray line
# of its old example output that had been left inside this string, which
# contradicted Example 2 and corrupted the instructions sent to the model.
GRAPH_CLEANUP_PROMPT = """
You are tasked with organizing a list of types into semantic categories based on their meanings, including synonyms or morphological similarities. The input will include two separate lists: one for **Node Labels** and one for **Relationship Types**. Follow these rules strictly:
### 1. Input Format
The input will include two keys:
- `nodes`: A list of node labels.
- `relationships`: A list of relationship types.
### 2. Grouping Rules
- Group similar items into **semantic categories** based on their meaning or morphological similarities.
- The name of each category must be chosen from the types in the input list (node labels or relationship types). **Do not create or infer new names for categories**.
- Items that cannot be grouped must remain in their own category.
### 3. Naming Rules
- The category name must reflect the grouped items and must be an existing type in the input list.
- Use a widely applicable type as the category name.
- **Do not introduce new names or types** under any circumstances.
### 4. Output Rules
- Return the output as a JSON object with two keys:
  - `nodes`: A dictionary where each key represents a category name for nodes, and its value is a list of original node labels in that category.
  - `relationships`: A dictionary where each key represents a category name for relationships, and its value is a list of original relationship types in that category.
- Every key and value must come from the provided input lists.
### 5. Examples
#### Example 1:
Input:
{{
  "nodes": ["Person", "Human", "People", "Company", "Organization", "Product"],
  "relationships": ["CREATED_FOR", "CREATED_TO", "CREATED", "PUBLISHED","PUBLISHED_BY", "PUBLISHED_IN", "PUBLISHED_ON"]
}}
Output in JSON:
{{
  "nodes": {{
    "Person": ["Person", "Human", "People"],
    "Organization": ["Company", "Organization"],
    "Product": ["Product"]
  }},
  "relationships": {{
    "CREATED": ["CREATED_FOR", "CREATED_TO", "CREATED"],
    "PUBLISHED": ["PUBLISHED_BY", "PUBLISHED_IN", "PUBLISHED_ON"]
  }}
}}
#### Example 2: Avoid redundant or incorrect grouping
Input:
{{
  "nodes": ["Process", "Process_Step", "Step", "Procedure", "Method", "Natural Process", "Step"],
  "relationships": ["USED_FOR", "USED_BY", "USED_WITH", "USED_IN"]
}}
Output:
{{
  "nodes": {{
    "Process": ["Process", "Process_Step", "Step", "Procedure", "Method", "Natural Process"]
  }},
  "relationships": {{
    "USED": ["USED_FOR", "USED_BY", "USED_WITH", "USED_IN"]
  }}
}}
### 6. Key Rule
If any item cannot be grouped, it must remain in its own category using its original name. Do not repeat values or create incorrect mappings.
Use these rules to group and name categories accurately without introducing errors or new types.
"""

ADDITIONAL_INSTRUCTIONS = """Your goal is to identify and categorize entities while ensuring that specific data
Expand Down
Loading