def get_nodelabels_relationships(self):
    """
    Return the user-defined node labels and relationship types present in the graph.

    Infrastructure labels (Document, Chunk, Bloom/community artifacts) and
    infrastructure relationship types (PART_OF, NEXT_CHUNK, HAS_ENTITY, ...)
    are excluded. Node labels are only returned when at least one node with
    that label exists; both result lists are sorted alphabetically by the
    Cypher `ORDER BY`.

    Returns:
        tuple[list[str], list[str]]: (node_labels, relationship_types).
        On any query failure BOTH lists are empty — callers unpack two
        values, so a two-element tuple must be returned on every path.
    """
    # apoc.cypher.run is used to count nodes per label dynamically so that
    # labels with zero remaining nodes are filtered out of the result.
    node_query = """
    CALL db.labels() YIELD label
    WITH label
    WHERE NOT label IN ['Document', 'Chunk', '_Bloom_Perspective_', '__Community__', '__Entity__']
    CALL apoc.cypher.run("MATCH (n:`" + label + "`) RETURN count(n) AS count",{}) YIELD value
    WHERE value.count > 0
    RETURN label order by label
    """

    relation_query = """
    CALL db.relationshipTypes() yield relationshipType
    WHERE NOT relationshipType IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK','SIMILAR','IN_COMMUNITY','PARENT_COMMUNITY']
    return relationshipType order by relationshipType
    """

    try:
        node_result = self.execute_query(node_query)
        node_labels = [record["label"] for record in node_result]
        relationship_result = self.execute_query(relation_query)
        relationship_types = [record["relationshipType"] for record in relationship_result]
        return node_labels, relationship_types
    except Exception as e:
        print(f"Error in getting node labels/relationship types from db: {e}")
        # BUG FIX: the original returned a single empty list here, which made
        # callers that tuple-unpack the result (node_labels, relation_labels = ...)
        # crash with "not enough values to unpack" on any query failure.
        # NOTE(review): consider logging.error instead of print — left as-is
        # because the file's import block is not visible in this view.
        return [], []
LABELS_QUERY = "CALL db.labels()" @@ -195,58 +197,35 @@ def update_embeddings(rows, graph): return graph.query(query,params={'rows':rows}) def graph_schema_consolidation(graph): - nodes_and_relations = get_labels_and_relationtypes(graph) - logging.info(f"nodes_and_relations in existing graph : {nodes_and_relations}") - node_labels = [] - relation_labels = [] - - node_labels.extend(nodes_and_relations[0]['labels']) - relation_labels.extend(nodes_and_relations[0]['relationshipTypes']) - - exclude_node_labels = ['Document','Chunk','_Bloom_Perspective_', '__Community__', '__Entity__'] - exclude_relationship_labels = ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK','SIMILAR','IN_COMMUNITY','PARENT_COMMUNITY'] - - node_labels = [i for i in node_labels if i not in exclude_node_labels ] - relation_labels = [i for i in relation_labels if i not in exclude_relationship_labels] - + graphDb_data_Access = graphDBdataAccess(graph) + node_labels,relation_labels = graphDb_data_Access.get_nodelabels_relationships() parser = JsonOutputParser() - prompt = ChatPromptTemplate(messages=[("system",GRAPH_CLEANUP_PROMPT),("human", "{input}")], - partial_variables={"format_instructions": parser.get_format_instructions()}) - - graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL",'openai_gpt_4o') + prompt = ChatPromptTemplate( + messages=[("system", GRAPH_CLEANUP_PROMPT), ("human", "{input}")], + partial_variables={"format_instructions": parser.get_format_instructions()} + ) + graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL", 'openai_gpt_4o') llm, _ = get_llm(graph_cleanup_model) chain = prompt | llm | parser - nodes_dict = chain.invoke({'input':node_labels}) - relation_dict = chain.invoke({'input':relation_labels}) - - node_match = {} - relation_match = {} - for new_label , values in nodes_dict.items() : - for old_label in values: - if new_label != old_label: - node_match[old_label]=new_label - - for new_label , values in relation_dict.items() : - for 
old_label in values: - if new_label != old_label: - relation_match[old_label]=new_label - - logging.info(f"updated node labels : {node_match}") - logging.info(f"Reduced node counts from {len(node_labels)} to {len(node_match.items())}") - logging.info(f"updated relationship labels : {relation_match}") - logging.info(f"Reduced relationship counts from {len(relation_labels)} to {len(relation_match.items())}") - # Update node labels in graph - for old_label, new_label in node_match.items(): - query = f""" - MATCH (n:`{old_label}`) - SET n:`{new_label}` - REMOVE n:`{old_label}` - """ - graph.query(query) + nodes_relations_input = {'nodes': node_labels, 'relationships': relation_labels} + mappings = chain.invoke({'input': nodes_relations_input}) + node_mapping = {old: new for new, old_list in mappings['nodes'].items() for old in old_list if new != old} + relation_mapping = {old: new for new, old_list in mappings['relationships'].items() for old in old_list if new != old} + + logging.info(f"Node Labels: Total = {len(node_labels)}, Reduced to = {len(set(node_mapping.values()))} (from {len(node_mapping)})") + logging.info(f"Relationship Types: Total = {len(relation_labels)}, Reduced to = {len(set(relation_mapping.values()))} (from {len(relation_mapping)})") + + if node_mapping: + for old_label, new_label in node_mapping.items(): + query = f""" + MATCH (n:`{old_label}`) + SET n:`{new_label}` + REMOVE n:`{old_label}` + """ + graph.query(query) - # Update relation types in graph - for old_label, new_label in relation_match.items(): + for old_label, new_label in relation_mapping.items(): query = f""" MATCH (n)-[r:`{old_label}`]->(m) CREATE (n)-[r2:`{new_label}`]->(m) diff --git a/backend/src/shared/constants.py b/backend/src/shared/constants.py index 6a69d166d..eeb603245 100644 --- a/backend/src/shared/constants.py +++ b/backend/src/shared/constants.py @@ -831,27 +831,62 @@ DELETE_ENTITIES_AND_START_FROM_BEGINNING = "delete_entities_and_start_from_beginning" 
START_FROM_LAST_PROCESSED_POSITION = "start_from_last_processed_position" -GRAPH_CLEANUP_PROMPT = """Please consolidate the following list of types into a smaller set of more general, semantically -related types. The consolidated types must be drawn from the original list; do not introduce new types. -Return a JSON object representing the mapping of original types to consolidated types. Every key is the consolidated type -and value is list of the original types that were merged into the consolidated type. Prioritize using the most generic and -repeated term when merging. If a type doesn't merge with any other type, it should still be included in the output, -mapped to itself. - -**Input:** A list of strings representing the types to be consolidated. These types may represent either node -labels or relationship labels Your algorithm should do appropriate groupings based on semantic similarity. - -Example 1: -Input: -[ "Person", "Human", "People", "Company", "Organization", "Product"] -Output : -[Person": ["Person", "Human", "People"], Organization": ["Company", "Organization"], Product": ["Product"]] - -Example 2: -Input : -["CREATED_FOR", "CREATED_TO", "CREATED", "PLACE", "LOCATION", "VENUE"] +GRAPH_CLEANUP_PROMPT = """ +You are tasked with organizing a list of types into semantic categories based on their meanings, including synonyms or morphological similarities. The input will include two separate lists: one for **Node Labels** and one for **Relationship Types**. Follow these rules strictly: +### 1. Input Format +The input will include two keys: +- `nodes`: A list of node labels. +- `relationships`: A list of relationship types. +### 2. Grouping Rules +- Group similar items into **semantic categories** based on their meaning or morphological similarities. +- The name of each category must be chosen from the types in the input list (node labels or relationship types). **Do not create or infer new names for categories**. 
+- Items that cannot be grouped must remain in their own category. +### 3. Naming Rules +- The category name must reflect the grouped items and must be an existing type in the input list. +- Use a widely applicable type as the category name. +- **Do not introduce new names or types** under any circumstances. +### 4. Output Rules +- Return the output as a JSON object with two keys: + - `nodes`: A dictionary where each key represents a category name for nodes, and its value is a list of original node labels in that category. + - `relationships`: A dictionary where each key represents a category name for relationships, and its value is a list of original relationship types in that category. +- Every key and value must come from the provided input lists. +### 5. Examples +#### Example 1: +Input: +{{ + "nodes": ["Person", "Human", "People", "Company", "Organization", "Product"], + "relationships": ["CREATED_FOR", "CREATED_TO", "CREATED", "PUBLISHED","PUBLISHED_BY", "PUBLISHED_IN", "PUBLISHED_ON"] +}} +Output in JSON: +{{ + "nodes": {{ + "Person": ["Person", "Human", "People"], + "Organization": ["Company", "Organization"], + "Product": ["Product"] + }}, + "relationships": {{ + "CREATED": ["CREATED_FOR", "CREATED_TO", "CREATED"], + "PUBLISHED": ["PUBLISHED_BY", "PUBLISHED_IN", "PUBLISHED_ON"] + }} +}} +#### Example 2: Avoid redundant or incorrect grouping +Input: +{{ + "nodes": ["Process", "Process_Step", "Step", "Procedure", "Method", "Natural Process", "Step"], + "relationships": ["USED_FOR", "USED_BY", "USED_WITH", "USED_IN"] +}} Output: -["CREATED": ["CREATED_FOR", "CREATED_TO", "CREATED"],"PLACE": ["PLACE", "LOCATION", "VENUE"]] +{{ + "nodes": {{ + "Process": ["Process", "Process_Step", "Step", "Procedure", "Method", "Natural Process"] + }}, + "relationships": {{ + "USED": ["USED_FOR", "USED_BY", "USED_WITH", "USED_IN"] + }} +}} +### 6. Key Rule +If any item cannot be grouped, it must remain in its own category using its original name. 
Do not repeat values or create incorrect mappings. +Use these rules to group and name categories accurately without introducing errors or new types. """ ADDITIONAL_INSTRUCTIONS = """Your goal is to identify and categorize entities while ensuring that specific data