Skip to content

Commit 01fcb08

Browse files
prakriti-solankey and kartikpersistent
authored and committed
__ changes (#656)
1 parent a1262a2 commit 01fcb08

File tree

13 files changed

+108
-122
lines changed

13 files changed

+108
-122
lines changed

backend/src/chunkid_entities.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
from src.graph_query import *
44

55
CHUNK_QUERY = """
6-
match (chunk:__Chunk__) where chunk.id IN $chunksIds
6+
match (chunk:Chunk) where chunk.id IN $chunksIds
77
8-
MATCH (chunk)-[:__PART_OF__]->(d:__Document__)
8+
MATCH (chunk)-[:PART_OF]->(d:Document)
99
CALL {WITH chunk
10-
MATCH (chunk)-[:__HAS_ENTITY__]->(e)
11-
MATCH path=(e)(()-[rels:!__HAS_ENTITY__&!__PART_OF__]-()){0,2}(:!__Chunk__&!__Document__)
10+
MATCH (chunk)-[:HAS_ENTITY]->(e)
11+
MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk &! Document)
1212
UNWIND rels as r
1313
RETURN collect(distinct r) as rels
1414
}

backend/src/graphDB_dataAccess.py

+22-22
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def update_exception_db(self, file_name, exp_msg):
2020
is_cancelled_status = result[0]['is_cancelled']
2121
if bool(is_cancelled_status) == True:
2222
job_status = 'Cancelled'
23-
self.graph.query("""MERGE(d:__Document__ {fileName :$fName}) SET d.status = $status, d.errorMessage = $error_msg""",
23+
self.graph.query("""MERGE(d:Document {fileName :$fName}) SET d.status = $status, d.errorMessage = $error_msg""",
2424
{"fName":file_name, "status":job_status, "error_msg":exp_msg})
2525
except Exception as e:
2626
error_message = str(e)
@@ -31,7 +31,7 @@ def create_source_node(self, obj_source_node:sourceNode):
3131
try:
3232
job_status = "New"
3333
logging.info("creating source node if does not exist")
34-
self.graph.query("""MERGE(d:__Document__ {fileName :$fn}) SET d.fileSize = $fs, d.fileType = $ft ,
34+
self.graph.query("""MERGE(d:Document {fileName :$fn}) SET d.fileSize = $fs, d.fileType = $ft ,
3535
d.status = $st, d.url = $url, d.awsAccessKeyId = $awsacc_key_id,
3636
d.fileSource = $f_source, d.createdAt = $c_at, d.updatedAt = $u_at,
3737
d.processingTime = $pt, d.errorMessage = $e_message, d.nodeCount= $n_count,
@@ -95,7 +95,7 @@ def update_source_node(self, obj_source_node:sourceNode):
9595
param= {"props":params}
9696

9797
print(f'Base Param value 1 : {param}')
98-
query = "MERGE(d:__Document__ {fileName :$props.fileName}) SET d += $props"
98+
query = "MERGE(d:Document {fileName :$props.fileName}) SET d += $props"
9999
logging.info("Update source node properties")
100100
self.graph.query(query,param)
101101
except Exception as e:
@@ -117,7 +117,7 @@ def get_source_list(self):
117117
sorting the list by the last updated date.
118118
"""
119119
logging.info("Get existing files list from graph")
120-
query = "MATCH(d:__Document__) WHERE d.fileName IS NOT NULL RETURN d ORDER BY d.updatedAt DESC"
120+
query = "MATCH(d:Document) WHERE d.fileName IS NOT NULL RETURN d ORDER BY d.updatedAt DESC"
121121
result = self.graph.query(query)
122122
list_of_json_objects = [entry['d'] for entry in result]
123123
return list_of_json_objects
@@ -131,10 +131,10 @@ def update_KNN_graph(self):
131131
knn_min_score = os.environ.get('KNN_MIN_SCORE')
132132
if len(index) > 0:
133133
logging.info('update KNN graph')
134-
self.graph.query("""MATCH (c:__Chunk__)
135-
WHERE c.embedding IS NOT NULL AND count { (c)-[:__SIMILAR__]-() } < 5
134+
self.graph.query("""MATCH (c:Chunk)
135+
WHERE c.embedding IS NOT NULL AND count { (c)-[:SIMILAR]-() } < 5
136136
CALL db.index.vector.queryNodes('vector', 6, c.embedding) yield node, score
137-
WHERE node <> c and score >= $score MERGE (c)-[rel:__SIMILAR__]-(node) SET rel.score = score
137+
WHERE node <> c and score >= $score MERGE (c)-[rel:SIMILAR]-(node) SET rel.score = score
138138
""",
139139
{"score":float(knn_min_score)}
140140
)
@@ -174,7 +174,7 @@ def execute_query(self, query, param=None):
174174

175175
def get_current_status_document_node(self, file_name):
176176
query = """
177-
MATCH(d:__Document__ {fileName : $file_name}) RETURN d.status AS Status , d.processingTime AS processingTime,
177+
MATCH(d:Document {fileName : $file_name}) RETURN d.status AS Status , d.processingTime AS processingTime,
178178
d.nodeCount AS nodeCount, d.model as model, d.relationshipCount as relationshipCount,
179179
d.total_pages AS total_pages, d.total_chunks AS total_chunks , d.fileSize as fileSize,
180180
d.is_cancelled as is_cancelled, d.processed_chunk as processed_chunk, d.fileSource as fileSource
@@ -197,23 +197,23 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
197197
logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
198198
delete_uploaded_local_file(merged_file_path,file_name)
199199
query_to_delete_document="""
200-
MATCH (d:__Document__) where d.fileName in $filename_list and d.fileSource in $source_types_list
200+
MATCH (d:Document) where d.fileName in $filename_list and d.fileSource in $source_types_list
201201
with collect(d) as documents
202202
unwind documents as d
203-
optional match (d)<-[:__PART_OF__]-(c:__Chunk__)
203+
optional match (d)<-[:PART_OF]-(c:Chunk)
204204
detach delete c, d
205205
return count(*) as deletedChunks
206206
"""
207207
query_to_delete_document_and_entities="""
208-
MATCH (d:__Document__) where d.fileName in $filename_list and d.fileSource in $source_types_list
208+
MATCH (d:Document) where d.fileName in $filename_list and d.fileSource in $source_types_list
209209
with collect(d) as documents
210210
unwind documents as d
211-
optional match (d)<-[:__PART_OF__]-(c:__Chunk__)
211+
optional match (d)<-[:PART_OF]-(c:Chunk)
212212
// if delete-entities checkbox is set
213213
call { with c, documents
214-
match (c)-[:__HAS_ENTITY__]->(e)
214+
match (c)-[:HAS_ENTITY]->(e)
215215
// belongs to another document
216-
where not exists { (d2)<-[:__PART_OF__]-()-[:__HAS_ENTITY__]->(e) WHERE NOT d2 IN documents }
216+
where not exists { (d2)<-[:PART_OF]-()-[:HAS_ENTITY]->(e) WHERE NOT d2 IN documents }
217217
detach delete e
218218
return count(*) as entities
219219
}
@@ -231,17 +231,17 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
231231

232232
def list_unconnected_nodes(self):
233233
query = """
234-
MATCH (e:!__Chunk__&!__Document__)
235-
WHERE NOT exists { (e)--(:!__Chunk__&!__Document__) }
236-
OPTIONAL MATCH (doc:__Document__)<-[:__PART_OF__]-(c:__Chunk__)-[:__HAS_ENTITY__]->(e)
234+
MATCH (e:!Chunk&!Document)
235+
WHERE NOT exists { (e)--(:!Chunk&!Document) }
236+
OPTIONAL MATCH (doc:Document)<-[:PART_OF]-(c:Chunk)-[:HAS_ENTITY]->(e)
237237
RETURN e {.*, embedding:null, elementId:elementId(e), labels:labels(e)} as e,
238238
collect(distinct doc.fileName) as documents, count(distinct c) as chunkConnections
239239
ORDER BY e.id ASC
240240
LIMIT 100
241241
"""
242242
query_total_nodes = """
243-
MATCH (e:!__Chunk__&!__Document__)
244-
WHERE NOT exists { (e)--(:!__Chunk__&!__Document__) }
243+
MATCH (e:!Chunk&!Document)
244+
WHERE NOT exists { (e)--(:!Chunk&!Document) }
245245
RETURN count(*) as total
246246
"""
247247
nodes_list = self.execute_query(query)
@@ -261,7 +261,7 @@ def get_duplicate_nodes_list(self):
261261
score_value = float(os.environ.get('DUPLICATE_SCORE_VALUE'))
262262
text_distance = int(os.environ.get('DUPLICATE_TEXT_DISTANCE'))
263263
query_duplicate_nodes = """
264-
MATCH (n:!__Chunk__&!__Document__) with n
264+
MATCH (n:!Chunk&!Document) with n
265265
WHERE n.embedding is not null and n.id is not null // and size(n.id) > 3
266266
WITH n ORDER BY count {{ (n)--() }} DESC, size(n.id) DESC // updated
267267
WITH collect(n) as nodes
@@ -289,7 +289,7 @@ def get_duplicate_nodes_list(self):
289289
where none(other in all where other <> nodes and size(other) > size(nodes) and size(apoc.coll.subtract(nodes, other))=0)
290290
return head(nodes) as n, tail(nodes) as similar
291291
}}
292-
OPTIONAL MATCH (doc:__Document__)<-[:__PART_OF__]-(c:__Chunk__)-[:__HAS_ENTITY__]->(n)
292+
OPTIONAL MATCH (doc:Document)<-[:PART_OF]-(c:Chunk)-[:HAS_ENTITY]->(n)
293293
{return_statement}
294294
"""
295295
return_query_duplicate_nodes = """
@@ -335,7 +335,7 @@ def drop_create_vector_index(self, is_vector_index_recreate):
335335
if is_vector_index_recreate == 'true':
336336
self.graph.query("""drop index vector""")
337337

338-
self.graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:__Chunk__) on (c.embedding)
338+
self.graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:Chunk) on (c.embedding)
339339
OPTIONS {indexConfig: {
340340
`vector.dimensions`: $dimensions,
341341
`vector.similarity_function`: 'cosine'

backend/src/graph_query.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ def get_completed_documents(driver):
162162
"""
163163
Retrieves the names of all documents with the status 'Completed' from the database.
164164
"""
165-
docs_query = "MATCH(node:__Document__ {status:'Completed'}) RETURN node"
165+
docs_query = "MATCH(node:Document {status:'Completed'}) RETURN node"
166166

167167
try:
168168
logging.info("Executing query to retrieve completed documents.")

backend/src/main.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -508,15 +508,11 @@ def get_labels_and_relationtypes(graph):
508508
query = """
509509
RETURN collect {
510510
CALL db.labels() yield label
511-
WHERE NOT label IN ['_Bloom_Perspective_']
512-
AND NOT label STARTS WITH ('__')
513-
AND NOT label ENDS WITH('__')
511+
WHERE NOT label IN ['Chunk','_Bloom_Perspective_']
514512
return label order by label limit 100 } as labels,
515513
collect {
516514
CALL db.relationshipTypes() yield relationshipType as type
517-
WHERE NOT type IN ['_Bloom_Perspective_']
518-
AND NOT type STARTS WITH ('__')
519-
AND NOT type ENDS WITH('__')
515+
WHERE NOT type IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_']
520516
return type order by type LIMIT 100 } as relationshipTypes
521517
"""
522518
graphDb_data_Access = graphDBdataAccess(graph)

backend/src/make_relationships.py

+25-25
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
1313
batch_data = []
14-
logging.info("Create __HAS_ENTITY__ relationship between chunks and entities")
14+
logging.info("Create HAS_ENTITY relationship between chunks and entities")
1515
chunk_node_id_set = 'id:"{}"'
1616
for graph_doc_chunk_id in graph_documents_chunk_chunk_Id:
1717
for node in graph_doc_chunk_id['graph_doc'].nodes:
@@ -24,14 +24,14 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume
2424
#node_id = node.id
2525
#Below query is also unable to change as parametrize because we can't make parameter of Label or node type
2626
#https://neo4j.com/docs/cypher-manual/current/syntax/parameters/
27-
#graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(graph_doc_chunk_id['chunk_id'])+'}) MERGE (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:__HAS_ENTITY__]->(n)')
27+
#graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(graph_doc_chunk_id['chunk_id'])+'}) MERGE (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:HAS_ENTITY]->(n)')
2828

2929
if batch_data:
3030
unwind_query = """
3131
UNWIND $batch_data AS data
32-
MATCH (c:__Chunk__ {id: data.chunk_id})
32+
MATCH (c:Chunk {id: data.chunk_id})
3333
CALL apoc.merge.node([data.node_type], {id: data.node_id}) YIELD node AS n
34-
MERGE (c)-[:__HAS_ENTITY__]->(n)
34+
MERGE (c)-[:HAS_ENTITY]->(n)
3535
"""
3636
graph.query(unwind_query, params={"batch_data": batch_data})
3737

@@ -55,9 +55,9 @@ def update_embedding_create_vector_index(graph, chunkId_chunkDoc_list, file_name
5555
"chunkId": row['chunk_id'],
5656
"embeddings": embeddings_arr
5757
})
58-
# graph.query("""MATCH (d:__Document__ {fileName : $fileName})
59-
# MERGE (c:__Chunk__ {id:$chunkId}) SET c.embedding = $embeddings
60-
# MERGE (c)-[:__PART_OF__]->(d)
58+
# graph.query("""MATCH (d:Document {fileName : $fileName})
59+
# MERGE (c:Chunk {id:$chunkId}) SET c.embedding = $embeddings
60+
# MERGE (c)-[:PART_OF]->(d)
6161
# """,
6262
# {
6363
# "fileName" : file_name,
@@ -66,12 +66,12 @@ def update_embedding_create_vector_index(graph, chunkId_chunkDoc_list, file_name
6666
# }
6767
# )
6868
# logging.info('create vector index on chunk embedding')
69-
result = graph.query("SHOW INDEXES YIELD * WHERE labelsOrTypes = ['Chunk'] and name = 'vector'")
69+
result = graph.query("SHOW INDEXES YIELD * WHERE labelsOrTypes = ['__Chunk__'] and name = 'vector'")
7070
if result:
7171
logging.info(f"vector index dropped for 'Chunk'")
7272
graph.query("DROP INDEX vector IF EXISTS;")
7373

74-
graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:__Chunk__) on (c.embedding)
74+
graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:Chunk) on (c.embedding)
7575
OPTIONS {indexConfig: {
7676
`vector.dimensions`: $dimensions,
7777
`vector.similarity_function`: 'cosine'
@@ -84,10 +84,10 @@ def update_embedding_create_vector_index(graph, chunkId_chunkDoc_list, file_name
8484

8585
query_to_create_embedding = """
8686
UNWIND $data AS row
87-
MATCH (d:__Document__ {fileName: $fileName})
88-
MERGE (c:__Chunk__ {id: row.chunkId})
87+
MATCH (d:Document {fileName: $fileName})
88+
MERGE (c:Chunk {id: row.chunkId})
8989
SET c.embedding = row.embeddings
90-
MERGE (c)-[:__PART_OF__]->(d)
90+
MERGE (c)-[:PART_OF]->(d)
9191
"""
9292
graph.query(query_to_create_embedding, params={"fileName":file_name, "data":data_for_query})
9393

@@ -138,44 +138,44 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li
138138

139139
# create relationships between chunks
140140
if firstChunk:
141-
relationships.append({"type": "__FIRST_CHUNK__", "chunk_id": current_chunk_id})
141+
relationships.append({"type": "FIRST_CHUNK", "chunk_id": current_chunk_id})
142142
else:
143143
relationships.append({
144-
"type": "__NEXT_CHUNK__",
144+
"type": "NEXT_CHUNK",
145145
"previous_chunk_id": previous_chunk_id, # ID of previous chunk
146146
"current_chunk_id": current_chunk_id
147147
})
148148

149149
query_to_create_chunk_and_PART_OF_relation = """
150150
UNWIND $batch_data AS data
151-
MERGE (c:__Chunk__ {id: data.id})
151+
MERGE (c:Chunk {id: data.id})
152152
SET c.text = data.pg_content, c.position = data.position, c.length = data.length, c.fileName=data.f_name, c.content_offset=data.content_offset
153153
WITH data, c
154154
SET c.page_number = CASE WHEN data.page_number IS NOT NULL THEN data.page_number END,
155155
c.start_time = CASE WHEN data.start_time IS NOT NULL THEN data.start_time END,
156156
c.end_time = CASE WHEN data.end_time IS NOT NULL THEN data.end_time END
157157
WITH data, c
158-
MATCH (d:__Document__ {fileName: data.f_name})
159-
MERGE (c)-[:__PART_OF__]->(d)
158+
MATCH (d:Document {fileName: data.f_name})
159+
MERGE (c)-[:PART_OF]->(d)
160160
"""
161161
graph.query(query_to_create_chunk_and_PART_OF_relation, params={"batch_data": batch_data})
162162

163163
query_to_create_FIRST_relation = """
164164
UNWIND $relationships AS relationship
165-
MATCH (d:__Document__ {fileName: $f_name})
166-
MATCH (c:__Chunk__ {id: relationship.chunk_id})
167-
FOREACH(r IN CASE WHEN relationship.type = '__FIRST_CHUNK__' THEN [1] ELSE [] END |
168-
MERGE (d)-[:__FIRST_CHUNK__]->(c))
165+
MATCH (d:Document {fileName: $f_name})
166+
MATCH (c:Chunk {id: relationship.chunk_id})
167+
FOREACH(r IN CASE WHEN relationship.type = 'FIRST_CHUNK' THEN [1] ELSE [] END |
168+
MERGE (d)-[:FIRST_CHUNK]->(c))
169169
"""
170170
graph.query(query_to_create_FIRST_relation, params={"f_name": file_name, "relationships": relationships})
171171

172172
query_to_create_NEXT_CHUNK_relation = """
173173
UNWIND $relationships AS relationship
174-
MATCH (c:__Chunk__ {id: relationship.current_chunk_id})
174+
MATCH (c:Chunk {id: relationship.current_chunk_id})
175175
WITH c, relationship
176-
MATCH (pc:__Chunk__ {id: relationship.previous_chunk_id})
177-
FOREACH(r IN CASE WHEN relationship.type = '__NEXT_CHUNK__' THEN [1] ELSE [] END |
178-
MERGE (c)<-[:__NEXT_CHUNK__]-(pc))
176+
MATCH (pc:Chunk {id: relationship.previous_chunk_id})
177+
FOREACH(r IN CASE WHEN relationship.type = 'NEXT_CHUNK' THEN [1] ELSE [] END |
178+
MERGE (c)<-[:NEXT_CHUNK]-(pc))
179179
"""
180180
graph.query(query_to_create_NEXT_CHUNK_relation, params={"relationships": relationships})
181181

backend/src/post_processing.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@
88
DROP_INDEX_QUERY = "DROP INDEX entities IF EXISTS;"
99
LABELS_QUERY = "CALL db.labels()"
1010
FULL_TEXT_QUERY = "CREATE FULLTEXT INDEX entities FOR (n{labels_str}) ON EACH [n.id, n.description];"
11-
FILTER_LABELS = ["__Chunk__","__Document__"]
11+
FILTER_LABELS = ["Chunk","Document"]
1212

1313

1414
HYBRID_SEARCH_INDEX_DROP_QUERY = "DROP INDEX keyword IF EXISTS;"
15-
HYBRID_SEARCH_FULL_TEXT_QUERY = "CREATE FULLTEXT INDEX keyword FOR (n:__Chunk__) ON EACH [n.text]"
15+
HYBRID_SEARCH_FULL_TEXT_QUERY = "CREATE FULLTEXT INDEX keyword FOR (n:Chunk) ON EACH [n.text]"
1616

1717
def create_fulltext(uri, username, password, database,type):
1818
start_time = time.time()
@@ -80,7 +80,7 @@ def create_entity_embedding(graph:Neo4jGraph):
8080
def fetch_entities_for_embedding(graph):
8181
query = """
8282
MATCH (e)
83-
WHERE NOT (e:__Chunk__ OR e:__Document__) AND e.embedding IS NULL AND e.id IS NOT NULL
83+
WHERE NOT (e:Chunk OR e:Document) AND e.embedding IS NULL AND e.id IS NOT NULL
8484
RETURN elementId(e) AS elementId, e.id + " " + coalesce(e.description, "") AS text
8585
"""
8686
result = graph.query(query)

0 commit comments

Comments
 (0)