
Commit 736d764

prakriti-solankey authored and kartikpersistent committed
__ changes (#656)
1 parent b32e8ca commit 736d764

13 files changed: +108 −122 lines


backend/src/chunkid_entities.py (+4 −4)
@@ -3,12 +3,12 @@
 from src.graph_query import *

 CHUNK_QUERY = """
-match (chunk:__Chunk__) where chunk.id IN $chunksIds
+match (chunk:Chunk) where chunk.id IN $chunksIds

-MATCH (chunk)-[:__PART_OF__]->(d:__Document__)
+MATCH (chunk)-[:PART_OF]->(d:Document)
 CALL {WITH chunk
-MATCH (chunk)-[:__HAS_ENTITY__]->(e)
-MATCH path=(e)(()-[rels:!__HAS_ENTITY__&!__PART_OF__]-()){0,2}(:!__Chunk__&!__Document__)
+MATCH (chunk)-[:HAS_ENTITY]->(e)
+MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document)
 UNWIND rels as r
 RETURN collect(distinct r) as rels
 }
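For orientation, a minimal sketch of how a query such as CHUNK_QUERY is presumably executed; the connection values and chunk ids below are illustrative, the import path assumes the backend working directory, and the hunk above shows only the opening lines of the full query string:

    from langchain_community.graphs import Neo4jGraph

    from src.chunkid_entities import CHUNK_QUERY

    # Illustrative connection values; the project wires these up from its own config.
    graph = Neo4jGraph(url="neo4j://localhost:7687", username="neo4j", password="password")

    # $chunksIds binds the list consumed by the WHERE clause; the new !Chunk&!Document
    # label expression matches path endpoints that carry neither label.
    rows = graph.query(CHUNK_QUERY, params={"chunksIds": ["chunk-id-1", "chunk-id-2"]})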

backend/src/graphDB_dataAccess.py (+22 −22)
@@ -20,7 +20,7 @@ def update_exception_db(self, file_name, exp_msg):
 is_cancelled_status = result[0]['is_cancelled']
 if is_cancelled_status == 'True':
 job_status = 'Cancelled'
-self.graph.query("""MERGE(d:__Document__ {fileName :$fName}) SET d.status = $status, d.errorMessage = $error_msg""",
+self.graph.query("""MERGE(d:Document {fileName :$fName}) SET d.status = $status, d.errorMessage = $error_msg""",
 {"fName":file_name, "status":job_status, "error_msg":exp_msg})
 except Exception as e:
 error_message = str(e)
@@ -31,7 +31,7 @@ def create_source_node(self, obj_source_node:sourceNode):
 try:
 job_status = "New"
 logging.info("creating source node if does not exist")
-self.graph.query("""MERGE(d:__Document__ {fileName :$fn}) SET d.fileSize = $fs, d.fileType = $ft ,
+self.graph.query("""MERGE(d:Document {fileName :$fn}) SET d.fileSize = $fs, d.fileType = $ft ,
 d.status = $st, d.url = $url, d.awsAccessKeyId = $awsacc_key_id,
 d.fileSource = $f_source, d.createdAt = $c_at, d.updatedAt = $u_at,
 d.processingTime = $pt, d.errorMessage = $e_message, d.nodeCount= $n_count,
@@ -92,7 +92,7 @@ def update_source_node(self, obj_source_node:sourceNode):
 param= {"props":params}

 print(f'Base Param value 1 : {param}')
-query = "MERGE(d:__Document__ {fileName :$props.fileName}) SET d += $props"
+query = "MERGE(d:Document {fileName :$props.fileName}) SET d += $props"
 logging.info("Update source node properties")
 self.graph.query(query,param)
 except Exception as e:
@@ -114,7 +114,7 @@ def get_source_list(self):
 sorting the list by the last updated date.
 """
 logging.info("Get existing files list from graph")
-query = "MATCH(d:__Document__) WHERE d.fileName IS NOT NULL RETURN d ORDER BY d.updatedAt DESC"
+query = "MATCH(d:Document) WHERE d.fileName IS NOT NULL RETURN d ORDER BY d.updatedAt DESC"
 result = self.graph.query(query)
 list_of_json_objects = [entry['d'] for entry in result]
 return list_of_json_objects
@@ -128,10 +128,10 @@ def update_KNN_graph(self):
 knn_min_score = os.environ.get('KNN_MIN_SCORE')
 if len(index) > 0:
 logging.info('update KNN graph')
-self.graph.query("""MATCH (c:__Chunk__)
-WHERE c.embedding IS NOT NULL AND count { (c)-[:__SIMILAR__]-() } < 5
+self.graph.query("""MATCH (c:Chunk)
+WHERE c.embedding IS NOT NULL AND count { (c)-[:SIMILAR]-() } < 5
 CALL db.index.vector.queryNodes('vector', 6, c.embedding) yield node, score
-WHERE node <> c and score >= $score MERGE (c)-[rel:__SIMILAR__]-(node) SET rel.score = score
+WHERE node <> c and score >= $score MERGE (c)-[rel:SIMILAR]-(node) SET rel.score = score
 """,
 {"score":float(knn_min_score)}
 )
@@ -171,7 +171,7 @@ def execute_query(self, query, param=None):

 def get_current_status_document_node(self, file_name):
 query = """
-MATCH(d:__Document__ {fileName : $file_name}) RETURN d.status AS Status , d.processingTime AS processingTime,
+MATCH(d:Document {fileName : $file_name}) RETURN d.status AS Status , d.processingTime AS processingTime,
 d.nodeCount AS nodeCount, d.model as model, d.relationshipCount as relationshipCount,
 d.total_pages AS total_pages, d.total_chunks AS total_chunks , d.fileSize as fileSize,
 d.is_cancelled as is_cancelled, d.processed_chunk as processed_chunk
@@ -194,23 +194,23 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
 logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
 delete_uploaded_local_file(merged_file_path,file_name)
 query_to_delete_document="""
-MATCH (d:__Document__) where d.fileName in $filename_list and d.fileSource in $source_types_list
+MATCH (d:Document) where d.fileName in $filename_list and d.fileSource in $source_types_list
 with collect(d) as documents
 unwind documents as d
-optional match (d)<-[:__PART_OF__]-(c:__Chunk__)
+optional match (d)<-[:PART_OF]-(c:Chunk)
 detach delete c, d
 return count(*) as deletedChunks
 """
 query_to_delete_document_and_entities="""
-MATCH (d:__Document__) where d.fileName in $filename_list and d.fileSource in $source_types_list
+MATCH (d:Document) where d.fileName in $filename_list and d.fileSource in $source_types_list
 with collect(d) as documents
 unwind documents as d
-optional match (d)<-[:__PART_OF__]-(c:__Chunk__)
+optional match (d)<-[:PART_OF]-(c:Chunk)
 // if delete-entities checkbox is set
 call { with c, documents
-match (c)-[:__HAS_ENTITY__]->(e)
+match (c)-[:HAS_ENTITY]->(e)
 // belongs to another document
-where not exists { (d2)<-[:__PART_OF__]-()-[:__HAS_ENTITY__]->(e) WHERE NOT d2 IN documents }
+where not exists { (d2)<-[:PART_OF]-()-[:HAS_ENTITY]->(e) WHERE NOT d2 IN documents }
 detach delete e
 return count(*) as entities
 }
@@ -228,17 +228,17 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me

 def list_unconnected_nodes(self):
 query = """
-MATCH (e:!__Chunk__&!__Document__)
-WHERE NOT exists { (e)--(:!__Chunk__&!__Document__) }
-OPTIONAL MATCH (doc:__Document__)<-[:__PART_OF__]-(c:__Chunk__)-[:__HAS_ENTITY__]->(e)
+MATCH (e:!Chunk&!Document)
+WHERE NOT exists { (e)--(:!Chunk&!Document) }
+OPTIONAL MATCH (doc:Document)<-[:PART_OF]-(c:Chunk)-[:HAS_ENTITY]->(e)
 RETURN e {.*, embedding:null, elementId:elementId(e), labels:labels(e)} as e,
 collect(distinct doc.fileName) as documents, count(distinct c) as chunkConnections
 ORDER BY e.id ASC
 LIMIT 100
 """
 query_total_nodes = """
-MATCH (e:!__Chunk__&!__Document__)
-WHERE NOT exists { (e)--(:!__Chunk__&!__Document__) }
+MATCH (e:!Chunk&!Document)
+WHERE NOT exists { (e)--(:!Chunk&!Document) }
 RETURN count(*) as total
 """
 nodes_list = self.execute_query(query)
@@ -258,7 +258,7 @@ def get_duplicate_nodes_list(self):
 score_value = float(os.environ.get('DUPLICATE_SCORE_VALUE'))
 text_distance = int(os.environ.get('DUPLICATE_TEXT_DISTANCE'))
 query_duplicate_nodes = """
-MATCH (n:!__Chunk__&!__Document__) with n
+MATCH (n:!Chunk&!Document) with n
 WHERE n.embedding is not null and n.id is not null // and size(n.id) > 3
 WITH n ORDER BY count {{ (n)--() }} DESC, size(n.id) DESC // updated
 WITH collect(n) as nodes
@@ -286,7 +286,7 @@ def get_duplicate_nodes_list(self):
 where none(other in all where other <> nodes and size(other) > size(nodes) and size(apoc.coll.subtract(nodes, other))=0)
 return head(nodes) as n, tail(nodes) as similar
 }}
-OPTIONAL MATCH (doc:__Document__)<-[:__PART_OF__]-(c:__Chunk__)-[:__HAS_ENTITY__]->(n)
+OPTIONAL MATCH (doc:Document)<-[:PART_OF]-(c:Chunk)-[:HAS_ENTITY]->(n)
 {return_statement}
 """
 return_query_duplicate_nodes = """
@@ -332,7 +332,7 @@ def drop_create_vector_index(self, is_vector_index_recreate):
 if is_vector_index_recreate == 'true':
 self.graph.query("""drop index vector""")

-self.graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:__Chunk__) on (c.embedding)
+self.graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:Chunk) on (c.embedding)
 OPTIONS {indexConfig: {
 `vector.dimensions`: $dimensions,
 `vector.similarity_function`: 'cosine'
backend/src/graph_query.py (+1 −1)
@@ -162,7 +162,7 @@ def get_completed_documents(driver):
 """
 Retrieves the names of all documents with the status 'Completed' from the database.
 """
-docs_query = "MATCH(node:__Document__ {status:'Completed'}) RETURN node"
+docs_query = "MATCH(node:Document {status:'Completed'}) RETURN node"

 try:
 logging.info("Executing query to retrieve completed documents.")

backend/src/main.py (+2 −6)
@@ -544,15 +544,11 @@ def get_labels_and_relationtypes(graph):
 query = """
 RETURN collect {
 CALL db.labels() yield label
-WHERE NOT label IN ['_Bloom_Perspective_']
-AND NOT label STARTS WITH ('__')
-AND NOT label ENDS WITH('__')
+WHERE NOT label IN ['Chunk','_Bloom_Perspective_']
 return label order by label limit 100 } as labels,
 collect {
 CALL db.relationshipTypes() yield relationshipType as type
-WHERE NOT type IN ['_Bloom_Perspective_']
-AND NOT type STARTS WITH ('__')
-AND NOT type ENDS WITH('__')
+WHERE NOT type IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_']
 return type order by type LIMIT 100 } as relationshipTypes
 """
 graphDb_data_Access = graphDBdataAccess(graph)
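With the underscores gone, the old prefix test (exclude any name that starts and ends with `__`) no longer identifies internal labels, so the bookkeeping names are now denied one by one. Purely as an illustration of the result shape, with invented entity labels:

    result = graph.query(query)
    # e.g. [{'labels': ['Location', 'Person'],
    #        'relationshipTypes': ['LOCATED_AT', 'WORKS_AT']}]
    # 'Chunk', 'PART_OF', 'NEXT_CHUNK', and 'HAS_ENTITY' are filtered out by name.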

backend/src/make_relationships.py (+25 −25)
@@ -14,7 +14,7 @@

 def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
 batch_data = []
-logging.info("Create __HAS_ENTITY__ relationship between chunks and entities")
+logging.info("Create HAS_ENTITY relationship between chunks and entities")
 chunk_node_id_set = 'id:"{}"'
 for graph_doc_chunk_id in graph_documents_chunk_chunk_Id:
 for node in graph_doc_chunk_id['graph_doc'].nodes:
@@ -27,14 +27,14 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume
 #node_id = node.id
 #Below query is also unable to change as parametrize because we can't make parameter of Label or node type
 #https://neo4j.com/docs/cypher-manual/current/syntax/parameters/
-#graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(graph_doc_chunk_id['chunk_id'])+'}) MERGE (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:__HAS_ENTITY__]->(n)')
+#graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(graph_doc_chunk_id['chunk_id'])+'}) MERGE (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:HAS_ENTITY]->(n)')

 if batch_data:
 unwind_query = """
 UNWIND $batch_data AS data
-MATCH (c:__Chunk__ {id: data.chunk_id})
+MATCH (c:Chunk {id: data.chunk_id})
 CALL apoc.merge.node([data.node_type], {id: data.node_id}) YIELD node AS n
-MERGE (c)-[:__HAS_ENTITY__]->(n)
+MERGE (c)-[:HAS_ENTITY]->(n)
 """
 graph.query(unwind_query, params={"batch_data": batch_data})

@@ -76,9 +76,9 @@ def update_embedding_create_vector_index(graph, chunkId_chunkDoc_list, file_name
 "chunkId": row['chunk_id'],
 "embeddings": embeddings_arr
 })
-# graph.query("""MATCH (d:__Document__ {fileName : $fileName})
-# MERGE (c:__Chunk__ {id:$chunkId}) SET c.embedding = $embeddings
-# MERGE (c)-[:__PART_OF__]->(d)
+# graph.query("""MATCH (d:Document {fileName : $fileName})
+# MERGE (c:Chunk {id:$chunkId}) SET c.embedding = $embeddings
+# MERGE (c)-[:PART_OF]->(d)
 # """,
 # {
 # "fileName" : file_name,
@@ -87,12 +87,12 @@ def update_embedding_create_vector_index(graph, chunkId_chunkDoc_list, file_name
 # }
 # )
 # logging.info('create vector index on chunk embedding')
-result = graph.query("SHOW INDEXES YIELD * WHERE labelsOrTypes = ['Chunk'] and name = 'vector'")
+result = graph.query("SHOW INDEXES YIELD * WHERE labelsOrTypes = ['__Chunk__'] and name = 'vector'")
 if result:
 logging.info(f"vector index dropped for 'Chunk'")
 graph.query("DROP INDEX vector IF EXISTS;")

-graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:__Chunk__) on (c.embedding)
+graph.query("""CREATE VECTOR INDEX `vector` if not exists for (c:Chunk) on (c.embedding)
 OPTIONS {indexConfig: {
 `vector.dimensions`: $dimensions,
 `vector.similarity_function`: 'cosine'
@@ -105,10 +105,10 @@ def update_embedding_create_vector_index(graph, chunkId_chunkDoc_list, file_name

 query_to_create_embedding = """
 UNWIND $data AS row
-MATCH (d:__Document__ {fileName: $fileName})
-MERGE (c:__Chunk__ {id: row.chunkId})
+MATCH (d:Document {fileName: $fileName})
+MERGE (c:Chunk {id: row.chunkId})
 SET c.embedding = row.embeddings
-MERGE (c)-[:__PART_OF__]->(d)
+MERGE (c)-[:PART_OF]->(d)
 """
 graph.query(query_to_create_embedding, params={"fileName":file_name, "data":data_for_query})

@@ -165,17 +165,17 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li
 )
 # create relationships between chunks
 if firstChunk:
-relationships.append({"type": "__FIRST_CHUNK__", "chunk_id": current_chunk_id})
+relationships.append({"type": "FIRST_CHUNK", "chunk_id": current_chunk_id})
 else:
 relationships.append({
-"type": "__NEXT_CHUNK__",
+"type": "NEXT_CHUNK",
 "previous_chunk_id": previous_chunk_id, # ID of previous chunk
 "current_chunk_id": current_chunk_id
 })

 query_to_create_chunk_and_PART_OF_relation = """
 UNWIND $batch_data AS data
-MERGE (c:__Chunk__ {id: data.id})
+MERGE (c:Chunk {id: data.id})
 SET c.text = data.pg_content, c.position = data.position, c.length = data.length, c.fileName=data.f_name, c.content_offset=data.content_offset
 WITH data, c
 WHERE data.page_number IS NOT NULL
@@ -184,27 +184,27 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li
 WHERE data.page_number IS NOT NULL
 SET c.page_number = data.page_number
 WITH data, c
-MATCH (d:__Document__ {fileName: data.f_name})
-MERGE (c)-[:__PART_OF__]->(d)
+MATCH (d:Document {fileName: data.f_name})
+MERGE (c)-[:PART_OF]->(d)
 """
 graph.query(query_to_create_chunk_and_PART_OF_relation, params={"batch_data": batch_data})

 query_to_create_FIRST_relation = """
 UNWIND $relationships AS relationship
-MATCH (d:__Document__ {fileName: $f_name})
-MATCH (c:__Chunk__ {id: relationship.chunk_id})
-FOREACH(r IN CASE WHEN relationship.type = '__FIRST_CHUNK__' THEN [1] ELSE [] END |
-MERGE (d)-[:__FIRST_CHUNK__]->(c))
+MATCH (d:Document {fileName: $f_name})
+MATCH (c:Chunk {id: relationship.chunk_id})
+FOREACH(r IN CASE WHEN relationship.type = 'FIRST_CHUNK' THEN [1] ELSE [] END |
+MERGE (d)-[:FIRST_CHUNK]->(c))
 """
 graph.query(query_to_create_FIRST_relation, params={"f_name": file_name, "relationships": relationships})

 query_to_create_NEXT_CHUNK_relation = """
 UNWIND $relationships AS relationship
-MATCH (c:__Chunk__ {id: relationship.current_chunk_id})
+MATCH (c:Chunk {id: relationship.current_chunk_id})
 WITH c, relationship
-MATCH (pc:__Chunk__ {id: relationship.previous_chunk_id})
-FOREACH(r IN CASE WHEN relationship.type = '__NEXT_CHUNK__' THEN [1] ELSE [] END |
-MERGE (c)<-[:__NEXT_CHUNK__]-(pc))
+MATCH (pc:Chunk {id: relationship.previous_chunk_id})
+FOREACH(r IN CASE WHEN relationship.type = 'NEXT_CHUNK' THEN [1] ELSE [] END |
+MERGE (c)<-[:NEXT_CHUNK]-(pc))
 """
 graph.query(query_to_create_NEXT_CHUNK_relation, params={"relationships": relationships})
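As the retained comment explains, Cypher cannot take a label name as a query parameter, which is why the batch path builds entity nodes through apoc.merge.node. A standalone sketch of that call, with invented values:

    # Values are illustrative; batch_data is normally built from the extracted graph documents.
    graph.query(
        """
        UNWIND $batch_data AS data
        MATCH (c:Chunk {id: data.chunk_id})
        CALL apoc.merge.node([data.node_type], {id: data.node_id}) YIELD node AS n
        MERGE (c)-[:HAS_ENTITY]->(n)
        """,
        params={"batch_data": [{"chunk_id": "chunk-id-1", "node_type": "Person", "node_id": "alice"}]},
    )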
backend/src/post_processing.py (+3 −3)
@@ -8,11 +8,11 @@
 DROP_INDEX_QUERY = "DROP INDEX entities IF EXISTS;"
 LABELS_QUERY = "CALL db.labels()"
 FULL_TEXT_QUERY = "CREATE FULLTEXT INDEX entities FOR (n{labels_str}) ON EACH [n.id, n.description];"
-FILTER_LABELS = ["__Chunk__","__Document__"]
+FILTER_LABELS = ["Chunk","Document"]


 HYBRID_SEARCH_INDEX_DROP_QUERY = "DROP INDEX keyword IF EXISTS;"
-HYBRID_SEARCH_FULL_TEXT_QUERY = "CREATE FULLTEXT INDEX keyword FOR (n:__Chunk__) ON EACH [n.text]"
+HYBRID_SEARCH_FULL_TEXT_QUERY = "CREATE FULLTEXT INDEX keyword FOR (n:Chunk) ON EACH [n.text]"

 def create_fulltext(uri, username, password, database,type):
 start_time = time.time()
@@ -80,7 +80,7 @@ def create_entity_embedding(graph:Neo4jGraph):

 def fetch_entities_for_embedding(graph):
 query = """
 MATCH (e)
-WHERE NOT (e:__Chunk__ OR e:__Document__) AND e.embedding IS NULL AND e.id IS NOT NULL
+WHERE NOT (e:Chunk OR e:Document) AND e.embedding IS NULL AND e.id IS NOT NULL
 RETURN elementId(e) AS elementId, e.id + " " + coalesce(e.description, "") AS text
 """
 result = graph.query(query)
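FILTER_LABELS presumably feeds the `{labels_str}` placeholder in FULL_TEXT_QUERY; the assembly lives in create_fulltext, which this diff does not show, so the sketch below is one plausible shape rather than the project's exact code:

    # Collect labels, drop the bookkeeping ones, and build a label expression
    # such as ":`Person`|`Organization`" for the fulltext index statement.
    labels = [row["label"] for row in graph.query(LABELS_QUERY)]
    labels_str = ":" + "|".join(f"`{label}`" for label in labels if label not in FILTER_LABELS)
    graph.query(FULL_TEXT_QUERY.format(labels_str=labels_str))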
