Skip to content

Commit 2cc25a5

Browse files
Changed the delete query to delete documents in batches for more efficient memory management (#983)
1 parent 5b9ab5f commit 2cc25a5

File tree

2 files changed

+20
-15
lines changed

2 files changed

+20
-15
lines changed

backend/src/graphDB_dataAccess.py

+12-7
Original file line numberDiff line numberDiff line change
@@ -280,18 +280,22 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
280280
else:
281281
logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
282282
delete_uploaded_local_file(merged_file_path,file_name)
283-
query_to_delete_document="""
284-
MATCH (d:Document) where d.fileName in $filename_list and coalesce(d.fileSource, "None") IN $source_types_list
285-
with collect(d) as documents
286-
unwind documents as d
283+
284+
query_to_delete_document="""
285+
MATCH (d:Document)
286+
WHERE d.fileName IN $filename_list AND coalesce(d.fileSource, "None") IN $source_types_list
287+
WITH COLLECT(d) AS documents
288+
CALL (documents) {
289+
UNWIND documents AS d
287290
optional match (d)<-[:PART_OF]-(c:Chunk)
288291
detach delete c, d
289-
return count(*) as deletedChunks
292+
} IN TRANSACTIONS OF 1 ROWS
290293
"""
291-
query_to_delete_document_and_entities="""
294+
query_to_delete_document_and_entities = """
292295
MATCH (d:Document)
293296
WHERE d.fileName IN $filename_list AND coalesce(d.fileSource, "None") IN $source_types_list
294297
WITH COLLECT(d) AS documents
298+
CALL (documents) {
295299
UNWIND documents AS d
296300
OPTIONAL MATCH (d)<-[:PART_OF]-(c:Chunk)
297301
OPTIONAL MATCH (c:Chunk)-[:HAS_ENTITY]->(e)
@@ -304,7 +308,8 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
304308
FOREACH (chunk IN chunks | DETACH DELETE chunk)
305309
FOREACH (entity IN entities | DETACH DELETE entity)
306310
DETACH DELETE d
307-
"""
311+
} IN TRANSACTIONS OF 1 ROWS
312+
"""
308313
query_to_delete_communities = """
309314
MATCH (c:`__Community__`)
310315
WHERE c.level = 0 AND NOT EXISTS { ()-[:IN_COMMUNITY]->(c) }

backend/src/main.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def create_source_node_graph_url_s3(graph, model, source_url, aws_access_key_id,
4949
for file_info in files_info:
5050
file_name=file_info['file_key']
5151
obj_source_node = sourceNode()
52-
obj_source_node.file_name = file_name.split('/')[-1]
52+
obj_source_node.file_name = file_name.split('/')[-1].strip() if isinstance(file_name.split('/')[-1], str) else file_name.split('/')[-1]
5353
obj_source_node.file_type = 'pdf'
5454
obj_source_node.file_size = file_info['file_size_bytes']
5555
obj_source_node.file_source = source_type
@@ -83,7 +83,7 @@ def create_source_node_graph_url_gcs(graph, model, gcs_project_id, gcs_bucket_na
8383
lst_file_metadata= get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, credentials)
8484
for file_metadata in lst_file_metadata :
8585
obj_source_node = sourceNode()
86-
obj_source_node.file_name = file_metadata['fileName']
86+
obj_source_node.file_name = file_metadata['fileName'].strip() if isinstance(file_metadata['fileName'], str) else file_metadata['fileName']
8787
obj_source_node.file_size = file_metadata['fileSize']
8888
obj_source_node.url = file_metadata['url']
8989
obj_source_node.file_source = source_type
@@ -135,7 +135,7 @@ def create_source_node_graph_web_url(graph, model, source_url, source_type):
135135
obj_source_node.model = model
136136
obj_source_node.url = urllib.parse.unquote(source_url)
137137
obj_source_node.created_at = datetime.now()
138-
obj_source_node.file_name = title
138+
obj_source_node.file_name = title.strip() if isinstance(title, str) else title
139139
obj_source_node.language = language
140140
obj_source_node.file_size = sys.getsizeof(pages[0].page_content)
141141
obj_source_node.chunkNodeCount=0
@@ -338,7 +338,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
338338
if result[0]['Status'] != 'Processing':
339339
obj_source_node = sourceNode()
340340
status = "Processing"
341-
obj_source_node.file_name = file_name
341+
obj_source_node.file_name = file_name.strip() if isinstance(file_name, str) else file_name
342342
obj_source_node.status = status
343343
obj_source_node.total_chunks = total_chunks
344344
obj_source_node.model = model
@@ -412,7 +412,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
412412
end_time = datetime.now()
413413
processed_time = end_time - start_time
414414
obj_source_node = sourceNode()
415-
obj_source_node.file_name = file_name
415+
obj_source_node.file_name = file_name.strip() if isinstance(file_name, str) else file_name
416416
obj_source_node.status = job_status
417417
obj_source_node.processing_time = processed_time
418418

@@ -650,7 +650,7 @@ def upload_file(graph, model, chunk, chunk_number:int, total_chunks:int, origina
650650
logging.info("File merged successfully")
651651
file_extension = originalname.split('.')[-1]
652652
obj_source_node = sourceNode()
653-
obj_source_node.file_name = originalname
653+
obj_source_node.file_name = originalname.strip() if isinstance(originalname, str) else originalname
654654
obj_source_node.file_type = file_extension
655655
obj_source_node.file_size = file_size
656656
obj_source_node.file_source = 'local file'
@@ -693,7 +693,7 @@ def manually_cancelled_job(graph, filenames, source_types, merged_dir, uri):
693693

694694
for (file_name,source_type) in zip(filename_list, source_types_list):
695695
obj_source_node = sourceNode()
696-
obj_source_node.file_name = file_name
696+
obj_source_node.file_name = file_name.strip() if isinstance(file_name, str) else file_name
697697
obj_source_node.is_cancelled = True
698698
obj_source_node.status = 'Cancelled'
699699
obj_source_node.updated_at = datetime.now()
@@ -728,7 +728,7 @@ def set_status_retry(graph, file_name, retry_condition):
728728
graphDb_data_Access = graphDBdataAccess(graph)
729729
obj_source_node = sourceNode()
730730
status = "Ready to Reprocess"
731-
obj_source_node.file_name = file_name
731+
obj_source_node.file_name = file_name.strip() if isinstance(file_name, str) else file_name
732732
obj_source_node.status = status
733733
obj_source_node.retry_condition = retry_condition
734734
obj_source_node.is_cancelled = False

0 commit comments

Comments (0)