@@ -222,7 +222,7 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type
        lst_file_name.append({'fileName':obj_source_node.file_name, 'fileSize':obj_source_node.file_size, 'url':obj_source_node.url, 'language':obj_source_node.language, 'status':'Success'})
    return lst_file_name, success_count, failed_count

- async def extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, fileName, allowedNodes, allowedRelationship, retry_condition):
+ async def extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, fileName, allowedNodes, allowedRelationship, retry_condition, additional_instructions):

    logging.info(f'Process file name :{fileName}')
    if not retry_condition:
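The change above threads a new trailing `additional_instructions` argument into the local-file extractor. A minimal caller-side sketch, assuming the module path `src.main` and a `params` dict (both illustrative, not part of this diff):

    # Sketch only: forward user-supplied extraction guidance into the local-file path.
    # The import path and the `params` dict are assumptions for illustration.
    from src.main import extract_graph_from_file_local_file

    async def run_local_extraction(params: dict):
        return await extract_graph_from_file_local_file(
            params["uri"], params["userName"], params["password"], params["database"],
            params["model"], params["merged_file_path"], params["fileName"],
            params.get("allowedNodes", ""), params.get("allowedRelationship", ""),
            params.get("retry_condition"),
            params.get("additional_instructions"),  # new trailing argument added by this diff
        )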
@@ -234,11 +234,11 @@ async def extract_graph_from_file_local_file(uri, userName, password, database,
        file_name, pages, file_extension = get_documents_from_file_by_path(merged_file_path, fileName)
        if pages == None or len(pages) == 0:
            raise Exception(f'File content is not available for file : {file_name}')
-       return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship, True, merged_file_path)
+       return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship, True, merged_file_path, additional_instructions=additional_instructions)
    else:
-       return await processing_source(uri, userName, password, database, model, fileName, [], allowedNodes, allowedRelationship, True, merged_file_path, retry_condition)
+       return await processing_source(uri, userName, password, database, model, fileName, [], allowedNodes, allowedRelationship, True, merged_file_path, retry_condition, additional_instructions=additional_instructions)

- async def extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, file_name, allowedNodes, allowedRelationship, retry_condition):
+ async def extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions):
    if not retry_condition:
        if (aws_access_key_id == None or aws_secret_access_key == None):
            raise Exception('Please provide AWS access and secret keys')
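Every per-source extractor below receives the same trailing parameter, which keeps the signatures uniform. A hedged sketch of why that is convenient: a generic dispatcher can forward `additional_instructions` without special-casing the source type. The mapping keys and helper name are assumptions, not part of this change, and the sketch assumes it lives alongside the extractors:

    # Illustrative dispatcher, not part of this change: because every extractor now
    # ends with (..., retry_condition, additional_instructions), forwarding code can
    # stay source-agnostic. The source_type keys are assumed names.
    EXTRACTORS = {
        "local file": extract_graph_from_file_local_file,
        "s3 bucket": extract_graph_from_file_s3,
        "web-url": extract_graph_from_web_page,
        "youtube": extract_graph_from_file_youtube,
        "Wikipedia": extract_graph_from_file_Wikipedia,
        "gcs bucket": extract_graph_from_file_gcs,
    }

    async def dispatch_extraction(source_type, *source_args, retry_condition=None, additional_instructions=None):
        extractor = EXTRACTORS[source_type]
        # retry_condition and additional_instructions are the last two positional
        # parameters of every extractor, so they can be appended uniformly.
        return await extractor(*source_args, retry_condition, additional_instructions)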
@@ -248,48 +248,48 @@ async def extract_graph_from_file_s3(uri, userName, password, database, model, s

        if pages == None or len(pages) == 0:
            raise Exception(f'File content is not available for file : {file_name}')
-       return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship)
+       return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship, additional_instructions=additional_instructions)
    else:
-       return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition)
+       return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition, additional_instructions=additional_instructions)

- async def extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition):
+ async def extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions):
    if not retry_condition:
        file_name, pages = get_documents_from_web_page(source_url)
        if pages == None or len(pages) == 0:
            raise Exception(f'Content is not available for given URL : {file_name}')
-       return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship)
+       return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship, additional_instructions=additional_instructions)
    else:
-       return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition)
+       return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition, additional_instructions=additional_instructions)

- async def extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition):
+ async def extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions):
    if not retry_condition:
        file_name, pages = get_documents_from_youtube(source_url)

        if pages == None or len(pages) == 0:
            raise Exception(f'Youtube transcript is not available for file : {file_name}')
-       return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship)
+       return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship, additional_instructions=additional_instructions)
    else:
-       return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition)
+       return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition, additional_instructions=additional_instructions)

- async def extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, allowedRelationship, retry_condition):
+ async def extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions):
    if not retry_condition:
        file_name, pages = get_documents_from_Wikipedia(wiki_query, language)
        if pages == None or len(pages) == 0:
            raise Exception(f'Wikipedia page is not available for file : {file_name}')
-       return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship)
+       return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship, additional_instructions=additional_instructions)
    else:
-       return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition)
+       return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition, additional_instructions=additional_instructions)

- async def extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, retry_condition):
+ async def extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions):
    if not retry_condition:
        file_name, pages = get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token)
        if pages == None or len(pages) == 0:
            raise Exception(f'File content is not available for file : {file_name}')
-       return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship)
+       return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship, additional_instructions=additional_instructions)
    else:
-       return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition)
+       return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition, additional_instructions=additional_instructions)

- async def processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship, is_uploaded_from_local=None, merged_file_path=None, retry_condition=None):
+ async def processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship, is_uploaded_from_local=None, merged_file_path=None, retry_condition=None, additional_instructions=None):
    """
    Extracts a Neo4jGraph from a PDF file based on the model.
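`processing_source` itself takes `additional_instructions=None` as a keyword default, so existing call sites that do not pass it keep working. A short compatibility sketch with placeholder arguments, wrapped in a function only so it is valid async code:

    # Compatibility sketch with placeholder arguments: the keyword default makes the
    # parameter opt-in for callers of processing_source.
    async def _example(uri, userName, password, database, model, file_name, pages,
                       allowedNodes, allowedRelationship, user_instructions):
        # Pre-existing style: no extra guidance, extraction behaves as before.
        await processing_source(uri, userName, password, database, model,
                                file_name, pages, allowedNodes, allowedRelationship)
        # New style: the user's guidance is carried through to the chunk processing step.
        await processing_source(uri, userName, password, database, model,
                                file_name, pages, allowedNodes, allowedRelationship,
                                additional_instructions=user_instructions)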
@@ -381,7 +381,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
                break
            else:
                processing_chunks_start_time = time.time()
-               node_count, rel_count, latency_processed_chunk = await processing_chunks(selected_chunks, graph, uri, userName, password, database, file_name, model, allowedNodes, allowedRelationship, node_count, rel_count)
+               node_count, rel_count, latency_processed_chunk = await processing_chunks(selected_chunks, graph, uri, userName, password, database, file_name, model, allowedNodes, allowedRelationship, node_count, rel_count, additional_instructions)
                processing_chunks_end_time = time.time()
                processing_chunks_elapsed_end_time = processing_chunks_end_time - processing_chunks_start_time
                logging.info(f"Time taken {update_graph_chunk_processed} chunks processed upto {select_chunks_upto} completed in {processing_chunks_elapsed_end_time:.2f} seconds for file name {file_name}")
@@ -458,7 +458,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
        logging.error(error_message)
        raise Exception(error_message)

- async def processing_chunks(chunkId_chunkDoc_list, graph, uri, userName, password, database, file_name, model, allowedNodes, allowedRelationship, node_count, rel_count):
+ async def processing_chunks(chunkId_chunkDoc_list, graph, uri, userName, password, database, file_name, model, allowedNodes, allowedRelationship, node_count, rel_count, additional_instructions=None):
    #create vector index and update chunk node with embedding
    latency_processing_chunk = {}
    if graph is not None:
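The comment at the top of `processing_chunks` refers to embedding upkeep on chunk nodes. As a generic illustration only, not this repository's implementation, and with the `Chunk` label and `id`/`embedding` property names assumed, writing an embedding back with the official Neo4j Python driver looks roughly like this:

    # Generic sketch, not this repository's code: persist an embedding on a chunk node.
    from neo4j import GraphDatabase

    def update_chunk_embedding(uri, user, password, chunk_id, embedding):
        driver = GraphDatabase.driver(uri, auth=(user, password))
        with driver.session() as session:
            session.run(
                "MATCH (c:Chunk {id: $chunk_id}) SET c.embedding = $embedding",
                chunk_id=chunk_id,
                embedding=embedding,
            )
        driver.close()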
@@ -476,7 +476,7 @@ async def processing_chunks(chunkId_chunkDoc_list,graph,uri, userName, password,
    logging.info("Get graph document list from models")

    start_entity_extraction = time.time()
-   graph_documents = await get_graph_from_llm(model, chunkId_chunkDoc_list, allowedNodes, allowedRelationship)
+   graph_documents = await get_graph_from_llm(model, chunkId_chunkDoc_list, allowedNodes, allowedRelationship, additional_instructions)
    end_entity_extraction = time.time()
    elapsed_entity_extraction = end_entity_extraction - start_entity_extraction
    logging.info(f'Time taken to extract enitities from LLM Graph Builder: {elapsed_entity_extraction:.2f} seconds')
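`get_graph_from_llm` now receives `additional_instructions` as an extra positional argument. Its body is not part of this diff, so the following is only an assumption about how such guidance is typically folded into the extraction prompt; the helper name and wording are illustrative:

    # Assumption-level sketch: append user guidance to a base extraction prompt.
    def build_extraction_prompt(base_prompt, additional_instructions=None):
        if additional_instructions:
            return (f"{base_prompt}\n\n"
                    f"Additional instructions from the user:\n{additional_instructions.strip()}")
        return base_prompt

Whatever the real mechanism, the effect of this diff is that the user-supplied text reaches the entity-extraction call for every chunk batch.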
@@ -678,7 +678,7 @@ def get_labels_and_relationtypes(graph):
        return label order by label limit 100 } as labels,
        collect {
        CALL db.relationshipTypes() yield relationshipType as type
-       WHERE NOT type IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK']
+       WHERE NOT type IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_', 'FIRST_CHUNK', 'SIMILAR', 'IN_COMMUNITY', 'PARENT_COMMUNITY']
        return type order by type LIMIT 100 } as relationshipTypes
        """
    graphDb_data_Access = graphDBdataAccess(graph)
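The extended exclusion list keeps additional internal relationship types, presumably created by the similarity (KNN) and community post-processing steps, out of the schema suggestions returned by `get_labels_and_relationtypes`. One possible follow-up, sketched here as an assumption rather than a change in this diff, is to keep the list in a single constant so future internal types only need one edit:

    # Sketch of a possible refactor (not in this diff): centralise the internal
    # relationship types and build the Cypher filter clause from one constant.
    INTERNAL_REL_TYPES = [
        'PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_',
        'FIRST_CHUNK', 'SIMILAR', 'IN_COMMUNITY', 'PARENT_COMMUNITY',
    ]

    def excluded_rel_types_clause():
        quoted = ", ".join(f"'{t}'" for t in INTERNAL_REL_TYPES)
        return f"WHERE NOT type IN [{quoted}]"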