
Removed commented code and unused library #973

Merged
merged 1 commit on Dec 31, 2024
9 changes: 2 additions & 7 deletions backend/score.py
@@ -76,8 +76,6 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send):
)
await gzip_middleware(scope, receive, send)
app = FastAPI()
# SecWeb(app=app, Option={'referrer': False, 'xframe': False})
# app.add_middleware(ContentSecurityPolicy, Option={'default-src': ["'self'"], 'base-uri': ["'self'"], 'block-all-mixed-content': []}, script_nonce=False, style_nonce=False, report_only=False)
app.add_middleware(XContentTypeOptions)
app.add_middleware(XFrame, Option={'X-Frame-Options': 'DENY'})
app.add_middleware(CustomGZipMiddleware, minimum_size=1000, compresslevel=5,paths=["/sources_list","/url/scan","/extract","/chat_bot","/chunk_entities","/get_neighbours","/graph_query","/schema","/populate_graph_schema","/get_unconnected_nodes_list","/get_duplicate_nodes","/fetch_chunktext"])
@@ -99,7 +97,6 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send):

@app.post("/url/scan")
async def create_source_knowledge_graph_url(
request: Request,
uri=Form(),
userName=Form(),
password=Form(),
@@ -172,7 +169,6 @@ async def extract_knowledge_graph_from_file(
aws_access_key_id=Form(None),
aws_secret_access_key=Form(None),
wiki_query=Form(None),
max_sources=Form(None),
gcs_project_id=Form(None),
gcs_bucket_name=Form(None),
gcs_bucket_folder=Form(None),
@@ -345,7 +341,7 @@ async def post_processing(uri=Form(), userName=Form(), password=Form(), database
end = time.time()
elapsed_time = end - start
json_obj = {'api_name': api_name, 'db_url': uri, 'userName':userName, 'database':database, 'tasks':tasks, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}'}
# logger.log_struct(json_obj)
logger.log_struct(json_obj)
return create_api_response('Success', data=count_response, message='All tasks completed successfully')

except Exception as e:
@@ -615,8 +611,7 @@ async def delete_document_and_entities(uri=Form(),
start = time.time()
graph = create_graph_database_connection(uri, userName, password, database)
graphDb_data_Access = graphDBdataAccess(graph)
result, files_list_size = await asyncio.to_thread(graphDb_data_Access.delete_file_from_graph, filenames, source_types, deleteEntities, MERGED_DIR, uri)
# entities_count = result[0]['deletedEntities'] if 'deletedEntities' in result[0] else 0
files_list_size = await asyncio.to_thread(graphDb_data_Access.delete_file_from_graph, filenames, source_types, deleteEntities, MERGED_DIR, uri)
message = f"Deleted {files_list_size} documents with entities from database"
end = time.time()
elapsed_time = end - start
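The `post_processing` endpoint now emits its per-request metrics through `logger.log_struct(json_obj)` instead of leaving that call commented out. The construction of `logger` sits outside this diff, so the following is only a minimal sketch of how a structured logger of that shape could be wired up with google-cloud-logging; the log name and payload values are placeholder assumptions.

```python
# Sketch only: assumes `logger` is a google-cloud-logging Logger; the real
# setup and payload live in score.py outside this diff.
from datetime import datetime, timezone
from google.cloud import logging as gcloud_logging

client = gcloud_logging.Client()              # uses application-default credentials
logger = client.logger("llm-graph-builder")   # hypothetical log name

json_obj = {
    "api_name": "post_processing",
    "logging_time": datetime.now(timezone.utc).isoformat(),  # stand-in for formatted_time(...)
    "elapsed_api_time": "1.23",                              # placeholder value
}
logger.log_struct(json_obj)                   # written as a structured jsonPayload entry
```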
10 changes: 1 addition & 9 deletions backend/src/diffbot_transformer.py
@@ -1,11 +1,5 @@
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
#from langchain_community.graphs import Neo4jGraph
from langchain_neo4j import Neo4jGraph
from langchain.docstore.document import Document
from typing import List
import os
import logging
import uuid
from src.llm import get_combined_chunks, get_llm

logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
@@ -14,6 +8,4 @@ def get_graph_from_diffbot(graph,chunkId_chunkDoc_list:List):
combined_chunk_document_list = get_combined_chunks(chunkId_chunkDoc_list)
llm,model_name = get_llm('diffbot')
graph_documents = llm.convert_to_graph_documents(combined_chunk_document_list)
return graph_documents


return graph_documents
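With the unused imports gone, the module reduces to combining chunks and handing them to the Diffbot transformer returned by `get_llm('diffbot')`. A usage sketch under the assumptions that `chunkId_chunkDoc_list` carries `chunk_id`/`chunk_doc` pairs (as it does elsewhere in the backend), that a Neo4j instance is reachable, and that Diffbot credentials are configured; all connection details and chunk contents are illustrative.

```python
# Sketch only: connection details and chunk contents are illustrative.
from langchain.docstore.document import Document
from langchain_neo4j import Neo4jGraph
from src.diffbot_transformer import get_graph_from_diffbot

graph = Neo4jGraph(url="neo4j://localhost:7687", username="neo4j", password="password")
chunkId_chunkDoc_list = [
    {"chunk_id": "0a1b2c",  # hypothetical chunk hash
     "chunk_doc": Document(page_content="Marie Curie won two Nobel Prizes.")},
]
graph_documents = get_graph_from_diffbot(graph, chunkId_chunkDoc_list)
print(graph_documents[0].nodes, graph_documents[0].relationships)
```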
1 change: 0 additions & 1 deletion backend/src/document_sources/gcs_bucket.py
@@ -123,7 +123,6 @@ def merge_file_gcs(bucket_name, original_file_name: str, folder_name_sha1_hashed
logging.info('save the merged file from chunks in gcs')
file_io = io.BytesIO(merged_file)
blob.upload_from_file(file_io)
# pdf_reader = PdfReader(file_io)
file_size = len(merged_file)

return file_size
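`merge_file_gcs` now uploads the merged bytes and reports their size without the leftover `PdfReader` probe. A minimal sketch of the upload-from-memory pattern it relies on, with the bucket name, object path, and file bytes as stand-ins:

```python
# Sketch only: bucket/object names and file bytes are illustrative.
import io
from google.cloud import storage

client = storage.Client()
bucket = client.bucket("my-upload-bucket")        # hypothetical bucket
blob = bucket.blob("merged/report.pdf")           # hypothetical object path

merged_file = b"%PDF-1.7 ..."                     # bytes assembled from the chunk blobs
blob.upload_from_file(io.BytesIO(merged_file))    # stream the in-memory buffer to GCS
file_size = len(merged_file)                      # size in bytes, as merge_file_gcs returns
```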
21 changes: 1 addition & 20 deletions backend/src/document_sources/local_file.py
@@ -1,23 +1,9 @@
import logging
import shutil
from pathlib import Path
from tempfile import NamedTemporaryFile
# from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_core.documents import Document

# def get_documents_from_file_by_bytes(file):
# file_name = file.filename
# logging.info(f"get_documents_from_file called for filename = {file_name}")
# suffix = Path(file.filename).suffix
# with NamedTemporaryFile(delete=True, suffix=suffix) as tmp:
# shutil.copyfileobj(file.file, tmp)
# tmp_path = Path(tmp.name)
# loader = PyPDFLoader(str(tmp_path))
# pages = loader.load_and_split()
# return file_name, pages

def load_document_content(file_path):
if Path(file_path).suffix.lower() == '.pdf':
return PyMuPDFLoader(file_path)
@@ -27,8 +13,7 @@ def load_document_content(file_path):
def get_documents_from_file_by_path(file_path,file_name):
file_path = Path(file_path)
if file_path.exists():
logging.info(f'file {file_name} processing')
# loader = PyPDFLoader(str(file_path))
logging.info(f'file {file_name} processing')
file_extension = file_path.suffix.lower()
try:
loader = load_document_content(file_path)
@@ -58,14 +43,10 @@ def get_pages_with_page_numbers(unstructured_pages):

if page.metadata['page_number']>page_number:
page_number+=1
# if not metadata:
# metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
pages.append(Document(page_content = page_content))
page_content=''

if page == unstructured_pages[-1]:
# if not metadata:
# metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
pages.append(Document(page_content = page_content))

elif page.metadata['category']=='PageBreak' and page!=unstructured_pages[0]:
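After dropping the commented-out PyPDFLoader path, loader selection is centralized in `load_document_content`: PyMuPDFLoader for `.pdf` files, with the non-PDF branch collapsed in this diff. The sketch below mirrors the visible branch and assumes the fallback is UnstructuredFileLoader, since that import remains in the module; the file path is illustrative.

```python
# Sketch only: mirrors the branch shown in load_document_content; the non-PDF
# fallback is an assumption based on the module's remaining imports.
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredFileLoader

def pick_loader(file_path: str):
    if Path(file_path).suffix.lower() == ".pdf":
        return PyMuPDFLoader(file_path)          # page-per-Document PDF loading
    return UnstructuredFileLoader(file_path)     # generic fallback for other formats

pages = pick_loader("/tmp/report.pdf").load()    # illustrative path
```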
1 change: 0 additions & 1 deletion backend/src/document_sources/youtube.py
@@ -5,7 +5,6 @@
from difflib import SequenceMatcher
from datetime import timedelta
from src.shared.constants import YOUTUBE_CHUNK_SIZE_SECONDS
from typing import List, Dict, Any
import os
import re

6 changes: 3 additions & 3 deletions backend/src/graphDB_dataAccess.py
@@ -267,11 +267,11 @@ def get_current_status_document_node(self, file_name):
return self.execute_query(query, param)

def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, merged_dir:str, uri):
# filename_list = filenames.split(',')

filename_list= list(map(str.strip, json.loads(filenames)))
source_types_list= list(map(str.strip, json.loads(source_types)))
gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
# source_types_list = source_types.split(',')

for (file_name,source_type) in zip(filename_list, source_types_list):
merged_file_path = os.path.join(merged_dir, file_name)
if source_type == 'local file' and gcs_file_cache == 'True':
@@ -326,7 +326,7 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
else :
result = self.execute_query(query_to_delete_document, param)
logging.info(f"Deleting {len(filename_list)} documents = '{filename_list}' from '{source_types_list}' with their entities from database")
return result, len(filename_list)
return len(filename_list)

def list_unconnected_nodes(self):
query = """
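`delete_file_from_graph` keeps the JSON-list contract for `filenames` and `source_types` (the old comma-split comments are gone) and now returns only the number of files processed, which is what the updated `delete_document_and_entities` endpoint in score.py consumes. A call sketch, with the connection details, merged directory, helper import location, and `deleteEntities` flag value assumed:

```python
# Sketch only: connection details, merged dir, and the deleteEntities flag are assumed.
import json
from src.graphDB_dataAccess import graphDBdataAccess
from src.main import create_graph_database_connection  # assumed location of the helper

uri, userName, password, database = "neo4j://localhost:7687", "neo4j", "password", "neo4j"
graph = create_graph_database_connection(uri, userName, password, database)
graphDb_data_Access = graphDBdataAccess(graph)

files_list_size = graphDb_data_Access.delete_file_from_graph(
    filenames=json.dumps(["report.pdf", "notes.txt"]),     # JSON-encoded lists, not comma strings
    source_types=json.dumps(["local file", "local file"]),
    deleteEntities="true",                                  # flag arrives as a string from the form
    merged_dir="/tmp/merged_files",
    uri=uri,
)
print(f"Deleted {files_list_size} documents with entities from database")
```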
4 changes: 0 additions & 4 deletions backend/src/graph_query.py
@@ -4,9 +4,6 @@
import os
import json
from src.shared.constants import GRAPH_CHUNK_LIMIT,GRAPH_QUERY,CHUNK_TEXT_QUERY,COUNT_CHUNKS_QUERY
# from neo4j.debug import watch

# watch("neo4j")

def get_graphDB_driver(uri, username, password,database="neo4j"):
"""
@@ -28,7 +25,6 @@ def get_graphDB_driver(uri, username, password,database="neo4j"):
except Exception as e:
error_message = f"graph_query module: Failed to connect to the database at {uri}."
logging.error(error_message, exc_info=True)
# raise Exception(error_message) from e


def execute_query(driver, query,document_names,doc_limit=None):
9 changes: 3 additions & 6 deletions backend/src/main.py
@@ -70,7 +70,6 @@ def create_source_node_graph_url_s3(graph, model, source_url, aws_access_key_id,

except Exception as e:
failed_count+=1
# error_message = str(e)
lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url,'status':'Failed'})
return lst_file_name,success_count,failed_count

@@ -170,7 +169,6 @@ def create_source_node_graph_url_youtube(graph, model, source_url, source_type):
obj_source_node.communityRelCount=0
match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',obj_source_node.url)
logging.info(f"match value: {match}")
video_id = parse_qs(urlparse(youtube_url).query).get('v')
obj_source_node.file_name = match.group(1)
transcript= get_youtube_combined_transcript(match.group(1))
logging.info(f"Youtube transcript : {transcript}")
@@ -192,7 +190,6 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type
success_count=0
failed_count=0
lst_file_name=[]
#queries_list = wiki_query.split(',')
wiki_query_id, language = check_url_source(source_type=source_type, wiki_query=wiki_query)
logging.info(f"Creating source node for {wiki_query_id.strip()}, {language}")
pages = WikipediaLoader(query=wiki_query_id.strip(), lang=language, load_max_docs=1, load_all_available_meta=True).load()
@@ -354,7 +351,7 @@ async def processing_source(uri, userName, password, database, model, file_name,

start_update_source_node = time.time()
graphDb_data_Access.update_source_node(obj_source_node)
count_response = graphDb_data_Access.update_node_relationship_count(file_name)
graphDb_data_Access.update_node_relationship_count(file_name)
end_update_source_node = time.time()
elapsed_update_source_node = end_update_source_node - start_update_source_node
logging.info(f'Time taken to update the document source node: {elapsed_update_source_node:.2f} seconds')
@@ -403,7 +400,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
obj_source_node.node_count = node_count
obj_source_node.relationship_count = rel_count
graphDb_data_Access.update_source_node(obj_source_node)
count_response = graphDb_data_Access.update_node_relationship_count(file_name)
graphDb_data_Access.update_node_relationship_count(file_name)

result = graphDb_data_Access.get_current_status_document_node(file_name)
is_cancelled_status = result[0]['is_cancelled']
@@ -419,7 +416,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
obj_source_node.processing_time = processed_time

graphDb_data_Access.update_source_node(obj_source_node)
count_response = graphDb_data_Access.update_node_relationship_count(file_name)
graphDb_data_Access.update_node_relationship_count(file_name)
logging.info('Updated the nodeCount and relCount properties in Document node')
logging.info(f'file:{file_name} extraction has been completed')

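With the redundant urlparse-based lookup removed, `create_source_node_graph_url_youtube` derives the video id solely from the `(?:v=)([0-9A-Za-z_-]{11})` regex shown in the diff above. A worked example of that extraction, with an illustrative URL:

```python
# Worked example of the regex used above; the URL is illustrative.
import re

url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*', url)
video_id = match.group(1) if match else None
print(video_id)  # -> dQw4w9WgXcQ
```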
12 changes: 2 additions & 10 deletions backend/src/make_relationships.py
@@ -16,7 +16,7 @@
def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
batch_data = []
logging.info("Create HAS_ENTITY relationship between chunks and entities")
chunk_node_id_set = 'id:"{}"'

for graph_doc_chunk_id in graph_documents_chunk_chunk_Id:
for node in graph_doc_chunk_id['graph_doc'].nodes:
query_data={
@@ -25,10 +25,6 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume
'node_id': node.id
}
batch_data.append(query_data)
#node_id = node.id
#Below query is also unable to change as parametrize because we can't make parameter of Label or node type
#https://neo4j.com/docs/cypher-manual/current/syntax/parameters/
#graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(graph_doc_chunk_id['chunk_id'])+'}) MERGE (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:HAS_ENTITY]->(n)')

if batch_data:
unwind_query = """
@@ -41,19 +37,16 @@


def create_chunk_embeddings(graph, chunkId_chunkDoc_list, file_name):
#create embedding

isEmbedding = os.getenv('IS_EMBEDDING')
# embedding_model = os.getenv('EMBEDDING_MODEL')

embeddings, dimension = EMBEDDING_FUNCTION , EMBEDDING_DIMENSION
logging.info(f'embedding model:{embeddings} and dimesion:{dimension}')
data_for_query = []
logging.info(f"update embedding and vector index for chunks")
for row in chunkId_chunkDoc_list:
# for graph_document in row['graph_doc']:
if isEmbedding.upper() == "TRUE":
embeddings_arr = embeddings.embed_query(row['chunk_doc'].page_content)
# logging.info(f'Embedding list {embeddings_arr}')

data_for_query.append({
"chunkId": row['chunk_id'],
@@ -82,7 +75,6 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li
current_chunk_id = page_content_sha1.hexdigest()
position = i + 1
if i>0:
#offset += len(tiktoken.encoding_for_model("gpt2").encode(chunk.page_content))
offset += len(chunks[i-1].page_content)
if i == 0:
firstChunk = True
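The per-node string-formatted MERGE (and the comment explaining why node labels cannot be Cypher parameters) has been dropped in favour of the batched `unwind_query`, whose text is collapsed in this diff view. One possible shape for such a batch query is sketched below purely as a hypothetical, assuming APOC is available to merge nodes with a runtime label, since labels still cannot be parameterized directly; the batch values are illustrative.

```python
# Hypothetical sketch only: the real unwind_query is not visible in this diff.
batch_data = [
    {"chunk_id": "0a1b2c", "node_type": "Person", "node_id": "Marie Curie"},
]
unwind_query = """
    UNWIND $batch_data AS data
    MATCH (c:Chunk {id: data.chunk_id})
    CALL apoc.merge.node([data.node_type], {id: data.node_id}) YIELD node AS n
    MERGE (c)-[:HAS_ENTITY]->(n)
"""
graph.query(unwind_query, params={"batch_data": batch_data})  # graph: langchain Neo4jGraph
```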
1 change: 0 additions & 1 deletion backend/src/shared/schema_extraction.py
@@ -1,5 +1,4 @@
from typing import List
#from langchain_core.pydantic_v1 import BaseModel, Field
from pydantic.v1 import BaseModel, Field
from src.llm import get_llm
from langchain_core.prompts import ChatPromptTemplate
1 change: 0 additions & 1 deletion frontend/src/types.ts
@@ -74,7 +74,6 @@ export type ExtractParams = Pick<CustomFile, 'wikiQuery' | 'model' | 'sourceUrl'
file?: File;
aws_access_key_id?: string | null;
aws_secret_access_key?: string | null;
max_sources?: number;
gcs_bucket_name?: string;
gcs_bucket_folder?: string;
gcs_blob_filename?: string;