diff --git a/backend/score.py b/backend/score.py
index af4eac282..492a0c786 100644
--- a/backend/score.py
+++ b/backend/score.py
@@ -76,8 +76,6 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send):
         )
         await gzip_middleware(scope, receive, send)
 app = FastAPI()
-# SecWeb(app=app, Option={'referrer': False, 'xframe': False})
-# app.add_middleware(ContentSecurityPolicy, Option={'default-src': ["'self'"], 'base-uri': ["'self'"], 'block-all-mixed-content': []}, script_nonce=False, style_nonce=False, report_only=False)
 app.add_middleware(XContentTypeOptions)
 app.add_middleware(XFrame, Option={'X-Frame-Options': 'DENY'})
 app.add_middleware(CustomGZipMiddleware, minimum_size=1000, compresslevel=5,paths=["/sources_list","/url/scan","/extract","/chat_bot","/chunk_entities","/get_neighbours","/graph_query","/schema","/populate_graph_schema","/get_unconnected_nodes_list","/get_duplicate_nodes","/fetch_chunktext"])
@@ -99,7 +97,6 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send):
 
 @app.post("/url/scan")
 async def create_source_knowledge_graph_url(
-    request: Request,
     uri=Form(),
     userName=Form(),
     password=Form(),
@@ -172,7 +169,6 @@ async def extract_knowledge_graph_from_file(
     aws_access_key_id=Form(None),
     aws_secret_access_key=Form(None),
     wiki_query=Form(None),
-    max_sources=Form(None),
     gcs_project_id=Form(None),
     gcs_bucket_name=Form(None),
     gcs_bucket_folder=Form(None),
@@ -345,7 +341,7 @@ async def post_processing(uri=Form(), userName=Form(), password=Form(), database
         end = time.time()
         elapsed_time = end - start
         json_obj = {'api_name': api_name, 'db_url': uri, 'userName':userName, 'database':database, 'tasks':tasks, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}'}
-        # logger.log_struct(json_obj)
+        logger.log_struct(json_obj)
         return create_api_response('Success', data=count_response, message='All tasks completed successfully')
 
     except Exception as e:
@@ -615,8 +611,7 @@ async def delete_document_and_entities(uri=Form(),
         start = time.time()
         graph = create_graph_database_connection(uri, userName, password, database)
         graphDb_data_Access = graphDBdataAccess(graph)
-        result, files_list_size = await asyncio.to_thread(graphDb_data_Access.delete_file_from_graph, filenames, source_types, deleteEntities, MERGED_DIR, uri)
-        # entities_count = result[0]['deletedEntities'] if 'deletedEntities' in result[0] else 0
+        files_list_size = await asyncio.to_thread(graphDb_data_Access.delete_file_from_graph, filenames, source_types, deleteEntities, MERGED_DIR, uri)
         message = f"Deleted {files_list_size} documents with entities from database"
         end = time.time()
         elapsed_time = end - start
diff --git a/backend/src/diffbot_transformer.py b/backend/src/diffbot_transformer.py
index e16e54efb..03e1ba69e 100644
--- a/backend/src/diffbot_transformer.py
+++ b/backend/src/diffbot_transformer.py
@@ -1,11 +1,5 @@
-from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
-#from langchain_community.graphs import Neo4jGraph
-from langchain_neo4j import Neo4jGraph
-from langchain.docstore.document import Document
 from typing import List
-import os
 import logging
-import uuid
 from src.llm import get_combined_chunks, get_llm
 
 logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
@@ -14,6 +8,4 @@ def get_graph_from_diffbot(graph,chunkId_chunkDoc_list:List):
     combined_chunk_document_list = get_combined_chunks(chunkId_chunkDoc_list)
     llm,model_name = get_llm('diffbot')
     graph_documents = llm.convert_to_graph_documents(combined_chunk_document_list)
-    return graph_documents
-
-
\ No newline at end of file
+    return graph_documents
\ No newline at end of file
diff --git a/backend/src/document_sources/gcs_bucket.py b/backend/src/document_sources/gcs_bucket.py
index 3aaf42e12..21cec22e1 100644
--- a/backend/src/document_sources/gcs_bucket.py
+++ b/backend/src/document_sources/gcs_bucket.py
@@ -123,7 +123,6 @@ def merge_file_gcs(bucket_name, original_file_name: str, folder_name_sha1_hashed
     logging.info('save the merged file from chunks in gcs')
     file_io = io.BytesIO(merged_file)
     blob.upload_from_file(file_io)
-    # pdf_reader = PdfReader(file_io)
     file_size = len(merged_file)
     return file_size
 
diff --git a/backend/src/document_sources/local_file.py b/backend/src/document_sources/local_file.py
index 3d5bc08db..f674a202f 100644
--- a/backend/src/document_sources/local_file.py
+++ b/backend/src/document_sources/local_file.py
@@ -1,23 +1,9 @@
 import logging
-import shutil
 from pathlib import Path
-from tempfile import NamedTemporaryFile
-# from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.document_loaders import PyMuPDFLoader
 from langchain_community.document_loaders import UnstructuredFileLoader
 from langchain_core.documents import Document
 
-# def get_documents_from_file_by_bytes(file):
-#   file_name = file.filename
-#   logging.info(f"get_documents_from_file called for filename = {file_name}")
-#   suffix = Path(file.filename).suffix
-#   with NamedTemporaryFile(delete=True, suffix=suffix) as tmp:
-#     shutil.copyfileobj(file.file, tmp)
-#     tmp_path = Path(tmp.name)
-#     loader = PyPDFLoader(str(tmp_path))
-#     pages = loader.load_and_split()
-#   return file_name, pages
-
 def load_document_content(file_path):
     if Path(file_path).suffix.lower() == '.pdf':
        return PyMuPDFLoader(file_path)
@@ -27,8 +13,7 @@
 def get_documents_from_file_by_path(file_path,file_name):
   file_path = Path(file_path)
   if file_path.exists():
-    logging.info(f'file {file_name} processing')
-    # loader = PyPDFLoader(str(file_path))
+    logging.info(f'file {file_name} processing')
     file_extension = file_path.suffix.lower()
     try:
       loader = load_document_content(file_path)
@@ -58,14 +43,10 @@ def get_pages_with_page_numbers(unstructured_pages):
        if page.metadata['page_number']>page_number:
           page_number+=1
-          # if not metadata:
-          #   metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
           pages.append(Document(page_content = page_content))
           page_content=''
 
        if page == unstructured_pages[-1]:
-          # if not metadata:
-          #   metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
           pages.append(Document(page_content = page_content))
 
     elif page.metadata['category']=='PageBreak' and page!=unstructured_pages[0]:
diff --git a/backend/src/document_sources/youtube.py b/backend/src/document_sources/youtube.py
index dee97e230..1b3776db5 100644
--- a/backend/src/document_sources/youtube.py
+++ b/backend/src/document_sources/youtube.py
@@ -5,7 +5,6 @@
 from difflib import SequenceMatcher
 from datetime import timedelta
 from src.shared.constants import YOUTUBE_CHUNK_SIZE_SECONDS
-from typing import List, Dict, Any
 import os
 import re
 
diff --git a/backend/src/graphDB_dataAccess.py b/backend/src/graphDB_dataAccess.py
index e578f3be2..60b584a27 100644
--- a/backend/src/graphDB_dataAccess.py
+++ b/backend/src/graphDB_dataAccess.py
@@ -267,11 +267,11 @@ def get_current_status_document_node(self, file_name):
     return self.execute_query(query, param)
 
   def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, merged_dir:str, uri):
-    # filename_list = filenames.split(',')
+
    filename_list= list(map(str.strip, json.loads(filenames)))
    source_types_list= list(map(str.strip, json.loads(source_types)))
    gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
-    # source_types_list = source_types.split(',')
+
    for (file_name,source_type) in zip(filename_list, source_types_list):
      merged_file_path = os.path.join(merged_dir, file_name)
      if source_type == 'local file' and gcs_file_cache == 'True':
@@ -326,7 +326,7 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
     else :
       result = self.execute_query(query_to_delete_document, param)
     logging.info(f"Deleting {len(filename_list)} documents = '{filename_list}' from '{source_types_list}' with their entities from database")
-    return result, len(filename_list)
+    return len(filename_list)
 
   def list_unconnected_nodes(self):
     query = """
diff --git a/backend/src/graph_query.py b/backend/src/graph_query.py
index 8583d8cc0..aefaacbd1 100644
--- a/backend/src/graph_query.py
+++ b/backend/src/graph_query.py
@@ -4,9 +4,6 @@
 import os
 import json
 from src.shared.constants import GRAPH_CHUNK_LIMIT,GRAPH_QUERY,CHUNK_TEXT_QUERY,COUNT_CHUNKS_QUERY
-# from neo4j.debug import watch
-
-# watch("neo4j")
 
 def get_graphDB_driver(uri, username, password,database="neo4j"):
     """
@@ -28,7 +25,6 @@ def get_graphDB_driver(uri, username, password,database="neo4j"):
     except Exception as e:
         error_message = f"graph_query module: Failed to connect to the database at {uri}."
         logging.error(error_message, exc_info=True)
-        # raise Exception(error_message) from e
 
 
 def execute_query(driver, query,document_names,doc_limit=None):
diff --git a/backend/src/main.py b/backend/src/main.py
index 03ee9a4b0..1558aa427 100644
--- a/backend/src/main.py
+++ b/backend/src/main.py
@@ -70,7 +70,6 @@ def create_source_node_graph_url_s3(graph, model, source_url, aws_access_key_id,
 
     except Exception as e:
       failed_count+=1
-      # error_message = str(e)
       lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url,'status':'Failed'})
   return lst_file_name,success_count,failed_count
 
@@ -170,7 +169,6 @@ def create_source_node_graph_url_youtube(graph, model, source_url, source_type):
     obj_source_node.communityRelCount=0
     match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',obj_source_node.url)
     logging.info(f"match value: {match}")
-    video_id = parse_qs(urlparse(youtube_url).query).get('v')
     obj_source_node.file_name = match.group(1)
     transcript= get_youtube_combined_transcript(match.group(1))
     logging.info(f"Youtube transcript : {transcript}")
@@ -192,7 +190,6 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type
   success_count=0
   failed_count=0
   lst_file_name=[]
-  #queries_list = wiki_query.split(',')
   wiki_query_id, language = check_url_source(source_type=source_type, wiki_query=wiki_query)
   logging.info(f"Creating source node for {wiki_query_id.strip()}, {language}")
   pages = WikipediaLoader(query=wiki_query_id.strip(), lang=language, load_max_docs=1, load_all_available_meta=True).load()
@@ -354,7 +351,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
 
     start_update_source_node = time.time()
     graphDb_data_Access.update_source_node(obj_source_node)
-    count_response = graphDb_data_Access.update_node_relationship_count(file_name)
+    graphDb_data_Access.update_node_relationship_count(file_name)
     end_update_source_node = time.time()
     elapsed_update_source_node = end_update_source_node - start_update_source_node
     logging.info(f'Time taken to update the document source node: {elapsed_update_source_node:.2f} seconds')
@@ -403,7 +400,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
         obj_source_node.node_count = node_count
         obj_source_node.relationship_count = rel_count
         graphDb_data_Access.update_source_node(obj_source_node)
-        count_response = graphDb_data_Access.update_node_relationship_count(file_name)
+        graphDb_data_Access.update_node_relationship_count(file_name)
 
         result = graphDb_data_Access.get_current_status_document_node(file_name)
         is_cancelled_status = result[0]['is_cancelled']
@@ -419,7 +416,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
         obj_source_node.processing_time = processed_time
 
         graphDb_data_Access.update_source_node(obj_source_node)
-        count_response = graphDb_data_Access.update_node_relationship_count(file_name)
+        graphDb_data_Access.update_node_relationship_count(file_name)
         logging.info('Updated the nodeCount and relCount properties in Document node')
         logging.info(f'file:{file_name} extraction has been completed')
 
diff --git a/backend/src/make_relationships.py b/backend/src/make_relationships.py
index 410a383fd..28f4f9efb 100644
--- a/backend/src/make_relationships.py
+++ b/backend/src/make_relationships.py
@@ -16,7 +16,7 @@
 def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
     batch_data = []
     logging.info("Create HAS_ENTITY relationship between chunks and entities")
-    chunk_node_id_set = 'id:"{}"'
+
     for graph_doc_chunk_id in graph_documents_chunk_chunk_Id:
         for node in graph_doc_chunk_id['graph_doc'].nodes:
             query_data={
@@ -25,10 +25,6 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume
                 'node_id': node.id
             }
             batch_data.append(query_data)
-            #node_id = node.id
-            #Below query is also unable to change as parametrize because we can't make parameter of Label or node type
-            #https://neo4j.com/docs/cypher-manual/current/syntax/parameters/
-            #graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(graph_doc_chunk_id['chunk_id'])+'}) MERGE (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:HAS_ENTITY]->(n)')
 
     if batch_data:
         unwind_query = """
@@ -41,19 +37,16 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume
 
 
 def create_chunk_embeddings(graph, chunkId_chunkDoc_list, file_name):
-    #create embedding
+
     isEmbedding = os.getenv('IS_EMBEDDING')
-    # embedding_model = os.getenv('EMBEDDING_MODEL')
 
     embeddings, dimension = EMBEDDING_FUNCTION , EMBEDDING_DIMENSION
     logging.info(f'embedding model:{embeddings} and dimesion:{dimension}')
     data_for_query = []
     logging.info(f"update embedding and vector index for chunks")
     for row in chunkId_chunkDoc_list:
-        # for graph_document in row['graph_doc']:
         if isEmbedding.upper() == "TRUE":
             embeddings_arr = embeddings.embed_query(row['chunk_doc'].page_content)
-            # logging.info(f'Embedding list {embeddings_arr}')
 
             data_for_query.append({
                 "chunkId": row['chunk_id'],
@@ -82,7 +75,6 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li
         current_chunk_id = page_content_sha1.hexdigest()
         position = i + 1
         if i>0:
-            #offset += len(tiktoken.encoding_for_model("gpt2").encode(chunk.page_content))
             offset += len(chunks[i-1].page_content)
         if i == 0:
             firstChunk = True
diff --git a/backend/src/shared/schema_extraction.py b/backend/src/shared/schema_extraction.py
index 1b7f76c92..d57703e37 100644
--- a/backend/src/shared/schema_extraction.py
+++ b/backend/src/shared/schema_extraction.py
@@ -1,5 +1,4 @@
 from typing import List
-#from langchain_core.pydantic_v1 import BaseModel, Field
 from pydantic.v1 import BaseModel, Field
 from src.llm import get_llm
 from langchain_core.prompts import ChatPromptTemplate
diff --git a/frontend/src/types.ts b/frontend/src/types.ts
index ef4d00ad9..915920028 100644
--- a/frontend/src/types.ts
+++ b/frontend/src/types.ts
@@ -74,7 +74,6 @@ export type ExtractParams = Pick