
Removed commented code and unused library #973

Merged
merged 1 commit on Dec 31, 2024
9 changes: 2 additions & 7 deletions backend/score.py
@@ -76,8 +76,6 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send):
)
await gzip_middleware(scope, receive, send)
app = FastAPI()
# SecWeb(app=app, Option={'referrer': False, 'xframe': False})
# app.add_middleware(ContentSecurityPolicy, Option={'default-src': ["'self'"], 'base-uri': ["'self'"], 'block-all-mixed-content': []}, script_nonce=False, style_nonce=False, report_only=False)
app.add_middleware(XContentTypeOptions)
app.add_middleware(XFrame, Option={'X-Frame-Options': 'DENY'})
app.add_middleware(CustomGZipMiddleware, minimum_size=1000, compresslevel=5,paths=["/sources_list","/url/scan","/extract","/chat_bot","/chunk_entities","/get_neighbours","/graph_query","/schema","/populate_graph_schema","/get_unconnected_nodes_list","/get_duplicate_nodes","/fetch_chunktext"])
@@ -99,7 +97,6 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send):

@app.post("/url/scan")
async def create_source_knowledge_graph_url(
request: Request,
uri=Form(),
userName=Form(),
password=Form(),
@@ -172,7 +169,6 @@ async def extract_knowledge_graph_from_file(
aws_access_key_id=Form(None),
aws_secret_access_key=Form(None),
wiki_query=Form(None),
max_sources=Form(None),
gcs_project_id=Form(None),
gcs_bucket_name=Form(None),
gcs_bucket_folder=Form(None),
@@ -345,7 +341,7 @@ async def post_processing(uri=Form(), userName=Form(), password=Form(), database
end = time.time()
elapsed_time = end - start
json_obj = {'api_name': api_name, 'db_url': uri, 'userName':userName, 'database':database, 'tasks':tasks, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}'}
# logger.log_struct(json_obj)
logger.log_struct(json_obj)
return create_api_response('Success', data=count_response, message='All tasks completed successfully')

except Exception as e:
@@ -615,8 +611,7 @@ async def delete_document_and_entities(uri=Form(),
start = time.time()
graph = create_graph_database_connection(uri, userName, password, database)
graphDb_data_Access = graphDBdataAccess(graph)
result, files_list_size = await asyncio.to_thread(graphDb_data_Access.delete_file_from_graph, filenames, source_types, deleteEntities, MERGED_DIR, uri)
# entities_count = result[0]['deletedEntities'] if 'deletedEntities' in result[0] else 0
files_list_size = await asyncio.to_thread(graphDb_data_Access.delete_file_from_graph, filenames, source_types, deleteEntities, MERGED_DIR, uri)
message = f"Deleted {files_list_size} documents with entities from database"
end = time.time()
elapsed_time = end - start
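The `post_processing` endpoint now emits its per-request metrics through `logger.log_struct(json_obj)` instead of leaving that call commented out. The construction of `logger` sits outside this diff, so the following is only a minimal sketch of how a structured logger of that shape could be wired up with google-cloud-logging; the log name and payload values are placeholder assumptions.

```python
# Sketch only: assumes `logger` is a google-cloud-logging Logger; the real
# setup and payload live in score.py outside this diff.
from datetime import datetime, timezone
from google.cloud import logging as gcloud_logging

client = gcloud_logging.Client()              # uses application-default credentials
logger = client.logger("llm-graph-builder")   # hypothetical log name

json_obj = {
    "api_name": "post_processing",
    "logging_time": datetime.now(timezone.utc).isoformat(),  # stand-in for formatted_time(...)
    "elapsed_api_time": "1.23",                              # placeholder value
}
logger.log_struct(json_obj)                   # written as a structured jsonPayload entry
```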
10 changes: 1 addition & 9 deletions backend/src/diffbot_transformer.py
@@ -1,11 +1,5 @@
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
#from langchain_community.graphs import Neo4jGraph
from langchain_neo4j import Neo4jGraph
from langchain.docstore.document import Document
from typing import List
import os
import logging
import uuid
from src.llm import get_combined_chunks, get_llm

logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
@@ -14,6 +8,4 @@ def get_graph_from_diffbot(graph,chunkId_chunkDoc_list:List):
combined_chunk_document_list = get_combined_chunks(chunkId_chunkDoc_list)
llm,model_name = get_llm('diffbot')
graph_documents = llm.convert_to_graph_documents(combined_chunk_document_list)
return graph_documents


return graph_documents
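With the unused imports gone, the module reduces to combining chunks and handing them to the Diffbot transformer returned by `get_llm('diffbot')`. A usage sketch under the assumptions that `chunkId_chunkDoc_list` carries `chunk_id`/`chunk_doc` pairs (as it does elsewhere in the backend), that a Neo4j instance is reachable, and that Diffbot credentials are configured; all connection details and chunk contents are illustrative.

```python
# Sketch only: connection details and chunk contents are illustrative.
from langchain.docstore.document import Document
from langchain_neo4j import Neo4jGraph
from src.diffbot_transformer import get_graph_from_diffbot

graph = Neo4jGraph(url="neo4j://localhost:7687", username="neo4j", password="password")
chunkId_chunkDoc_list = [
    {"chunk_id": "0a1b2c",  # hypothetical chunk hash
     "chunk_doc": Document(page_content="Marie Curie won two Nobel Prizes.")},
]
graph_documents = get_graph_from_diffbot(graph, chunkId_chunkDoc_list)
print(graph_documents[0].nodes, graph_documents[0].relationships)
```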
1 change: 0 additions & 1 deletion backend/src/document_sources/gcs_bucket.py
@@ -123,7 +123,6 @@ def merge_file_gcs(bucket_name, original_file_name: str, folder_name_sha1_hashed
logging.info('save the merged file from chunks in gcs')
file_io = io.BytesIO(merged_file)
blob.upload_from_file(file_io)
# pdf_reader = PdfReader(file_io)
file_size = len(merged_file)

return file_size
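`merge_file_gcs` now uploads the merged bytes and reports their size without the leftover `PdfReader` probe. A minimal sketch of the upload-from-memory pattern it relies on, with the bucket name, object path, and file bytes as stand-ins:

```python
# Sketch only: bucket/object names and file bytes are illustrative.
import io
from google.cloud import storage

client = storage.Client()
bucket = client.bucket("my-upload-bucket")        # hypothetical bucket
blob = bucket.blob("merged/report.pdf")           # hypothetical object path

merged_file = b"%PDF-1.7 ..."                     # bytes assembled from the chunk blobs
blob.upload_from_file(io.BytesIO(merged_file))    # stream the in-memory buffer to GCS
file_size = len(merged_file)                      # size in bytes, as merge_file_gcs returns
```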
21 changes: 1 addition & 20 deletions backend/src/document_sources/local_file.py
@@ -1,23 +1,9 @@
import logging
import shutil
from pathlib import Path
from tempfile import NamedTemporaryFile
# from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_core.documents import Document

# def get_documents_from_file_by_bytes(file):
# file_name = file.filename
# logging.info(f"get_documents_from_file called for filename = {file_name}")
# suffix = Path(file.filename).suffix
# with NamedTemporaryFile(delete=True, suffix=suffix) as tmp:
# shutil.copyfileobj(file.file, tmp)
# tmp_path = Path(tmp.name)
# loader = PyPDFLoader(str(tmp_path))
# pages = loader.load_and_split()
# return file_name, pages

def load_document_content(file_path):
if Path(file_path).suffix.lower() == '.pdf':
return PyMuPDFLoader(file_path)
@@ -27,8 +13,7 @@ def load_document_content(file_path):
def get_documents_from_file_by_path(file_path,file_name):
file_path = Path(file_path)
if file_path.exists():
logging.info(f'file {file_name} processing')
# loader = PyPDFLoader(str(file_path))
logging.info(f'file {file_name} processing')
file_extension = file_path.suffix.lower()
try:
loader = load_document_content(file_path)
@@ -58,14 +43,10 @@ def get_pages_with_page_numbers(unstructured_pages):

if page.metadata['page_number']>page_number:
page_number+=1
# if not metadata:
# metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
pages.append(Document(page_content = page_content))
page_content=''

if page == unstructured_pages[-1]:
# if not metadata:
# metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
pages.append(Document(page_content = page_content))

elif page.metadata['category']=='PageBreak' and page!=unstructured_pages[0]:
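After dropping the commented-out PyPDFLoader path, loader selection is centralized in `load_document_content`: PyMuPDFLoader for `.pdf` files, with the non-PDF branch collapsed in this diff. The sketch below mirrors the visible branch and assumes the fallback is UnstructuredFileLoader, since that import remains in the module; the file path is illustrative.

```python
# Sketch only: mirrors the branch shown in load_document_content; the non-PDF
# fallback is an assumption based on the module's remaining imports.
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredFileLoader

def pick_loader(file_path: str):
    if Path(file_path).suffix.lower() == ".pdf":
        return PyMuPDFLoader(file_path)          # page-per-Document PDF loading
    return UnstructuredFileLoader(file_path)     # generic fallback for other formats

pages = pick_loader("/tmp/report.pdf").load()    # illustrative path
```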
1 change: 0 additions & 1 deletion backend/src/document_sources/youtube.py
@@ -5,7 +5,6 @@
from difflib import SequenceMatcher
from datetime import timedelta
from src.shared.constants import YOUTUBE_CHUNK_SIZE_SECONDS
from typing import List, Dict, Any
import os
import re

6 changes: 3 additions & 3 deletions backend/src/graphDB_dataAccess.py
@@ -267,11 +267,11 @@ def get_current_status_document_node(self, file_name):
return self.execute_query(query, param)

def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, merged_dir:str, uri):
# filename_list = filenames.split(',')

filename_list= list(map(str.strip, json.loads(filenames)))
source_types_list= list(map(str.strip, json.loads(source_types)))
gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
# source_types_list = source_types.split(',')

for (file_name,source_type) in zip(filename_list, source_types_list):
merged_file_path = os.path.join(merged_dir, file_name)
if source_type == 'local file' and gcs_file_cache == 'True':
@@ -326,7 +326,7 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
else :
result = self.execute_query(query_to_delete_document, param)
logging.info(f"Deleting {len(filename_list)} documents = '{filename_list}' from '{source_types_list}' with their entities from database")
return result, len(filename_list)
return len(filename_list)

def list_unconnected_nodes(self):
query = """
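`delete_file_from_graph` keeps the JSON-list contract for `filenames` and `source_types` (the old comma-split comments are gone) and now returns only the number of files processed, which is what the updated `delete_document_and_entities` endpoint in score.py consumes. A call sketch, with the connection details, merged directory, helper import location, and `deleteEntities` flag value assumed:

```python
# Sketch only: connection details, merged dir, and the deleteEntities flag are assumed.
import json
from src.graphDB_dataAccess import graphDBdataAccess
from src.main import create_graph_database_connection  # assumed location of the helper

uri, userName, password, database = "neo4j://localhost:7687", "neo4j", "password", "neo4j"
graph = create_graph_database_connection(uri, userName, password, database)
graphDb_data_Access = graphDBdataAccess(graph)

files_list_size = graphDb_data_Access.delete_file_from_graph(
    filenames=json.dumps(["report.pdf", "notes.txt"]),     # JSON-encoded lists, not comma strings
    source_types=json.dumps(["local file", "local file"]),
    deleteEntities="true",                                  # flag arrives as a string from the form
    merged_dir="/tmp/merged_files",
    uri=uri,
)
print(f"Deleted {files_list_size} documents with entities from database")
```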
4 changes: 0 additions & 4 deletions backend/src/graph_query.py
@@ -4,9 +4,6 @@
import os
import json
from src.shared.constants import GRAPH_CHUNK_LIMIT,GRAPH_QUERY,CHUNK_TEXT_QUERY,COUNT_CHUNKS_QUERY
# from neo4j.debug import watch

# watch("neo4j")

def get_graphDB_driver(uri, username, password,database="neo4j"):
"""
@@ -28,7 +25,6 @@ def get_graphDB_driver(uri, username, password,database="neo4j"):
except Exception as e:
error_message = f"graph_query module: Failed to connect to the database at {uri}."
logging.error(error_message, exc_info=True)
# raise Exception(error_message) from e


def execute_query(driver, query,document_names,doc_limit=None):
9 changes: 3 additions & 6 deletions backend/src/main.py
@@ -70,7 +70,6 @@ def create_source_node_graph_url_s3(graph, model, source_url, aws_access_key_id,

except Exception as e:
failed_count+=1
# error_message = str(e)
lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url,'status':'Failed'})
return lst_file_name,success_count,failed_count

@@ -170,7 +169,6 @@ def create_source_node_graph_url_youtube(graph, model, source_url, source_type):
obj_source_node.communityRelCount=0
match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',obj_source_node.url)
logging.info(f"match value: {match}")
video_id = parse_qs(urlparse(youtube_url).query).get('v')
obj_source_node.file_name = match.group(1)
transcript= get_youtube_combined_transcript(match.group(1))
logging.info(f"Youtube transcript : {transcript}")
@@ -192,7 +190,6 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type
success_count=0
failed_count=0
lst_file_name=[]
#queries_list = wiki_query.split(',')
wiki_query_id, language = check_url_source(source_type=source_type, wiki_query=wiki_query)
logging.info(f"Creating source node for {wiki_query_id.strip()}, {language}")
pages = WikipediaLoader(query=wiki_query_id.strip(), lang=language, load_max_docs=1, load_all_available_meta=True).load()
@@ -354,7 +351,7 @@ async def processing_source(uri, userName, password, database, model, file_name,

start_update_source_node = time.time()
graphDb_data_Access.update_source_node(obj_source_node)
count_response = graphDb_data_Access.update_node_relationship_count(file_name)
graphDb_data_Access.update_node_relationship_count(file_name)
end_update_source_node = time.time()
elapsed_update_source_node = end_update_source_node - start_update_source_node
logging.info(f'Time taken to update the document source node: {elapsed_update_source_node:.2f} seconds')
@@ -403,7 +400,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
obj_source_node.node_count = node_count
obj_source_node.relationship_count = rel_count
graphDb_data_Access.update_source_node(obj_source_node)
count_response = graphDb_data_Access.update_node_relationship_count(file_name)
graphDb_data_Access.update_node_relationship_count(file_name)

result = graphDb_data_Access.get_current_status_document_node(file_name)
is_cancelled_status = result[0]['is_cancelled']
@@ -419,7 +416,7 @@ async def processing_source(uri, userName, password, database, model, file_name,
obj_source_node.processing_time = processed_time

graphDb_data_Access.update_source_node(obj_source_node)
count_response = graphDb_data_Access.update_node_relationship_count(file_name)
graphDb_data_Access.update_node_relationship_count(file_name)
logging.info('Updated the nodeCount and relCount properties in Document node')
logging.info(f'file:{file_name} extraction has been completed')

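With the redundant urlparse-based lookup removed, `create_source_node_graph_url_youtube` derives the video id solely from the `(?:v=)([0-9A-Za-z_-]{11})` regex shown in the diff above. A worked example of that extraction, with an illustrative URL:

```python
# Worked example of the regex used above; the URL is illustrative.
import re

url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*', url)
video_id = match.group(1) if match else None
print(video_id)  # -> dQw4w9WgXcQ
```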
12 changes: 2 additions & 10 deletions backend/src/make_relationships.py
@@ -16,7 +16,7 @@
def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
batch_data = []
logging.info("Create HAS_ENTITY relationship between chunks and entities")
chunk_node_id_set = 'id:"{}"'

for graph_doc_chunk_id in graph_documents_chunk_chunk_Id:
for node in graph_doc_chunk_id['graph_doc'].nodes:
query_data={
@@ -25,10 +25,6 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume
'node_id': node.id
}
batch_data.append(query_data)
#node_id = node.id
#Below query is also unable to change as parametrize because we can't make parameter of Label or node type
#https://neo4j.com/docs/cypher-manual/current/syntax/parameters/
#graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(graph_doc_chunk_id['chunk_id'])+'}) MERGE (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:HAS_ENTITY]->(n)')

if batch_data:
unwind_query = """
@@ -41,19 +37,16 @@


def create_chunk_embeddings(graph, chunkId_chunkDoc_list, file_name):
#create embedding

isEmbedding = os.getenv('IS_EMBEDDING')
# embedding_model = os.getenv('EMBEDDING_MODEL')

embeddings, dimension = EMBEDDING_FUNCTION , EMBEDDING_DIMENSION
logging.info(f'embedding model:{embeddings} and dimesion:{dimension}')
data_for_query = []
logging.info(f"update embedding and vector index for chunks")
for row in chunkId_chunkDoc_list:
# for graph_document in row['graph_doc']:
if isEmbedding.upper() == "TRUE":
embeddings_arr = embeddings.embed_query(row['chunk_doc'].page_content)
# logging.info(f'Embedding list {embeddings_arr}')

data_for_query.append({
"chunkId": row['chunk_id'],
@@ -82,7 +75,6 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li
current_chunk_id = page_content_sha1.hexdigest()
position = i + 1
if i>0:
#offset += len(tiktoken.encoding_for_model("gpt2").encode(chunk.page_content))
offset += len(chunks[i-1].page_content)
if i == 0:
firstChunk = True
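The per-node string-formatted MERGE (and the comment explaining why node labels cannot be Cypher parameters) has been dropped in favour of the batched `unwind_query`, whose text is collapsed in this diff view. One possible shape for such a batch query is sketched below purely as a hypothetical, assuming APOC is available to merge nodes with a runtime label, since labels still cannot be parameterized directly; the batch values are illustrative.

```python
# Hypothetical sketch only: the real unwind_query is not visible in this diff.
batch_data = [
    {"chunk_id": "0a1b2c", "node_type": "Person", "node_id": "Marie Curie"},
]
unwind_query = """
    UNWIND $batch_data AS data
    MATCH (c:Chunk {id: data.chunk_id})
    CALL apoc.merge.node([data.node_type], {id: data.node_id}) YIELD node AS n
    MERGE (c)-[:HAS_ENTITY]->(n)
"""
graph.query(unwind_query, params={"batch_data": batch_data})  # graph: langchain Neo4jGraph
```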
1 change: 0 additions & 1 deletion backend/src/shared/schema_extraction.py
@@ -1,5 +1,4 @@
from typing import List
#from langchain_core.pydantic_v1 import BaseModel, Field
from pydantic.v1 import BaseModel, Field
from src.llm import get_llm
from langchain_core.prompts import ChatPromptTemplate
1 change: 0 additions & 1 deletion frontend/src/types.ts
@@ -74,7 +74,6 @@ export type ExtractParams = Pick<CustomFile, 'wikiQuery' | 'model' | 'sourceUrl'
file?: File;
aws_access_key_id?: string | null;
aws_secret_access_key?: string | null;
max_sources?: number;
gcs_bucket_name?: string;
gcs_bucket_folder?: string;
gcs_blob_filename?: string;