Chunk limit email check #1196

Draft: wants to merge 9 commits into dev
72 changes: 65 additions & 7 deletions backend/score.py
@@ -1,3 +1,4 @@
from src.entities.user import user_info
from fastapi import FastAPI, File, UploadFile, Form, Request, HTTPException
from fastapi_health import health
from fastapi.middleware.cors import CORSMiddleware
@@ -218,7 +219,8 @@ async def extract_knowledge_graph_from_file(
access_token=Form(None),
retry_condition=Form(None),
additional_instructions=Form(None),
email=Form(None)
email=Form(None),
user_obj: user_info=Form(None)
):
"""
Calls 'extract_graph_from_file' in a new thread to create Neo4jGraph from a
@@ -241,22 +243,22 @@ async def extract_knowledge_graph_from_file(
if source_type == 'local file':
file_name = sanitize_filename(file_name)
merged_file_path = validate_file_path(MERGED_DIR, file_name)
uri_latency, result = await extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions, user_obj)

elif source_type == 's3 bucket' and source_url:
uri_latency, result = await extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions, user_obj)

elif source_type == 'web-url':
uri_latency, result = await extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions, user_obj)

elif source_type == 'youtube' and source_url:
uri_latency, result = await extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions, user_obj)

elif source_type == 'Wikipedia' and wiki_query:
uri_latency, result = await extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions, user_obj)

elif source_type == 'gcs bucket' and gcs_bucket_name:
uri_latency, result = await extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions)
uri_latency, result = await extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions, user_obj)
else:
return create_api_response('Failed',message='source_type is other than accepted source')
extract_api_time = time.time() - start_time
@@ -1096,5 +1098,61 @@ async def get_schema_visualization(uri=Form(None), userName=Form(None), password
finally:
gc.collect()

@app.post("/set_user_info")
async def set_user_info(uri=Form(None),
userName=Form(None),
password=Form(None),
database=Form(None),
email=Form(None)):
try:
start = time.time()
graph = create_graph_database_connection(uri, userName, password, database)
graphDb_data_Access = graphDBdataAccess(graph)
result = graphDb_data_Access.save_user_info(email, database)
end = time.time()
elapsed_time = end - start
return create_api_response('Success', data=result,message=f"Total elapsed API time {elapsed_time:.2f}")
except Exception as e:
message="Unable to save the detail of user in DB"
error_message = str(e)
logging.info(message)
logging.exception(f'Exception:{error_message}')
return create_api_response("Failed", message=message, error=error_message)
finally:
gc.collect()

@app.post("/get_user_info")
async def get_user_info(uri=Form(None),
userName=Form(None),
password=Form(None),
database=Form(None),
email=Form(None),
token_chunk_size:int=Form(None)):
try:
start = time.time()
graph = create_graph_database_connection(uri, userName, password, database)
graphDb_data_Access = graphDBdataAccess(graph)
# result = graphDb_data_Access.get_user_detail(email)
MAX_TOKEN_CHUNK_SIZE = int(os.getenv('MAX_TOKEN_CHUNK_SIZE', 10000))
chunk_to_be_created = int(MAX_TOKEN_CHUNK_SIZE / int(token_chunk_size))
userInfo = user_info()
userInfo.chunk_limits= chunk_to_be_created
userInfo.readonly= False
userInfo.rate_limit= 60000
userInfo.remaining_limit = 40000
userInfo.is_chunk_limit_applicable = True
end = time.time()
elapsed_time = end - start
logger.log_struct(userInfo, "INFO")
return create_api_response('Success', data=userInfo,message=f"Total elapsed API time {elapsed_time:.2f}")
except Exception as e:
message="Unable to get the details of user in DB"
error_message = str(e)
logging.info(message)
logging.exception(f'Exception:{error_message}')
return create_api_response("Failed", message=message, error=error_message)
finally:
gc.collect()

if __name__ == "__main__":
uvicorn.run(app)
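A minimal sketch of how a client might exercise the new /get_user_info endpoint. The base URL and connection values are placeholders, not part of this PR; the field names mirror the Form parameters defined above.

import requests

# Hypothetical local run of the FastAPI app (e.g. started via `uvicorn score:app`).
BASE_URL = "http://localhost:8000"

payload = {
    "uri": "neo4j://localhost:7687",      # placeholder connection details
    "userName": "neo4j",
    "password": "password",
    "database": "neo4j",
    "email": "someone@example.com",
    # With the default MAX_TOKEN_CHUNK_SIZE of 10000, a token_chunk_size of 200
    # yields chunk_limits = 10000 / 200 = 50.
    "token_chunk_size": 200,
}

response = requests.post(f"{BASE_URL}/get_user_info", data=payload)
print(response.json())   # expected to contain chunk_limits, rate_limit, remaining_limit, ...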
5 changes: 3 additions & 2 deletions backend/src/create_chunks.py
@@ -1,5 +1,6 @@
from langchain_text_splitters import TokenTextSplitter
from langchain.docstore.document import Document
from src.entities.user import user_info
from langchain_neo4j import Neo4jGraph
import logging
from src.document_sources.youtube import get_chunks_with_timestamps, get_calculated_timestamps
@@ -14,7 +15,7 @@ def __init__(self, pages: list[Document], graph: Neo4jGraph):
self.pages = pages
self.graph = graph

def split_file_into_chunks(self,token_chunk_size, chunk_overlap):
def split_file_into_chunks(self,token_chunk_size, chunk_overlap, user_obj: user_info):
"""
Split a list of documents(file pages) into chunks of fixed size.

@@ -33,7 +34,7 @@ def split_file_into_chunks(self,token_chunk_size, chunk_overlap):
chunks = []
for i, document in enumerate(self.pages):
page_number = i + 1
if len(chunks) >= chunk_to_be_created:
if (user_obj.is_chunk_limit_applicable and (len(chunks) >= user_obj.chunk_limits or user_obj.remaining_limit <= 0)) or len(chunks) >= chunk_to_be_created:
break
else:
for chunk in text_splitter.split_documents([document]):
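For readability, the stopping rule added above can be restated as a small helper; this is an illustrative restatement only (chunk_limit_reached is not a function in this PR), and it assumes it runs inside the backend package so the import resolves.

from src.entities.user import user_info

def chunk_limit_reached(chunks_created: int, chunk_to_be_created: int, user_obj: user_info) -> bool:
    # Per-user limit: only enforced when is_chunk_limit_applicable is set; it trips
    # either on the per-user chunk count or on an exhausted remaining_limit.
    user_limit_hit = user_obj.is_chunk_limit_applicable and (
        chunks_created >= user_obj.chunk_limits or user_obj.remaining_limit <= 0
    )
    # Global cap derived from MAX_TOKEN_CHUNK_SIZE / token_chunk_size.
    return user_limit_hit or chunks_created >= chunk_to_be_created

print(chunk_limit_reached(50, 100, user_info()))                   # True: default chunk_limits is 50
print(chunk_limit_reached(10, 100, user_info(remaining_limit=0)))  # True: remaining_limit exhausted
print(chunk_limit_reached(10, 100, user_info()))                   # False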
9 changes: 9 additions & 0 deletions backend/src/entities/user.py
@@ -0,0 +1,9 @@
from pydantic import BaseModel
from typing import Optional

class user_info(BaseModel):
    chunk_limits: Optional[int] = 50
    readonly: Optional[bool] = False
    rate_limit: Optional[int] = 100000
    remaining_limit: Optional[int] = 100000
    is_chunk_limit_applicable: Optional[bool] = True
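A short usage sketch of the new model, with its defaults and with purely illustrative overrides:

from src.entities.user import user_info

default_user = user_info()                                      # chunk_limits=50, rate_limit=100000, ...
restricted_user = user_info(chunk_limits=10, remaining_limit=0, readonly=True)

print(default_user.is_chunk_limit_applicable)                   # True
print(restricted_user.chunk_limits, restricted_user.readonly)   # 10 True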
8 changes: 7 additions & 1 deletion backend/src/graphDB_dataAccess.py
@@ -583,4 +583,10 @@ def get_websource_url(self,file_name):
RETURN d.url AS url
"""
param = {"file_name" : file_name}
return self.execute_query(query, param)
return self.execute_query(query, param)

def save_user_info(self, email, database):
    domain = "@neo4j.com"
    is_neo4j_user = domain in email
    write_access = self.check_account_access(database=database)
    return {"is_neo4j_user": is_neo4j_user, "write_access": write_access}