Skip to content

Commit 8de117b

Browse files
2 parents 656f71b + 343972f commit 8de117b

File tree

6 files changed

+39
-32
lines changed

6 files changed

+39
-32
lines changed

README.md

-2
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,6 @@ Allow unauthenticated request : Yes
127127
## ENV
128128
| Env Variable Name | Mandatory/Optional | Default Value | Description |
129129
|-------------------------|--------------------|---------------|--------------------------------------------------------------------------------------------------|
130-
| OPENAI_API_KEY | Mandatory | | API key for OpenAI |
131-
| DIFFBOT_API_KEY | Mandatory | | API key for Diffbot |
132130
| EMBEDDING_MODEL | Optional | all-MiniLM-L6-v2 | Model for generating the text embedding (all-MiniLM-L6-v2 , openai , vertexai) |
133131
| IS_EMBEDDING | Optional | true | Flag to enable text embedding |
134132
| KNN_MIN_SCORE | Optional | 0.94 | Minimum score for KNN algorithm |

backend/example.env

+5
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ ENTITY_EMBEDDING="" True or False
2828
DUPLICATE_SCORE_VALUE = ""
2929
DUPLICATE_TEXT_DISTANCE = ""
3030
#examples
31+
LLM_MODEL_CONFIG_openai_gpt_3.5="gpt-3.5-turbo-0125,openai_api_key"
32+
LLM_MODEL_CONFIG_openai_gpt_4o_mini="gpt-4o-mini-2024-07-18,openai_api_key"
33+
LLM_MODEL_CONFIG_gemini_1.5_pro="gemini-1.5-pro-002"
34+
LLM_MODEL_CONFIG_gemini_1.5_flash="gemini-1.5-flash-002"
35+
LLM_MODEL_CONFIG_diffbot="diffbot,diffbot_api_key"
3136
LLM_MODEL_CONFIG_azure_ai_gpt_35="azure_deployment_name,azure_endpoint or base_url,azure_api_key,api_version"
3237
LLM_MODEL_CONFIG_azure_ai_gpt_4o="gpt-4o,https://YOUR-ENDPOINT.openai.azure.com/,azure_api_key,api_version"
3338
LLM_MODEL_CONFIG_groq_llama3_70b="model_name,base_url,groq_api_key"

backend/src/llm.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ def get_llm(model: str):
2424
env_key = "LLM_MODEL_CONFIG_" + model
2525
env_value = os.environ.get(env_key)
2626
logging.info("Model: {}".format(env_key))
27+
2728
if "gemini" in model:
29+
model_name = env_value
2830
credentials, project_id = google.auth.default()
29-
model_name = MODEL_VERSIONS[model]
31+
#model_name = MODEL_VERSIONS[model]
3032
llm = ChatVertexAI(
3133
model_name=model_name,
3234
#convert_system_message_to_human=True,
@@ -42,9 +44,10 @@ def get_llm(model: str):
4244
},
4345
)
4446
elif "openai" in model:
45-
model_name = MODEL_VERSIONS[model]
47+
#model_name = MODEL_VERSIONS[model]
48+
model_name, api_key = env_value.split(",")
4649
llm = ChatOpenAI(
47-
api_key=os.environ.get("OPENAI_API_KEY"),
50+
api_key=api_key,
4851
model=model_name,
4952
temperature=0,
5053
)
@@ -93,9 +96,10 @@ def get_llm(model: str):
9396
llm = ChatOllama(base_url=base_url, model=model_name)
9497

9598
elif "diffbot" in model:
96-
model_name = "diffbot"
99+
#model_name = "diffbot"
100+
model_name, api_key = env_value.split(",")
97101
llm = DiffbotGraphTransformer(
98-
diffbot_api_key=os.environ.get("DIFFBOT_API_KEY"),
102+
diffbot_api_key=api_key,
99103
extract_types=["entities", "facts"],
100104
)
101105

backend/src/main.py

+20-16
Original file line numberDiff line numberDiff line change
@@ -518,26 +518,30 @@ def get_chunkId_chunkDoc_list(graph, file_name, pages, retry_condition):
518518
else:
519519
chunkId_chunkDoc_list=[]
520520
chunks = graph.query(QUERY_TO_GET_CHUNKS, params={"filename":file_name})
521-
for chunk in chunks:
522-
chunk_doc = Document(page_content=chunk['text'], metadata={'id':chunk['id'], 'position':chunk['position']})
523-
chunkId_chunkDoc_list.append({'chunk_id': chunk['id'], 'chunk_doc': chunk_doc})
524521

525-
if retry_condition == START_FROM_LAST_PROCESSED_POSITION:
526-
logging.info(f"Retry : start_from_last_processed_position")
527-
starting_chunk = graph.query(QUERY_TO_GET_LAST_PROCESSED_CHUNK_POSITION, params={"filename":file_name})
528-
if starting_chunk[0]["position"] < len(chunkId_chunkDoc_list):
529-
return len(chunks), chunkId_chunkDoc_list[starting_chunk[0]["position"] - 1:]
522+
if chunks[0]['text'] is None or chunks[0]['text']=="" :
523+
raise Exception(f"Chunks are not created for {file_name}. Please re-upload the file and try again.")
524+
else:
525+
for chunk in chunks:
526+
chunk_doc = Document(page_content=chunk['text'], metadata={'id':chunk['id'], 'position':chunk['position']})
527+
chunkId_chunkDoc_list.append({'chunk_id': chunk['id'], 'chunk_doc': chunk_doc})
530528

531-
elif starting_chunk[0]["position"] == len(chunkId_chunkDoc_list):
532-
starting_chunk = graph.query(QUERY_TO_GET_LAST_PROCESSED_CHUNK_WITHOUT_ENTITY, params={"filename":file_name})
533-
return len(chunks), chunkId_chunkDoc_list[starting_chunk[0]["position"] - 1:]
529+
if retry_condition == START_FROM_LAST_PROCESSED_POSITION:
530+
logging.info(f"Retry : start_from_last_processed_position")
531+
starting_chunk = graph.query(QUERY_TO_GET_LAST_PROCESSED_CHUNK_POSITION, params={"filename":file_name})
532+
if starting_chunk[0]["position"] < len(chunkId_chunkDoc_list):
533+
return len(chunks), chunkId_chunkDoc_list[starting_chunk[0]["position"] - 1:]
534+
535+
elif starting_chunk[0]["position"] == len(chunkId_chunkDoc_list):
536+
starting_chunk = graph.query(QUERY_TO_GET_LAST_PROCESSED_CHUNK_WITHOUT_ENTITY, params={"filename":file_name})
537+
return len(chunks), chunkId_chunkDoc_list[starting_chunk[0]["position"] - 1:]
538+
539+
else:
540+
raise Exception(f"All chunks of {file_name} are already processed. If you want to re-process, please start from the beginning.")
534541

535542
else:
536-
raise Exception(f"All chunks of {file_name} are already processed. If you want to re-process, please start from the beginning.")
537-
538-
else:
539-
logging.info(f"Retry : start_from_beginning with chunks {len(chunkId_chunkDoc_list)}")
540-
return len(chunks), chunkId_chunkDoc_list
543+
logging.info(f"Retry : start_from_beginning with chunks {len(chunkId_chunkDoc_list)}")
544+
return len(chunks), chunkId_chunkDoc_list
541545

542546
def get_source_list_from_graph(uri,userName,password,db_name=None):
543547
"""

example.env

-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,3 @@
1-
# Mandatory
2-
OPENAI_API_KEY=""
3-
DIFFBOT_API_KEY=""
4-
51
# Optional Backend
62
EMBEDDING_MODEL="all-MiniLM-L6-v2"
73
IS_EMBEDDING="true"

frontend/src/utils/Constants.ts

+5-5
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,11 @@ export const llms =
4141
? (process.env.VITE_LLM_MODELS?.split(',') as string[])
4242
: [
4343
'diffbot',
44-
'openai-gpt-3.5',
45-
'openai-gpt-4o',
46-
'openai-gpt-4o-mini',
47-
'gemini-1.5-pro',
48-
'gemini-1.5-flash',
44+
'openai_gpt_3.5',
45+
'openai_gpt_4o',
46+
'openai_gpt_4o_mini',
47+
'gemini_1.5_pro',
48+
'gemini_1.5_flash',
4949
'azure_ai_gpt_35',
5050
'azure_ai_gpt_4o',
5151
'ollama_llama3',

0 commit comments

Comments
 (0)