
Commit b3d1e12

feat(construct): updated image ingestion

Author: Dinesh Sajwan
Parent: 5ecd867

File tree: 8 files changed, +231 −207 lines

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/helpers/image_loader.py (+82 −45)
@@ -13,66 +13,127 @@
 import base64
 import json
 import os
+import time
 from typing import List
 from aiohttp import ClientError
+from pathlib import Path
+import numpy as np
+
+
 
 from aws_lambda_powertools import Logger, Tracer
 #from langchain_community.document_loaders.image import UnstructuredImageLoader
 from langchain.docstore.document import Document
 
 import boto3
 
-s3 = boto3.client('s3')
+s3_client = boto3.client('s3')
+bedrock_client = boto3.client('bedrock-runtime')
 
 logger = Logger(service="INGESTION_FILE_TRANSFORMER")
 tracer = Tracer(service="INGESTION_FILE_TRANSFORMER")
 
-@tracer.capture_method
+#@tracer.capture_method
 class image_loader():
     """Loading logic for pdf documents from s3 ."""
 
-    def __init__(self, bucket: str, image_file: str, image_detail_file: str):
+    def __init__(self, bucket: str, image_file: str, image_detail_file: str, modelid: str):
         """Initialize with bucket and key name."""
         self.bucket = bucket
         self.image_file = image_file
         self.image_detail_file = image_detail_file
+        self.modelid = modelid
         print(f"load image {image_file}, and image txt {image_detail_file} from :: {bucket}")
 
 
+    # convert each file to base64 and store the base64 in a new file
+    def encode_image_to_base64(self, image_file_path, image_file) -> str:
+        with open(image_file_path, "rb") as image_file:
+            b64_image = base64.b64encode(image_file.read()).decode('utf8')
+        b64_image_path = os.path.join("/tmp/", f"{Path(image_file_path).stem}.b64")
+        with open(b64_image_path, "wb") as b64_image_file:
+            b64_image_file.write(bytes(b64_image, 'utf-8'))
+        return b64_image_path
 
-    @tracer.capture_method
+
+    def BedrockEmbeddings_image(docs, model_id) -> np.ndarray:
+
+        for doc in docs:
+            print(f' image {doc}')
+            print(f' page_content {doc.page_content}')
+            print(f' inputImage {doc.page_content}')
+            obj = json.loads(doc.page_content)
+            inputImage = obj["inputImage"]
+            inputText = obj["inputText"]
+
+            body = json.dumps({
+                "inputImage": inputImage,
+                "inputText": inputText
+            })
+            print(f'body for embeddings :: {body}')
+            try:
+                response = bedrock_client.invoke_model(
+                    body=body, modelId=model_id, accept="application/json", contentType="application/json"
+                )
+                response_body = json.loads(response.get("body").read())
+                embeddings = np.array([response_body.get("embedding")]).astype(np.float32)
+            except Exception as e:
+                logger.error(f" exception={e}")
+                embeddings = None
+
+        return embeddings
+
+    #@tracer.capture_method
     def load(self):
         """Load documents."""
         try:
             local_file_path = self.download_file(self.image_file)
 
-            with open(f"{local_file_path}", "rb") as image_file:
-                input_image = base64.b64encode(image_file.read()).decode("utf8")
+            # with open(f"{local_file_path}", "rb") as image_file:
+            #     input_image = base64.b64encode(image_file.read()).decode("utf8")
 
-            s3 = boto3.resource('s3')
-            obj = s3.Object(self.bucket, self.image_detail_file)
-            raw_text = obj.get()['Body'].read().decode('utf-8')
+            b64_image_file_path = self.encode_image_to_base64(local_file_path, self.image_file)
+            print(f'b64_image_file :: {b64_image_file_path}')
+
+            with open(b64_image_file_path, "rb") as b64_image_file:
+                input_image_b64 = b64_image_file.read().decode('utf-8')
 
-            metadata = {"source": self.image_file}
+            #embeddings = self.get_image_embeddings(input_image_b64, self.modelid)
 
-            docs = json.dumps({
-                "inputImage": input_image,
-                "inputText": raw_text,
-            })
-            print(f'raw_text for titan embeddings {raw_text}')
-            return [Document(page_content=docs, metadata=metadata)]
+            # if embeddings is None:
+            #     logger.error(f"error creating multimodal embeddings for {self.image_file}")
+
+            obj = s3_client.get_object(Bucket=self.bucket, Key=self.image_detail_file)
+            raw_text = obj['Body'].read().decode('utf-8')
+
+            metadata = {
+                "filename": self.image_file,
+                "model_id": self.modelid,
+                "source": self.image_file
+            }
 
+            docs = json.dumps({
+                "inputImage": input_image_b64,
+                "inputText": raw_text
+            })
+            documents = [Document(page_content=docs, metadata=metadata)]
+            return documents
         except Exception as exception:
             logger.exception(f"Reason: {exception}")
             return ""
-
+
+
+
+
+
     @tracer.capture_method
     def get_presigned_url(self) -> str:
         try:
-            url = s3.generate_presigned_url(
+            url = s3_client.generate_presigned_url(
                 ClientMethod='get_object',
                 Params={'Bucket': self.bucket, 'Key': self.image_file},
-                ExpiresIn=900
+                ExpiresIn=2700
             )
             print(f"presigned url generated for {self.image_file} from {self.bucket}")
             return url
@@ -84,7 +145,7 @@ def get_presigned_url(self) -> str:
     def download_file(self, key) -> str:
         try:
             file_path = "/tmp/" + os.path.basename(key)
-            s3.download_file(self.bucket, key, file_path)
+            s3_client.download_file(self.bucket, key, file_path)
             print(f"file downloaded {file_path}")
             return file_path
         except ClientError as client_err:
@@ -93,28 +154,4 @@ def download_file(self, key) -> str:
         except Exception as exp:
             print(f"Couldn't download file : {exp}")
 
-    @tracer.capture_method
-    def prepare_document_for_direct_load(self) -> any:
-        local_file_path = self.download_file(self.image_file)
-        print(f" prepare os_document")
-
-        with open(f"{local_file_path}", "rb") as image_file:
-            input_image = base64.b64encode(image_file.read()).decode("utf8")
-
-        s3 = boto3.resource('s3')
-        obj = s3.Object(self.bucket, self.image_detail_file)
-        raw_text = obj.get()['Body'].read().decode('utf-8')
-
-        metadata = {"source": self.image_file}
-
-        docs = json.dumps({
-            "inputImage": input_image,
-            #"inputText": raw_text,
-        })
-
-        os_document = {
-            "image_words": raw_text,
-            "image_vector": input_image,
-        }
-        print(f'os_document prepared ')
-        return os_document
+
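Note: the new BedrockEmbeddings_image helper boils down to a single bedrock-runtime invoke_model call. Here is a minimal standalone sketch of that call, assuming the Titan multimodal embedding model this construct targets (amazon.titan-embed-image-v1); the local file name and caption text are illustrative, not part of the commit:

import base64
import json

import boto3
import numpy as np

bedrock_client = boto3.client("bedrock-runtime")

# "photo.png" is an illustrative local file
with open("photo.png", "rb") as f:
    input_image = base64.b64encode(f.read()).decode("utf8")

body = json.dumps({
    "inputImage": input_image,                        # base64-encoded image bytes
    "inputText": "caption or OCR text for the image"  # optional companion text
})
response = bedrock_client.invoke_model(
    body=body,
    modelId="amazon.titan-embed-image-v1",
    accept="application/json",
    contentType="application/json",
)
response_body = json.loads(response.get("body").read())
# Titan multimodal returns one "embedding" vector per request (1024 floats by default)
embedding = np.array(response_body.get("embedding"), dtype=np.float32)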

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/helpers/opensearch_helper.py (+10 −4)
@@ -18,6 +18,8 @@
 from aws_lambda_powertools import Logger, Tracer, Metrics
 from aws_lambda_powertools.utilities.typing import LambdaContext
 from aws_lambda_powertools.metrics import MetricUnit
+from helpers.image_loader import image_loader
+
 
 logger = Logger(service="INGESTION_EMBEDDING_JOB")
 tracer = Tracer(service="INGESTION_EMBEDDING_JOB")
@@ -38,11 +40,15 @@ def check_if_index_exists(index_name: str, region: str, host: str, http_auth: Tuple) -> bool:
     return exists
 
 def process_shard(shard, os_index_name, os_domain_ep, os_http_auth, model_id) -> int:
-    print(f'Starting process_shard of {len(shard)} chunks.')
     bedrock_client = boto3.client('bedrock-runtime')
-    embeddings = BedrockEmbeddings(
-        client=bedrock_client,
-        model_id=model_id)
+
+    # if(model_id == 'amazon.titan-embed-image-v1'):
+    #     print(f' save image embeddings in OS')
+    #     embeddings = image_loader.BedrockEmbeddings_image(docs=shard, model_id=model_id)
+    # else:
+    #     embeddings = BedrockEmbeddings(client=bedrock_client, model_id=model_id)
+    embeddings = BedrockEmbeddings(client=bedrock_client, model_id=model_id)
+
     opensearch_url = os_domain_ep if os_domain_ep.startswith("https://") else f"https://{os_domain_ep}"
     docsearch = OpenSearchVectorSearch(index_name=os_index_name,
                                        embedding_function=embeddings,
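For context, process_shard hands each shard to LangChain's OpenSearch vector store, which embeds every Document and bulk-writes it. A minimal sketch of that path, assuming the langchain_community import locations (these move between LangChain releases); the endpoint, index name, and credentials are placeholders:

import boto3
from langchain.docstore.document import Document
from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.vectorstores import OpenSearchVectorSearch

bedrock_client = boto3.client("bedrock-runtime")
embeddings = BedrockEmbeddings(client=bedrock_client,
                               model_id="amazon.titan-embed-text-v1")

docsearch = OpenSearchVectorSearch(
    index_name="rag-index",                               # placeholder
    embedding_function=embeddings,
    opensearch_url="https://my-os-endpoint.example.com",  # placeholder
    http_auth=("user", "password"),                       # placeholder
)
# add_documents() embeds each Document's page_content and issues one bulk write
docsearch.add_documents(documents=[
    Document(page_content="hello world", metadata={"source": "demo.txt"}),
])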

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/lambda.py (+65 −35)
@@ -46,7 +46,6 @@
 
 opensearch_secret_id = os.environ['OPENSEARCH_SECRET_ID']
 bucket_name = os.environ['OUTPUT_BUCKET']
-# TODO: add input_bucket for csv|images
 opensearch_index = os.environ['OPENSEARCH_INDEX']
 opensearch_domain = os.environ['OPENSEARCH_DOMAIN_ENDPOINT']
 opensearch_api_name = os.environ['OPENSEARCH_API_NAME']
@@ -112,9 +111,14 @@ def process_documents_in_es(index_exists, shards, http_auth, model_id):
 def process_documents_in_aoss(index_exists, shards, http_auth, model_id):
     # Reference: https://python.langchain.com/docs/integrations/vectorstores/opensearch#using-aoss-amazon-opensearch-service-serverless
     bedrock_client = boto3.client('bedrock-runtime')
+    # if(model_id == 'amazon.titan-embed-image-v1'):
+    #     print(f'image embeddings shards[0] {shards}')
+    #     embeddings = image_loader.BedrockEmbeddings_image(docs=shards[0], model_id=model_id)
+    # else:
+    #     embeddings = BedrockEmbeddings(client=bedrock_client, model_id=model_id)
     embeddings = BedrockEmbeddings(client=bedrock_client, model_id=model_id)
-
-    print(f' Bedrock embeddings model id :: {embeddings.model_id}')
+
+    print(f' check index with :: {shards[0]}')
 
     shard_start_index = 0
     if index_exists is False:
@@ -132,13 +136,14 @@ def process_documents_in_aoss(index_exists, shards, http_auth, model_id):
     )
     # we now need to start the loop below for the second shard
     shard_start_index = 1
-    print(f'statrt processing shard')
     for shard in shards[shard_start_index:]:
+        print(f'processing shard index {shard_start_index}')
         results = process_shard(shard=shard,
                                 os_index_name=opensearch_index,
                                 os_domain_ep=opensearch_domain,
                                 os_http_auth=http_auth,
                                 model_id=model_id)
+
 
 @logger.inject_lambda_context(log_event=True)
 @tracer.capture_lambda_handler
@@ -183,46 +188,42 @@ def handler(event, context: LambdaContext) -> dict:
     # Images are stored in s3 with presigned url, embeddings is not required.
 
     for transformed_file in event:
-        print(f" status :: {transformed_file['s3_transformer_result']['Payload']['status']}")
         if transformed_file['s3_transformer_result']['Payload']['status'] == 'File transformed':
             filename = transformed_file['s3_transformer_result']['Payload']['name']
-            name, extension = os.path.splitext(filename)
-            print(f" the name {name} and extension {extension}")
-            # TODO: check file format; if pdf then read raw text from output bucket and update docs[]
-            # if csv|image then read file from input bucket using langchain document loader and update docs[]
+            original_filename = transformed_file['name']
+            name, extension = os.path.splitext(original_filename)
+            print(f" the original_filename {name} and extension {extension}")
             if(extension == '.pdf'):
                 loader = S3TxtFileLoaderInMemory(bucket_name, filename)
                 sub_docs = loader.load()
                 for doc in sub_docs:
-                    doc.metadata['source'] = filename
+                    doc.metadata['source'] = original_filename
                 docs.extend(sub_docs)
+                process_text_embeddings(docs, modelid, http_auth, files, job_id)
             if(extension == '.jpg' or extension == '.jpeg' or extension == '.png' or extension == '.svg'):
-                # Try adding text to document
-                # image_detail_file is created by aws rekognition
-                img_load = image_loader(bucket_name, filename, f"{name}.txt")
-                sub_docs = img_load.load()
-                for doc in sub_docs:
-                    doc.metadata['source'] = filename
-                docs.extend(sub_docs)
-                url = img_load.get_presigned_url()
-                print(f" source :: {filename} ")
-                os_document = img_load.prepare_document_for_direct_load()
-
+                img_load = image_loader(bucket_name, filename, f"{name}.txt", modelid)
+                docs = img_load.load()
+                url = img_load.get_presigned_url()
+                for doc in docs:
+                    doc.metadata['image_path'] = url
+                process_image_embeddings(docs, modelid, http_auth, files, job_id, url)
 
     if not docs:
         return {
             'status':'nothing to ingest'
         }
 
+
+
+def process_text_embeddings(docs, modelid, http_auth, files, job_id):
+    logger.info("process text embeddings with chunks")
     text_splitter = RecursiveCharacterTextSplitter(
         # Set a really small chunk size, just to show.
         chunk_size=CHUNCK_SIZE_DOC_SPLIT,
         chunk_overlap=OVERLAP_FOR_DOC_SPLIT,
         length_function=len,
     )
 
-    print('Documents loaded locally')
-
     # add a custom metadata field, such as timestamp
     # we can augment data here probably (PII present ? ...)
     for doc in docs:
@@ -233,14 +234,11 @@ def handler(event, context: LambdaContext) -> dict:
 
     db_shards = (len(chunks) // MAX_OS_DOCS_PER_PUT) + 1
     shards = np.array_split(chunks, db_shards)
-
     # first check if index exists, if it does then call the add_documents function
     # otherwise call the from_documents function which would first create the index
    # and then do a bulk add. Both add_documents and from_documents do a bulk add
     # but it is important to call from_documents first so that the index is created
     # correctly for K-NN
-
-    print(f'check if index exists shards')
     try:
         index_exists = check_if_index_exists(opensearch_index,
                                              aws_region,
@@ -254,19 +252,51 @@ def handler(event, context: LambdaContext) -> dict:
         return {
             'status':'failed'
         }
-
-    print(f'job_id :: {job_id}')
-    if(job_id == "101"):
-        print(f'running for job_id 101, use os directly')
-        create_index_for_image(os_document)
-    else:
-        print(f'Loading chunks into vector store ... using {db_shards} shards')
-        if opensearch_api_name == "es":
+
+    if opensearch_api_name == "es":
         process_documents_in_es(index_exists, shards, http_auth, modelid)
-        elif opensearch_api_name == "aoss":
+    elif opensearch_api_name == "aoss":
         process_documents_in_aoss(index_exists, shards, http_auth, modelid)
 
+    for file in files:
+        if file['status'] == 'File transformed':
+            file['status'] = 'Ingested'
+        else:
+            file['status'] = 'Error_' + file['status']
+    updateIngestionJobStatus({'jobid': job_id, 'files': files})
+
+    return {
+        'status':'succeed'
+    }
+
+def process_image_embeddings(docs, modelid, http_auth, files, job_id, url):
+    logger.info("process image embeddings")
+    print(f' docs :: {docs}')
+
+    for doc in docs:
+        doc.metadata['timestamp'] = time.time()
+        doc.metadata['embeddings_model'] = modelid
+
+    shards = np.array_split(docs, 1)
+
+    try:
+        index_exists = check_if_index_exists(opensearch_index,
+                                             aws_region,
+                                             opensearch_domain,
+                                             http_auth)
+    except Exception as e:
+        logger.exception(f'Failed to verify the existence of the os index : {e}')
+        for file in files:
+            file['status'] = 'Error - internal os error cannot connect'
+        updateIngestionJobStatus({'jobid': job_id, 'files': files})
+        return {
+            'status':'failed'
+        }
 
+    if opensearch_api_name == "es":
+        process_documents_in_es(index_exists, shards, http_auth, modelid)
+    elif opensearch_api_name == "aoss":
+        process_documents_in_aoss(index_exists, shards, http_auth, modelid)
 
     for file in files:
         if file['status'] == 'File transformed':