
Commit b3d1e12

feat(construct): updated image ingestion

Author: Dinesh Sajwan
Parent: 5ecd867

File tree: 8 files changed, +231 −207 lines

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/helpers/image_loader.py (+82 −45)
@@ -13,66 +13,127 @@
 import base64
 import json
 import os
+import time
 from typing import List
 from aiohttp import ClientError
+from pathlib import Path
+import numpy as np
+
+
 
 from aws_lambda_powertools import Logger, Tracer
 #from langchain_community.document_loaders.image import UnstructuredImageLoader
 from langchain.docstore.document import Document
 
 import boto3
 
-s3 = boto3.client('s3')
+s3_client = boto3.client('s3')
+bedrock_client = boto3.client('bedrock-runtime')
 
 logger = Logger(service="INGESTION_FILE_TRANSFORMER")
 tracer = Tracer(service="INGESTION_FILE_TRANSFORMER")
 
-@tracer.capture_method
+#@tracer.capture_method
 class image_loader():
     """Loading logic for pdf documents from s3 ."""
 
-    def __init__(self, bucket: str, image_file: str, image_detail_file: str):
+    def __init__(self, bucket: str, image_file: str, image_detail_file: str, modelid: str):
         """Initialize with bucket and key name."""
         self.bucket = bucket
         self.image_file = image_file
         self.image_detail_file = image_detail_file
+        self.modelid = modelid
         print(f"load image {image_file}, and image txt {image_detail_file} from :: {bucket}")
 
 
+    # convert each file to base64 and store the base64 in a new file
+    def encode_image_to_base64(self, image_file_path, image_file) -> str:
+        with open(image_file_path, "rb") as image_file:
+            b64_image = base64.b64encode(image_file.read()).decode('utf8')
+        b64_image_path = os.path.join("/tmp/", f"{Path(image_file_path).stem}.b64")
+        with open(b64_image_path, "wb") as b64_image_file:
+            b64_image_file.write(bytes(b64_image, 'utf-8'))
+        return b64_image_path
 
-    @tracer.capture_method
+
+    def BedrockEmbeddings_image(docs, model_id) -> np.ndarray:
+
+        for doc in docs:
+            print(f' image {doc}')
+            print(f' page_content {doc.page_content}')
+            print(f' inputImage {doc.page_content}')
+            obj = json.loads(doc.page_content)
+            inputImage = obj["inputImage"]
+            inputText = obj["inputText"]
+
+            body = json.dumps({
+                "inputImage": inputImage,
+                "inputText": inputText
+            })
+            print(f'body for embeddings :: {body}')
+            try:
+                response = bedrock_client.invoke_model(
+                    body=body, modelId=model_id, accept="application/json", contentType="application/json"
+                )
+                response_body = json.loads(response.get("body").read())
+                embeddings = np.array([response_body.get("embedding")]).astype(np.float32)
+            except Exception as e:
+                logger.error(f" exception={e}")
+                embeddings = None
+
+        return embeddings
+
+    #@tracer.capture_method
     def load(self):
         """Load documents."""
         try:
             local_file_path = self.download_file(self.image_file)
 
-            with open(f"{local_file_path}", "rb") as image_file:
-                input_image = base64.b64encode(image_file.read()).decode("utf8")
+            # with open(f"{local_file_path}", "rb") as image_file:
+            #     input_image = base64.b64encode(image_file.read()).decode("utf8")
 
-            s3 = boto3.resource('s3')
-            obj = s3.Object(self.bucket, self.image_detail_file)
-            raw_text = obj.get()['Body'].read().decode('utf-8')
+            b64_image_file_path = self.encode_image_to_base64(local_file_path, self.image_file)
+            print(f'b64_image_file :: {b64_image_file_path}')
+
+            with open(b64_image_file_path, "rb") as b64_image_file:
+                input_image_b64 = b64_image_file.read().decode('utf-8')
 
-            metadata = {"source": self.image_file}
+            #embeddings = self.get_image_embeddings(input_image_b64, self.modelid)
 
-            docs = json.dumps({
-                "inputImage": input_image,
-                "inputText": raw_text,
-            })
-            print(f'raw_text for titan embeddings {raw_text}')
-            return [Document(page_content=docs, metadata=metadata)]
+            # if embeddings is None:
+            #     logger.error(f"error creating multimodal embeddings for {self.image_file}")
+
+            obj = s3_client.get_object(Bucket=self.bucket, Key=self.image_detail_file)
+            raw_text = obj['Body'].read().decode('utf-8')
+
+            metadata = {
+                "filename": self.image_file,
+                "model_id": self.modelid,
+                "source": self.image_file
+            }
 
+            docs = json.dumps({
+                "inputImage": input_image_b64,
+                "inputText": raw_text
+            })
+            documents = [Document(page_content=docs, metadata=metadata)]
+            return documents
         except Exception as exception:
             logger.exception(f"Reason: {exception}")
             return ""
-
+
+
+
+
+
     @tracer.capture_method
     def get_presigned_url(self) -> str:
         try:
-            url = s3.generate_presigned_url(
+            url = s3_client.generate_presigned_url(
                 ClientMethod='get_object',
                 Params={'Bucket': self.bucket, 'Key': self.image_file},
-                ExpiresIn=900
+                ExpiresIn=2700
             )
             print(f"presigned url generated for {self.image_file} from {self.bucket}")
             return url
@@ -84,7 +145,7 @@ def get_presigned_url(self) -> str:
     def download_file(self, key) -> str:
         try:
             file_path = "/tmp/" + os.path.basename(key)
-            s3.download_file(self.bucket, key, file_path)
+            s3_client.download_file(self.bucket, key, file_path)
             print(f"file downloaded {file_path}")
             return file_path
         except ClientError as client_err:
@@ -93,28 +154,4 @@ def download_file(self, key) -> str:
         except Exception as exp:
             print(f"Couldn't download file : {exp}")
 
-    @tracer.capture_method
-    def prepare_document_for_direct_load(self) -> any:
-        local_file_path = self.download_file(self.image_file)
-        print(f" prepare os_document")
-
-        with open(f"{local_file_path}", "rb") as image_file:
-            input_image = base64.b64encode(image_file.read()).decode("utf8")
-
-        s3 = boto3.resource('s3')
-        obj = s3.Object(self.bucket, self.image_detail_file)
-        raw_text = obj.get()['Body'].read().decode('utf-8')
-
-        metadata = {"source": self.image_file}
-
-        docs = json.dumps({
-            "inputImage": input_image,
-            #"inputText": raw_text,
-        })
-
-        os_document = {
-            "image_words": raw_text,
-            "image_vector": input_image,
-        }
-        print(f'os_document prepared ')
-        return os_document
+
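Note: the new BedrockEmbeddings_image helper boils down to a single bedrock-runtime invoke_model call. Here is a minimal standalone sketch of that call, assuming the Titan multimodal embedding model this construct targets (amazon.titan-embed-image-v1); the local file name and caption text are illustrative, not part of the commit:

import base64
import json

import boto3
import numpy as np

bedrock_client = boto3.client("bedrock-runtime")

# "photo.png" is an illustrative local file
with open("photo.png", "rb") as f:
    input_image = base64.b64encode(f.read()).decode("utf8")

body = json.dumps({
    "inputImage": input_image,                        # base64-encoded image bytes
    "inputText": "caption or OCR text for the image"  # optional companion text
})
response = bedrock_client.invoke_model(
    body=body,
    modelId="amazon.titan-embed-image-v1",
    accept="application/json",
    contentType="application/json",
)
response_body = json.loads(response.get("body").read())
# Titan multimodal returns one "embedding" vector per request (1024 floats by default)
embedding = np.array(response_body.get("embedding"), dtype=np.float32)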

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/helpers/opensearch_helper.py (+10 −4)
@@ -18,6 +18,8 @@
 from aws_lambda_powertools import Logger, Tracer, Metrics
 from aws_lambda_powertools.utilities.typing import LambdaContext
 from aws_lambda_powertools.metrics import MetricUnit
+from helpers.image_loader import image_loader
+
 
 logger = Logger(service="INGESTION_EMBEDDING_JOB")
 tracer = Tracer(service="INGESTION_EMBEDDING_JOB")
@@ -38,11 +40,15 @@ def check_if_index_exists(index_name: str, region: str, host: str, http_auth: Tuple) -> bool:
     return exists
 
 def process_shard(shard, os_index_name, os_domain_ep, os_http_auth, model_id) -> int:
-    print(f'Starting process_shard of {len(shard)} chunks.')
     bedrock_client = boto3.client('bedrock-runtime')
-    embeddings = BedrockEmbeddings(
-        client=bedrock_client,
-        model_id=model_id)
+
+    # if(model_id == 'amazon.titan-embed-image-v1'):
+    #     print(f' save image embeddings in OS')
+    #     embeddings = image_loader.BedrockEmbeddings_image(docs=shard, model_id=model_id)
+    # else:
+    #     embeddings = BedrockEmbeddings(client=bedrock_client, model_id=model_id)
+    embeddings = BedrockEmbeddings(client=bedrock_client, model_id=model_id)
+
     opensearch_url = os_domain_ep if os_domain_ep.startswith("https://") else f"https://{os_domain_ep}"
     docsearch = OpenSearchVectorSearch(index_name=os_index_name,
                                        embedding_function=embeddings,
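For context, process_shard hands each shard to LangChain's OpenSearch vector store, which embeds every Document and bulk-writes it. A minimal sketch of that path, assuming the langchain_community import locations (these move between LangChain releases); the endpoint, index name, and credentials are placeholders:

import boto3
from langchain.docstore.document import Document
from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.vectorstores import OpenSearchVectorSearch

bedrock_client = boto3.client("bedrock-runtime")
embeddings = BedrockEmbeddings(client=bedrock_client,
                               model_id="amazon.titan-embed-text-v1")

docsearch = OpenSearchVectorSearch(
    index_name="rag-index",                               # placeholder
    embedding_function=embeddings,
    opensearch_url="https://my-os-endpoint.example.com",  # placeholder
    http_auth=("user", "password"),                       # placeholder
)
# add_documents() embeds each Document's page_content and issues one bulk write
docsearch.add_documents(documents=[
    Document(page_content="hello world", metadata={"source": "demo.txt"}),
])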

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/lambda.py (+65 −35)
@@ -46,7 +46,6 @@
 
 opensearch_secret_id = os.environ['OPENSEARCH_SECRET_ID']
 bucket_name = os.environ['OUTPUT_BUCKET']
-# TODO: add input_bucket for csv|images
 opensearch_index = os.environ['OPENSEARCH_INDEX']
 opensearch_domain = os.environ['OPENSEARCH_DOMAIN_ENDPOINT']
 opensearch_api_name = os.environ['OPENSEARCH_API_NAME']
@@ -112,9 +111,14 @@ def process_documents_in_es(index_exists, shards, http_auth, model_id):
 def process_documents_in_aoss(index_exists, shards, http_auth, model_id):
     # Reference: https://python.langchain.com/docs/integrations/vectorstores/opensearch#using-aoss-amazon-opensearch-service-serverless
     bedrock_client = boto3.client('bedrock-runtime')
+    # if(model_id == 'amazon.titan-embed-image-v1'):
+    #     print(f'image embeddings shards[0] {shards}')
+    #     embeddings = image_loader.BedrockEmbeddings_image(docs=shards[0], model_id=model_id)
+    # else:
+    #     embeddings = BedrockEmbeddings(client=bedrock_client, model_id=model_id)
     embeddings = BedrockEmbeddings(client=bedrock_client, model_id=model_id)
-
-    print(f' Bedrock embeddings model id :: {embeddings.model_id}')
+
+    print(f' check index with :: {shards[0]}')
 
     shard_start_index = 0
     if index_exists is False:
@@ -132,13 +136,14 @@ def process_documents_in_aoss(index_exists, shards, http_auth, model_id):
     )
     # we now need to start the loop below for the second shard
     shard_start_index = 1
-    print(f'statrt processing shard')
     for shard in shards[shard_start_index:]:
+        print(f'processing shard index {shard_start_index}')
         results = process_shard(shard=shard,
                                 os_index_name=opensearch_index,
                                 os_domain_ep=opensearch_domain,
                                 os_http_auth=http_auth,
                                 model_id=model_id)
+
 
 @logger.inject_lambda_context(log_event=True)
 @tracer.capture_lambda_handler
@@ -183,46 +188,42 @@ def handler(event, context: LambdaContext) -> dict:
     # Images are stored in s3 with presigned url, embeddings is not required.
 
     for transformed_file in event:
-        print(f" status :: {transformed_file['s3_transformer_result']['Payload']['status']}")
         if transformed_file['s3_transformer_result']['Payload']['status'] == 'File transformed':
             filename = transformed_file['s3_transformer_result']['Payload']['name']
-            name, extension = os.path.splitext(filename)
-            print(f" the name {name} and extension {extension}")
-            # TODO: check file format; if pdf then read raw text from output bucket and update docs[]
-            # if csv|image then read file from input bucket using langchain document loader and update docs[]
+            original_filename = transformed_file['name']
+            name, extension = os.path.splitext(original_filename)
+            print(f" the original_filename {name} and extension {extension}")
             if(extension == '.pdf'):
                 loader = S3TxtFileLoaderInMemory(bucket_name, filename)
                 sub_docs = loader.load()
                 for doc in sub_docs:
-                    doc.metadata['source'] = filename
+                    doc.metadata['source'] = original_filename
                 docs.extend(sub_docs)
+                process_text_embeddings(docs, modelid, http_auth, files, job_id)
             if(extension == '.jpg' or extension == '.jpeg' or extension == '.png' or extension == '.svg'):
-                # Try adding text to document
-                # image_detail_file is created by aws rekognition
-                img_load = image_loader(bucket_name, filename, f"{name}.txt")
-                sub_docs = img_load.load()
-                for doc in sub_docs:
-                    doc.metadata['source'] = filename
-                docs.extend(sub_docs)
-                url = img_load.get_presigned_url()
-                print(f" source :: {filename} ")
-                os_document = img_load.prepare_document_for_direct_load()
-
+                img_load = image_loader(bucket_name, filename, f"{name}.txt", modelid)
+                docs = img_load.load()
+                url = img_load.get_presigned_url()
+                for doc in docs:
+                    doc.metadata['image_path'] = url
+                process_image_embeddings(docs, modelid, http_auth, files, job_id, url)
 
     if not docs:
         return {
             'status':'nothing to ingest'
         }
 
+
+
+def process_text_embeddings(docs, modelid, http_auth, files, job_id):
+    logger.info("process text embeddings with chunks")
     text_splitter = RecursiveCharacterTextSplitter(
         # Set a really small chunk size, just to show.
         chunk_size=CHUNCK_SIZE_DOC_SPLIT,
         chunk_overlap=OVERLAP_FOR_DOC_SPLIT,
         length_function=len,
     )
 
-    print('Documents loaded locally')
-
     # add a custom metadata field, such as timestamp
     # we can augment data here probably (PII present ? ...)
     for doc in docs:
@@ -233,14 +234,11 @@ def handler(event, context: LambdaContext) -> dict:
 
     db_shards = (len(chunks) // MAX_OS_DOCS_PER_PUT) + 1
     shards = np.array_split(chunks, db_shards)
-
     # first check if index exists, if it does then call the add_documents function
     # otherwise call the from_documents function which would first create the index
    # and then do a bulk add. Both add_documents and from_documents do a bulk add
     # but it is important to call from_documents first so that the index is created
     # correctly for K-NN
-
-    print(f'check if index exists shards')
     try:
         index_exists = check_if_index_exists(opensearch_index,
                                              aws_region,
@@ -254,19 +252,51 @@ def handler(event, context: LambdaContext) -> dict:
         return {
             'status':'failed'
         }
-
-    print(f'job_id :: {job_id}')
-    if(job_id == "101"):
-        print(f'running for job_id 101, use os directly')
-        create_index_for_image(os_document)
-    else:
-        print(f'Loading chunks into vector store ... using {db_shards} shards')
-        if opensearch_api_name == "es":
+
+    if opensearch_api_name == "es":
         process_documents_in_es(index_exists, shards, http_auth, modelid)
-        elif opensearch_api_name == "aoss":
+    elif opensearch_api_name == "aoss":
         process_documents_in_aoss(index_exists, shards, http_auth, modelid)
 
+    for file in files:
+        if file['status'] == 'File transformed':
+            file['status'] = 'Ingested'
+        else:
+            file['status'] = 'Error_' + file['status']
+    updateIngestionJobStatus({'jobid': job_id, 'files': files})
+
+    return {
+        'status':'succeed'
+    }
+
+def process_image_embeddings(docs, modelid, http_auth, files, job_id, url):
+    logger.info("process image embeddings")
+    print(f' docs :: {docs}')
+
+    for doc in docs:
+        doc.metadata['timestamp'] = time.time()
+        doc.metadata['embeddings_model'] = modelid
+
+    shards = np.array_split(docs, 1)
+
+    try:
+        index_exists = check_if_index_exists(opensearch_index,
+                                             aws_region,
+                                             opensearch_domain,
+                                             http_auth)
+    except Exception as e:
+        logger.exception(f'Failed to verify the existence of the os index : {e}')
+        for file in files:
+            file['status'] = 'Error - internal os error cannot connect'
+        updateIngestionJobStatus({'jobid': job_id, 'files': files})
+        return {
+            'status':'failed'
+        }
 
+    if opensearch_api_name == "es":
+        process_documents_in_es(index_exists, shards, http_auth, modelid)
+    elif opensearch_api_name == "aoss":
+        process_documents_in_aoss(index_exists, shards, http_auth, modelid)
 
     for file in files:
         if file['status'] == 'File transformed':