
Commit 8b6d0eb

feat: Support placeholders for TransformStep (#157)
Make it possible to set transform job properties dynamically by using Placeholders in the `parameters` field passed to the TransformStep.
1 parent 5f73cf7 commit 8b6d0eb
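The new `parameters` argument is merged into, and takes precedence over, the request payload that TransformStep builds from its other arguments. A minimal usage sketch of what this commit enables, assuming an existing `sagemaker.transformer.Transformer` named `transformer` (the schema keys here are illustrative):

from stepfunctions.inputs import ExecutionInput
from stepfunctions.steps import TransformStep

# Values supplied at execution time rather than at definition time
execution_input = ExecutionInput(schema={
    'job_name': str,
    'model_name': str,
    'data': str,
    'max_payload': int,
})

transform_step = TransformStep(
    'Transform',
    transformer=transformer,
    job_name=execution_input['job_name'],
    model_name=execution_input['model_name'],
    data=execution_input['data'],
    # New in this commit: placeholders anywhere in the CreateTransformJob payload
    parameters={'MaxPayloadInMB': execution_input['max_payload']}
)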

File tree

3 files changed: +229, -18 lines changed

src/stepfunctions/steps/sagemaker.py

Lines changed: 28 additions & 18 deletions
@@ -185,36 +185,42 @@ def __merge_hyperparameters(self, training_step_hyperparameters, estimator_hyper
             merged_hyperparameters[key] = value
         return merged_hyperparameters
 
+
 class TransformStep(Task):
 
     """
     Creates a Task State to execute a `SageMaker Transform Job <https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
     """
 
-    def __init__(self, state_id, transformer, job_name, model_name, data, data_type='S3Prefix', content_type=None, compression_type=None, split_type=None, experiment_config=None, wait_for_completion=True, tags=None, input_filter=None, output_filter=None, join_source=None, **kwargs):
+    def __init__(self, state_id, transformer, job_name, model_name, data, data_type='S3Prefix', content_type=None,
+                 compression_type=None, split_type=None, experiment_config=None, wait_for_completion=True, tags=None,
+                 input_filter=None, output_filter=None, join_source=None, **kwargs):
         """
         Args:
             state_id (str): State name whose length **must be** less than or equal to 128 unicode characters. State names **must be** unique within the scope of the whole state machine.
             transformer (sagemaker.transformer.Transformer): The SageMaker transformer to use in the TransformStep.
             job_name (str or Placeholder): Specify a transform job name. We recommend using the :py:class:`~stepfunctions.inputs.ExecutionInput` placeholder collection to pass the value dynamically in each execution.
             model_name (str or Placeholder): Specify a model name for the transform job to use. We recommend using the :py:class:`~stepfunctions.inputs.ExecutionInput` placeholder collection to pass the value dynamically in each execution.
-            data (str): Input data location in S3.
-            data_type (str): What the S3 location defines (default: 'S3Prefix').
+            data (str or Placeholder): Input data location in S3.
+            data_type (str or Placeholder): What the S3 location defines (default: 'S3Prefix').
                 Valid values:
 
                 * 'S3Prefix' - the S3 URI defines a key name prefix. All objects with this prefix will
                   be used as inputs for the transform job.
                 * 'ManifestFile' - the S3 URI points to a single manifest file listing each S3 object
                   to use as an input for the transform job.
-            content_type (str): MIME type of the input data (default: None).
-            compression_type (str): Compression type of the input data, if compressed (default: None). Valid values: 'Gzip', None.
-            split_type (str): The record delimiter for the input object (default: 'None'). Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'.
-            experiment_config (dict, optional): Specify the experiment config for the transform. (Default: None)
+            content_type (str or Placeholder): MIME type of the input data (default: None).
+            compression_type (str or Placeholder): Compression type of the input data, if compressed (default: None). Valid values: 'Gzip', None.
+            split_type (str or Placeholder): The record delimiter for the input object (default: 'None'). Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'.
+            experiment_config (dict or Placeholder, optional): Specify the experiment config for the transform. (Default: None)
             wait_for_completion (bool, optional): Boolean value set to `True` if the Task state should wait for the transform job to complete before proceeding to the next step in the workflow. Set to `False` if the Task state should submit the transform job and proceed to the next step. (default: True)
-            tags (list[dict], optional): `List of tags <https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html>`_ to associate with the resource.
-            input_filter (str): A JSONPath to select a portion of the input to pass to the algorithm container for inference. If you omit the field, it gets the value '$', representing the entire input. For CSV data, each row is taken as a JSON array, so only index-based JSONPaths can be applied, e.g. $[0], $[1:]. CSV data should follow the RFC format. See Supported JSONPath Operators for a table of supported JSONPath operators. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: "$[1:]", "$.features" (default: None).
-            output_filter (str): A JSONPath to select a portion of the joined/original output to return as the output. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: "$[1:]", "$.prediction" (default: None).
-            join_source (str): The source of data to be joined to the transform output. It can be set to 'Input', meaning the entire input record will be joined to the inference result. You can use OutputFilter to select the useful portion before uploading to S3. (default: None). Valid values: Input, None.
+            tags (list[dict] or Placeholder, optional): `List of tags <https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html>`_ to associate with the resource.
+            input_filter (str or Placeholder): A JSONPath to select a portion of the input to pass to the algorithm container for inference. If you omit the field, it gets the value '$', representing the entire input. For CSV data, each row is taken as a JSON array, so only index-based JSONPaths can be applied, e.g. $[0], $[1:]. CSV data should follow the RFC format. See Supported JSONPath Operators for a table of supported JSONPath operators. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: "$[1:]", "$.features" (default: None).
+            output_filter (str or Placeholder): A JSONPath to select a portion of the joined/original output to return as the output. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: "$[1:]", "$.prediction" (default: None).
+            join_source (str or Placeholder): The source of data to be joined to the transform output. It can be set to 'Input', meaning the entire input record will be joined to the inference result. You can use OutputFilter to select the useful portion before uploading to S3. (default: None). Valid values: Input, None.
+            parameters (dict, optional): The value of this field is merged with other arguments to become the request payload for SageMaker `CreateTransformJob <https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateTransformJob.html>`_.
+                You can use `parameters` to override the values provided by other arguments and specify any field's value dynamically using `Placeholders <https://aws-step-functions-data-science-sdk.readthedocs.io/en/stable/placeholders.html?highlight=placeholder#stepfunctions.inputs.Placeholder>`_.
+
         """
         if wait_for_completion:
             """
@@ -233,7 +239,7 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type=
                 SageMakerApi.CreateTransformJob)
 
         if isinstance(job_name, str):
-            parameters = transform_config(
+            transform_parameters = transform_config(
                 transformer=transformer,
                 data=data,
                 data_type=data_type,
@@ -246,7 +252,7 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type=
                 join_source=join_source
             )
         else:
-            parameters = transform_config(
+            transform_parameters = transform_config(
                 transformer=transformer,
                 data=data,
                 data_type=data_type,
@@ -259,17 +265,21 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type=
             )
 
         if isinstance(job_name, Placeholder):
-            parameters['TransformJobName'] = job_name
+            transform_parameters['TransformJobName'] = job_name
 
-        parameters['ModelName'] = model_name
+        transform_parameters['ModelName'] = model_name
 
         if experiment_config is not None:
-            parameters['ExperimentConfig'] = experiment_config
+            transform_parameters['ExperimentConfig'] = experiment_config
 
         if tags:
-            parameters['Tags'] = tags_dict_to_kv_list(tags)
+            transform_parameters['Tags'] = tags if isinstance(tags, Placeholder) else tags_dict_to_kv_list(tags)
 
-        kwargs[Field.Parameters.value] = parameters
+        if Field.Parameters.value in kwargs and isinstance(kwargs[Field.Parameters.value], dict):
+            # Update transform_parameters with input parameters
+            merge_dicts(transform_parameters, kwargs[Field.Parameters.value])
+
+        kwargs[Field.Parameters.value] = transform_parameters
         super(TransformStep, self).__init__(state_id, **kwargs)
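The behavioral core of the change is the final merge: caller-supplied `parameters` are layered over the payload generated by `transform_config`. The sketch below illustrates the assumed override semantics of the SDK's `merge_dicts` helper; it is an illustration, not the SDK's actual implementation:

# Hypothetical sketch of merge_dicts semantics, for illustration only:
# values from `source` (the caller's parameters) override `target`
# (the payload built from the constructor arguments), recursing into
# nested payload sections such as TransformInput and TransformResources.
def merge_dicts_sketch(target, source):
    for key, value in source.items():
        if isinstance(value, dict) and isinstance(target.get(key), dict):
            merge_dicts_sketch(target[key], value)  # merge nested dicts in place
        else:
            target[key] = value  # caller's value wins, Placeholders included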

tests/integ/test_sagemaker_steps.py

Lines changed: 90 additions & 0 deletions
@@ -179,6 +179,96 @@ def test_transform_step(trained_estimator, sfn_client, sfn_role_arn):
     state_machine_delete_wait(sfn_client, workflow.state_machine_arn)
     # End of Cleanup
 
+
+def test_transform_step_with_placeholder(trained_estimator, sfn_client, sfn_role_arn):
+    # Create transformer from supplied estimator
+    job_name = generate_job_name()
+    pca_transformer = trained_estimator.transformer(instance_count=INSTANCE_COUNT, instance_type=INSTANCE_TYPE)
+
+    # Create a model step to save the model
+    model_step = ModelStep('create_model_step', model=trained_estimator.create_model(), model_name=job_name)
+    model_step.add_retry(SAGEMAKER_RETRY_STRATEGY)
+
+    # Upload data for transformation to S3
+    data_path = os.path.join(DATA_DIR, "one_p_mnist")
+    transform_input_path = os.path.join(data_path, "transform_input.csv")
+    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
+    transform_input = pca_transformer.sagemaker_session.upload_data(
+        path=transform_input_path, key_prefix=transform_input_key_prefix
+    )
+
+    execution_input = ExecutionInput(schema={
+        'data': str,
+        'content_type': str,
+        'split_type': str,
+        'job_name': str,
+        'model_name': str,
+        'instance_count': int,
+        'instance_type': str,
+        'strategy': str,
+        'max_concurrent_transforms': int,
+        'max_payload': int,
+    })
+
+    parameters = {
+        'BatchStrategy': execution_input['strategy'],
+        'TransformInput': {
+            'SplitType': execution_input['split_type'],
+        },
+        'TransformResources': {
+            'InstanceCount': execution_input['instance_count'],
+            'InstanceType': execution_input['instance_type'],
+        },
+        'MaxConcurrentTransforms': execution_input['max_concurrent_transforms'],
+        'MaxPayloadInMB': execution_input['max_payload']
+    }
+
+    # Build workflow definition
+    transform_step = TransformStep(
+        'create_transform_job_step',
+        pca_transformer,
+        job_name=execution_input['job_name'],
+        model_name=execution_input['model_name'],
+        data=execution_input['data'],
+        content_type=execution_input['content_type'],
+        parameters=parameters
+    )
+    transform_step.add_retry(SAGEMAKER_RETRY_STRATEGY)
+    workflow_graph = Chain([model_step, transform_step])
+
+    with timeout(minutes=DEFAULT_TIMEOUT_MINUTES):
+        # Create workflow and check definition
+        workflow = create_workflow_and_check_definition(
+            workflow_graph=workflow_graph,
+            workflow_name=unique_name_from_base("integ-test-transform-step-workflow"),
+            sfn_client=sfn_client,
+            sfn_role_arn=sfn_role_arn
+        )
+
+        execution_input = {
+            'job_name': job_name,
+            'model_name': job_name,
+            'data': transform_input,
+            'content_type': "text/csv",
+            'instance_count': INSTANCE_COUNT,
+            'instance_type': INSTANCE_TYPE,
+            'split_type': 'Line',
+            'strategy': 'SingleRecord',
+            'max_concurrent_transforms': 2,
+            'max_payload': 5
+        }
+
+        # Execute workflow
+        execution = workflow.execute(inputs=execution_input)
+        execution_output = execution.get_output(wait=True)
+
+        # Check workflow output
+        assert execution_output.get("TransformJobStatus") == "Completed"
+
+        # Cleanup
+        state_machine_delete_wait(sfn_client, workflow.state_machine_arn)
+
+
 def test_endpoint_config_step(trained_estimator, sfn_client, sagemaker_session, sfn_role_arn):
     # Setup: Create model for trained estimator in SageMaker
     model = trained_estimator.create_model()
tests/unit/test_sagemaker_steps.py

Lines changed: 111 additions & 0 deletions
@@ -901,6 +901,117 @@ def test_transform_step_creation(pca_transformer):
     }
 
 
+@patch.object(boto3.session.Session, 'region_name', 'us-east-1')
+def test_transform_step_creation_with_placeholder(pca_transformer):
+    execution_input = ExecutionInput(schema={
+        'data': str,
+        'data_type': str,
+        'content_type': str,
+        'compression_type': str,
+        'split_type': str,
+        'input_filter': str,
+        'output_filter': str,
+        'join_source': str,
+        'job_name': str,
+        'model_name': str,
+        'instance_count': int,
+        'strategy': str,
+        'assemble_with': str,
+        'output_path': str,
+        'output_kms_key': str,
+        'accept': str,
+        'max_concurrent_transforms': int,
+        'max_payload': int,
+        'tags': [{str: str}],
+        'env': str,
+        'volume_kms_key': str,
+        'experiment_config': str,
+    })
+
+    step_input = StepInput(schema={
+        'instance_type': str
+    })
+
+    parameters = {
+        'BatchStrategy': execution_input['strategy'],
+        'TransformOutput': {
+            'Accept': execution_input['accept'],
+            'AssembleWith': execution_input['assemble_with'],
+            'KmsKeyId': execution_input['output_kms_key'],
+            'S3OutputPath': execution_input['output_path']
+        },
+        'TransformResources': {
+            'InstanceCount': execution_input['instance_count'],
+            'InstanceType': step_input['instance_type'],
+            'VolumeKmsKeyId': execution_input['volume_kms_key']
+        },
+        'ExperimentConfig': execution_input['experiment_config'],
+        'Tags': execution_input['tags'],
+        'Environment': execution_input['env'],
+        'MaxConcurrentTransforms': execution_input['max_concurrent_transforms'],
+        'MaxPayloadInMB': execution_input['max_payload'],
+    }
+
+    step = TransformStep('Inference',
+                         transformer=pca_transformer,
+                         data=execution_input['data'],
+                         data_type=execution_input['data_type'],
+                         content_type=execution_input['content_type'],
+                         compression_type=execution_input['compression_type'],
+                         split_type=execution_input['split_type'],
+                         job_name=execution_input['job_name'],
+                         model_name=execution_input['model_name'],
+                         experiment_config={
+                             'ExperimentName': 'pca_experiment',
+                             'TrialName': 'pca_trial',
+                             'TrialComponentDisplayName': 'Transform'
+                         },
+                         tags=execution_input['tags'],
+                         join_source=execution_input['join_source'],
+                         output_filter=execution_input['output_filter'],
+                         input_filter=execution_input['input_filter'],
+                         parameters=parameters
+                         )
+
+    assert step.to_dict()['Parameters'] == {
+        'BatchStrategy.$': "$$.Execution.Input['strategy']",
+        'ModelName.$': "$$.Execution.Input['model_name']",
+        'TransformInput': {
+            'CompressionType.$': "$$.Execution.Input['compression_type']",
+            'ContentType.$': "$$.Execution.Input['content_type']",
+            'DataSource': {
+                'S3DataSource': {
+                    'S3DataType.$': "$$.Execution.Input['data_type']",
+                    'S3Uri.$': "$$.Execution.Input['data']"
+                }
+            },
+            'SplitType.$': "$$.Execution.Input['split_type']"
+        },
+        'TransformOutput': {
+            'Accept.$': "$$.Execution.Input['accept']",
+            'AssembleWith.$': "$$.Execution.Input['assemble_with']",
+            'KmsKeyId.$': "$$.Execution.Input['output_kms_key']",
+            'S3OutputPath.$': "$$.Execution.Input['output_path']"
+        },
+        'TransformJobName.$': "$$.Execution.Input['job_name']",
+        'TransformResources': {
+            'InstanceCount.$': "$$.Execution.Input['instance_count']",
+            'InstanceType.$': "$['instance_type']",
+            'VolumeKmsKeyId.$': "$$.Execution.Input['volume_kms_key']"
+        },
+        'ExperimentConfig.$': "$$.Execution.Input['experiment_config']",
+        'DataProcessing': {
+            'InputFilter.$': "$$.Execution.Input['input_filter']",
+            'OutputFilter.$': "$$.Execution.Input['output_filter']",
+            'JoinSource.$': "$$.Execution.Input['join_source']",
+        },
+        'Tags.$': "$$.Execution.Input['tags']",
+        'Environment.$': "$$.Execution.Input['env']",
+        'MaxConcurrentTransforms.$': "$$.Execution.Input['max_concurrent_transforms']",
+        'MaxPayloadInMB.$': "$$.Execution.Input['max_payload']"
+    }
+
+
 @patch('botocore.client.BaseClient._make_api_call', new=mock_boto_api_call)
 @patch.object(boto3.session.Session, 'region_name', 'us-east-1')
 def test_get_expected_model(pca_estimator):
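The expected dictionary above also documents the serialization rule for placeholders: a field whose value is a Placeholder is emitted with a `.$` key suffix and a JSONPath string value, per the Amazon States Language. Note the two JSONPath roots in the assertion: ExecutionInput placeholders resolve against the workflow execution's input (`$$.Execution.Input['...']`), while StepInput placeholders resolve against the state's own input (`$['instance_type']`).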
