
adding support for filters to TransformStep #53


Merged: 5 commits merged on Jun 9, 2020
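
For reference, a minimal sketch of how the new parameters could be used once this change is in place. The transformer, model name, and S3 paths below are placeholders for illustration, not taken from this PR:

```python
from sagemaker.transformer import Transformer
from stepfunctions.steps import TransformStep

# Placeholder transformer: model name, instance type, and S3 paths are illustrative only.
transformer = Transformer(
    model_name='pca-model',
    instance_count=1,
    instance_type='ml.m5.large',
    output_path='s3://my-bucket/transform-output/'
)

step = TransformStep(
    state_id='Batch Transform',
    transformer=transformer,
    job_name='pca-transform-job',
    model_name='pca-model',
    data='s3://my-bucket/input/',
    content_type='text/csv',
    split_type='Line',
    input_filter='$[1:]',    # strip the first CSV column (e.g. an ID) before inference
    output_filter='$[2:]',   # keep only part of the joined record in the transform output
    join_source='Input'      # join each input record to its inference result
)
```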
15 changes: 12 additions & 3 deletions src/stepfunctions/steps/sagemaker.py
@@ -115,7 +115,7 @@ class TransformStep(Task):
Creates a Task State to execute a `SageMaker Transform Job <https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
"""

def __init__(self, state_id, transformer, job_name, model_name, data, data_type='S3Prefix', content_type=None, compression_type=None, split_type=None, experiment_config=None, wait_for_completion=True, tags=None, **kwargs):
def __init__(self, state_id, transformer, job_name, model_name, data, data_type='S3Prefix', content_type=None, compression_type=None, split_type=None, input_filter=None, output_filter=None, join_source=None, experiment_config=None, wait_for_completion=True, tags=None, **kwargs):
"""
Args:
state_id (str): State name whose length **must be** less than or equal to 128 unicode characters. State names **must be** unique within the scope of the whole state machine.
@@ -133,6 +133,9 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type=
content_type (str): MIME type of the input data (default: None).
compression_type (str): Compression type of the input data, if compressed (default: None). Valid values: 'Gzip', None.
split_type (str): The record delimiter for the input object (default: 'None'). Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'.
input_filter (str): A JSONPath to select a portion of the input to pass to the algorithm container for inference. If you omit the field, it gets the value ‘$’, representing the entire input. For CSV data, each row is taken as a JSON array, so only index-based JSONPaths can be applied, e.g. $[0], $[1:]. CSV data should follow the RFC format. See Supported JSONPath Operators for a table of supported JSONPath operators. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.features” (default: None).
output_filter (str): A JSONPath to select a portion of the joined/original output to return as the output. For more information, see the SageMaker API documentation for CreateTransformJob. Some examples: “$[1:]”, “$.prediction” (default: None).
join_source (str): The source of data to be joined to the transform output. It can be set to ‘Input’ meaning the entire input record will be joined to the inference result. You can use OutputFilter to select the useful portion before uploading to S3. (default: None). Valid values: Input, None.
experiment_config (dict, optional): Specify the experiment config for the transform. (Default: None)
wait_for_completion(bool, optional): Boolean value set to `True` if the Task state should wait for the transform job to complete before proceeding to the next step in the workflow. Set to `False` if the Task state should submit the transform job and proceed to the next step. (default: True)
tags (list[dict], optional): `List of tags <https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html>`_ to associate with the resource.
@@ -150,7 +153,10 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type=
content_type=content_type,
compression_type=compression_type,
split_type=split_type,
job_name=job_name
job_name=job_name,
input_filter=input_filter,
output_filter=output_filter,
join_source=join_source
)
else:
parameters = transform_config(
@@ -159,7 +165,10 @@ def __init__(self, state_id, transformer, job_name, model_name, data, data_type=
data_type=data_type,
content_type=content_type,
compression_type=compression_type,
split_type=split_type
split_type=split_type,
input_filter=input_filter,
output_filter=output_filter,
join_source=join_source
)

if isinstance(job_name, (ExecutionInput, StepInput)):
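
For readers unfamiliar with SageMaker's DataProcessing options, here is a rough illustration of what the three JSONPath-based parameters do to a single CSV record. The filtering and joining happen inside the SageMaker transform job, not in this SDK; the values below are made up:

```python
# One CSV row, treated by SageMaker as a JSON array.
input_record = ['record-id-42', 0.1, 0.2, 0.3]

# input_filter='$[1:]'  -> only the features are passed to the algorithm container.
features_sent_to_model = input_record[1:]      # [0.1, 0.2, 0.3]

# join_source='Input'   -> the original input record is joined to the inference result.
prediction = [0.87]                            # hypothetical model output
joined_record = input_record + prediction      # ['record-id-42', 0.1, 0.2, 0.3, 0.87]

# output_filter='$[2:]' -> only a slice of the joined record is written to S3.
final_output = joined_record[2:]               # [0.2, 0.3, 0.87]
```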
8 changes: 8 additions & 0 deletions tests/unit/test_sagemaker_steps.py
@@ -390,6 +390,9 @@ def test_transform_step_creation(pca_transformer):
'TrialComponentDisplayName': 'Transform'
},
tags=DEFAULT_TAGS,
join_source='Input',
output_filter='$[2:]',
input_filter='$[1:]'
)
assert step.to_dict() == {
'Type': 'Task',
@@ -416,6 +419,11 @@
'TrialName': 'pca_trial',
'TrialComponentDisplayName': 'Transform'
},
'DataProcessing': {
'InputFilter': '$[1:]',
'OutputFilter': '$[2:]',
'JoinSource': 'Input',
},
'Tags': DEFAULT_TAGS_LIST
},
'Resource': 'arn:aws:states:::sagemaker:createTransformJob.sync',