From 533f971636495a2030a6b41591b57f29ebdb17cc Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 21 Oct 2022 18:14:27 +0000 Subject: [PATCH 01/71] feat: warm pool --- apis/v1alpha1/ack-generate-metadata.yaml | 8 +- apis/v1alpha1/app.go | 3 +- apis/v1alpha1/enums.go | 166 +++++++++++++++++- apis/v1alpha1/generator.yaml | 3 + apis/v1alpha1/training_job.go | 5 + apis/v1alpha1/types.go | 47 ++++- apis/v1alpha1/user_profile.go | 14 +- apis/v1alpha1/zz_generated.deepcopy.go | 100 +++++++++++ .../sagemaker.services.k8s.aws_apps.yaml | 3 +- ...ices.k8s.aws_hyperparametertuningjobs.yaml | 6 + ...gemaker.services.k8s.aws_trainingjobs.yaml | 9 +- ...gemaker.services.k8s.aws_userprofiles.yaml | 12 +- generator.yaml | 3 + go.mod | 2 +- go.sum | 3 +- .../crds/sagemaker.services.k8s.aws_apps.yaml | 3 +- ...ices.k8s.aws_hyperparametertuningjobs.yaml | 6 + ...gemaker.services.k8s.aws_trainingjobs.yaml | 9 +- ...gemaker.services.k8s.aws_userprofiles.yaml | 12 +- .../hyper_parameter_tuning_job/delta.go | 7 + .../hyper_parameter_tuning_job/sdk.go | 12 ++ pkg/resource/training_job/delta.go | 7 + pkg/resource/training_job/sdk.go | 6 + 23 files changed, 405 insertions(+), 41 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 16f7d256..99198692 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,13 +1,13 @@ ack_generate_info: - build_date: "2022-10-04T18:55:06Z" + build_date: "2022-10-21T18:02:34Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc -api_directory_checksum: ca9187f53c674d6424c5a4120fe2609afce3d52a +api_directory_checksum: 255bd4888fa527d059365a19726d1848c42177e5 api_version: v1alpha1 -aws_sdk_go_version: v1.44.93 +aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: 858695b7159c1a59326d91623f545bf0be1c18d2 + file_checksum: 86528885e7155246332cc5d9d40a017285b34456 original_file_name: 
generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/app.go b/apis/v1alpha1/app.go index 31243777..0ac32d9c 100644 --- a/apis/v1alpha1/app.go +++ b/apis/v1alpha1/app.go @@ -25,8 +25,7 @@ type AppSpec struct { // The name of the app. // +kubebuilder:validation:Required AppName *string `json:"appName"` - // The type of app. Supported apps are JupyterServer and KernelGateway. TensorBoard - // is not supported. + // The type of app. // +kubebuilder:validation:Required AppType *string `json:"appType"` // The domain ID. diff --git a/apis/v1alpha1/enums.go b/apis/v1alpha1/enums.go index 31227bcc..2c3548fa 100644 --- a/apis/v1alpha1/enums.go +++ b/apis/v1alpha1/enums.go @@ -285,6 +285,14 @@ const ( AutoMLMetricExtendedEnum_LogLoss AutoMLMetricExtendedEnum = "LogLoss" ) +type AutoMLMode string + +const ( + AutoMLMode_AUTO AutoMLMode = "AUTO" + AutoMLMode_ENSEMBLING AutoMLMode = "ENSEMBLING" + AutoMLMode_HYPERPARAMETER_TUNING AutoMLMode = "HYPERPARAMETER_TUNING" +) + type AutoMLS3DataType string const ( @@ -368,6 +376,87 @@ const ( CaptureStatus_Stopped CaptureStatus = "Stopped" ) +type ClarifyFeatureType string + +const ( + ClarifyFeatureType_numerical ClarifyFeatureType = "numerical" + ClarifyFeatureType_categorical ClarifyFeatureType = "categorical" + ClarifyFeatureType_text ClarifyFeatureType = "text" +) + +type ClarifyTextGranularity string + +const ( + ClarifyTextGranularity_token ClarifyTextGranularity = "token" + ClarifyTextGranularity_sentence ClarifyTextGranularity = "sentence" + ClarifyTextGranularity_paragraph ClarifyTextGranularity = "paragraph" +) + +type ClarifyTextLanguage string + +const ( + ClarifyTextLanguage_af ClarifyTextLanguage = "af" + ClarifyTextLanguage_sq ClarifyTextLanguage = "sq" + ClarifyTextLanguage_ar ClarifyTextLanguage = "ar" + ClarifyTextLanguage_hy ClarifyTextLanguage = "hy" + ClarifyTextLanguage_eu ClarifyTextLanguage = "eu" + ClarifyTextLanguage_bn ClarifyTextLanguage = "bn" + ClarifyTextLanguage_bg 
ClarifyTextLanguage = "bg" + ClarifyTextLanguage_ca ClarifyTextLanguage = "ca" + ClarifyTextLanguage_zh ClarifyTextLanguage = "zh" + ClarifyTextLanguage_hr ClarifyTextLanguage = "hr" + ClarifyTextLanguage_cs ClarifyTextLanguage = "cs" + ClarifyTextLanguage_da ClarifyTextLanguage = "da" + ClarifyTextLanguage_nl ClarifyTextLanguage = "nl" + ClarifyTextLanguage_en ClarifyTextLanguage = "en" + ClarifyTextLanguage_et ClarifyTextLanguage = "et" + ClarifyTextLanguage_fi ClarifyTextLanguage = "fi" + ClarifyTextLanguage_fr ClarifyTextLanguage = "fr" + ClarifyTextLanguage_de ClarifyTextLanguage = "de" + ClarifyTextLanguage_el ClarifyTextLanguage = "el" + ClarifyTextLanguage_gu ClarifyTextLanguage = "gu" + ClarifyTextLanguage_he ClarifyTextLanguage = "he" + ClarifyTextLanguage_hi ClarifyTextLanguage = "hi" + ClarifyTextLanguage_hu ClarifyTextLanguage = "hu" + ClarifyTextLanguage_is ClarifyTextLanguage = "is" + ClarifyTextLanguage_id ClarifyTextLanguage = "id" + ClarifyTextLanguage_ga ClarifyTextLanguage = "ga" + ClarifyTextLanguage_it ClarifyTextLanguage = "it" + ClarifyTextLanguage_kn ClarifyTextLanguage = "kn" + ClarifyTextLanguage_ky ClarifyTextLanguage = "ky" + ClarifyTextLanguage_lv ClarifyTextLanguage = "lv" + ClarifyTextLanguage_lt ClarifyTextLanguage = "lt" + ClarifyTextLanguage_lb ClarifyTextLanguage = "lb" + ClarifyTextLanguage_mk ClarifyTextLanguage = "mk" + ClarifyTextLanguage_ml ClarifyTextLanguage = "ml" + ClarifyTextLanguage_mr ClarifyTextLanguage = "mr" + ClarifyTextLanguage_ne ClarifyTextLanguage = "ne" + ClarifyTextLanguage_nb ClarifyTextLanguage = "nb" + ClarifyTextLanguage_fa ClarifyTextLanguage = "fa" + ClarifyTextLanguage_pl ClarifyTextLanguage = "pl" + ClarifyTextLanguage_pt ClarifyTextLanguage = "pt" + ClarifyTextLanguage_ro ClarifyTextLanguage = "ro" + ClarifyTextLanguage_ru ClarifyTextLanguage = "ru" + ClarifyTextLanguage_sa ClarifyTextLanguage = "sa" + ClarifyTextLanguage_sr ClarifyTextLanguage = "sr" + ClarifyTextLanguage_tn ClarifyTextLanguage = 
"tn" + ClarifyTextLanguage_si ClarifyTextLanguage = "si" + ClarifyTextLanguage_sk ClarifyTextLanguage = "sk" + ClarifyTextLanguage_sl ClarifyTextLanguage = "sl" + ClarifyTextLanguage_es ClarifyTextLanguage = "es" + ClarifyTextLanguage_sv ClarifyTextLanguage = "sv" + ClarifyTextLanguage_tl ClarifyTextLanguage = "tl" + ClarifyTextLanguage_ta ClarifyTextLanguage = "ta" + ClarifyTextLanguage_tt ClarifyTextLanguage = "tt" + ClarifyTextLanguage_te ClarifyTextLanguage = "te" + ClarifyTextLanguage_tr ClarifyTextLanguage = "tr" + ClarifyTextLanguage_uk ClarifyTextLanguage = "uk" + ClarifyTextLanguage_ur ClarifyTextLanguage = "ur" + ClarifyTextLanguage_yo ClarifyTextLanguage = "yo" + ClarifyTextLanguage_lij ClarifyTextLanguage = "lij" + ClarifyTextLanguage_xx ClarifyTextLanguage = "xx" +) + type CodeRepositorySortBy string const ( @@ -590,6 +679,13 @@ const ( FeatureGroupStatus_SDK_DeleteFailed FeatureGroupStatus_SDK = "DeleteFailed" ) +type FeatureStatus string + +const ( + FeatureStatus_ENABLED FeatureStatus = "ENABLED" + FeatureStatus_DISABLED FeatureStatus = "DISABLED" +) + type FeatureType string const ( @@ -685,8 +781,9 @@ const ( type HyperParameterTuningJobStrategyType string const ( - HyperParameterTuningJobStrategyType_Bayesian HyperParameterTuningJobStrategyType = "Bayesian" - HyperParameterTuningJobStrategyType_Random HyperParameterTuningJobStrategyType = "Random" + HyperParameterTuningJobStrategyType_Bayesian HyperParameterTuningJobStrategyType = "Bayesian" + HyperParameterTuningJobStrategyType_Random HyperParameterTuningJobStrategyType = "Random" + HyperParameterTuningJobStrategyType_Hyperband HyperParameterTuningJobStrategyType = "Hyperband" ) type HyperParameterTuningJobWarmStartType string @@ -1379,6 +1476,62 @@ const ( ProductionVariantInstanceType_ml_g5_24xlarge ProductionVariantInstanceType = "ml.g5.24xlarge" ProductionVariantInstanceType_ml_g5_48xlarge ProductionVariantInstanceType = "ml.g5.48xlarge" ProductionVariantInstanceType_ml_p4d_24xlarge 
ProductionVariantInstanceType = "ml.p4d.24xlarge" + ProductionVariantInstanceType_ml_c7g_large ProductionVariantInstanceType = "ml.c7g.large" + ProductionVariantInstanceType_ml_c7g_xlarge ProductionVariantInstanceType = "ml.c7g.xlarge" + ProductionVariantInstanceType_ml_c7g_2xlarge ProductionVariantInstanceType = "ml.c7g.2xlarge" + ProductionVariantInstanceType_ml_c7g_4xlarge ProductionVariantInstanceType = "ml.c7g.4xlarge" + ProductionVariantInstanceType_ml_c7g_8xlarge ProductionVariantInstanceType = "ml.c7g.8xlarge" + ProductionVariantInstanceType_ml_c7g_12xlarge ProductionVariantInstanceType = "ml.c7g.12xlarge" + ProductionVariantInstanceType_ml_c7g_16xlarge ProductionVariantInstanceType = "ml.c7g.16xlarge" + ProductionVariantInstanceType_ml_m6g_large ProductionVariantInstanceType = "ml.m6g.large" + ProductionVariantInstanceType_ml_m6g_xlarge ProductionVariantInstanceType = "ml.m6g.xlarge" + ProductionVariantInstanceType_ml_m6g_2xlarge ProductionVariantInstanceType = "ml.m6g.2xlarge" + ProductionVariantInstanceType_ml_m6g_4xlarge ProductionVariantInstanceType = "ml.m6g.4xlarge" + ProductionVariantInstanceType_ml_m6g_8xlarge ProductionVariantInstanceType = "ml.m6g.8xlarge" + ProductionVariantInstanceType_ml_m6g_12xlarge ProductionVariantInstanceType = "ml.m6g.12xlarge" + ProductionVariantInstanceType_ml_m6g_16xlarge ProductionVariantInstanceType = "ml.m6g.16xlarge" + ProductionVariantInstanceType_ml_m6gd_large ProductionVariantInstanceType = "ml.m6gd.large" + ProductionVariantInstanceType_ml_m6gd_xlarge ProductionVariantInstanceType = "ml.m6gd.xlarge" + ProductionVariantInstanceType_ml_m6gd_2xlarge ProductionVariantInstanceType = "ml.m6gd.2xlarge" + ProductionVariantInstanceType_ml_m6gd_4xlarge ProductionVariantInstanceType = "ml.m6gd.4xlarge" + ProductionVariantInstanceType_ml_m6gd_8xlarge ProductionVariantInstanceType = "ml.m6gd.8xlarge" + ProductionVariantInstanceType_ml_m6gd_12xlarge ProductionVariantInstanceType = "ml.m6gd.12xlarge" + 
ProductionVariantInstanceType_ml_m6gd_16xlarge ProductionVariantInstanceType = "ml.m6gd.16xlarge" + ProductionVariantInstanceType_ml_c6g_large ProductionVariantInstanceType = "ml.c6g.large" + ProductionVariantInstanceType_ml_c6g_xlarge ProductionVariantInstanceType = "ml.c6g.xlarge" + ProductionVariantInstanceType_ml_c6g_2xlarge ProductionVariantInstanceType = "ml.c6g.2xlarge" + ProductionVariantInstanceType_ml_c6g_4xlarge ProductionVariantInstanceType = "ml.c6g.4xlarge" + ProductionVariantInstanceType_ml_c6g_8xlarge ProductionVariantInstanceType = "ml.c6g.8xlarge" + ProductionVariantInstanceType_ml_c6g_12xlarge ProductionVariantInstanceType = "ml.c6g.12xlarge" + ProductionVariantInstanceType_ml_c6g_16xlarge ProductionVariantInstanceType = "ml.c6g.16xlarge" + ProductionVariantInstanceType_ml_c6gd_large ProductionVariantInstanceType = "ml.c6gd.large" + ProductionVariantInstanceType_ml_c6gd_xlarge ProductionVariantInstanceType = "ml.c6gd.xlarge" + ProductionVariantInstanceType_ml_c6gd_2xlarge ProductionVariantInstanceType = "ml.c6gd.2xlarge" + ProductionVariantInstanceType_ml_c6gd_4xlarge ProductionVariantInstanceType = "ml.c6gd.4xlarge" + ProductionVariantInstanceType_ml_c6gd_8xlarge ProductionVariantInstanceType = "ml.c6gd.8xlarge" + ProductionVariantInstanceType_ml_c6gd_12xlarge ProductionVariantInstanceType = "ml.c6gd.12xlarge" + ProductionVariantInstanceType_ml_c6gd_16xlarge ProductionVariantInstanceType = "ml.c6gd.16xlarge" + ProductionVariantInstanceType_ml_c6gn_large ProductionVariantInstanceType = "ml.c6gn.large" + ProductionVariantInstanceType_ml_c6gn_xlarge ProductionVariantInstanceType = "ml.c6gn.xlarge" + ProductionVariantInstanceType_ml_c6gn_2xlarge ProductionVariantInstanceType = "ml.c6gn.2xlarge" + ProductionVariantInstanceType_ml_c6gn_4xlarge ProductionVariantInstanceType = "ml.c6gn.4xlarge" + ProductionVariantInstanceType_ml_c6gn_8xlarge ProductionVariantInstanceType = "ml.c6gn.8xlarge" + ProductionVariantInstanceType_ml_c6gn_12xlarge 
ProductionVariantInstanceType = "ml.c6gn.12xlarge" + ProductionVariantInstanceType_ml_c6gn_16xlarge ProductionVariantInstanceType = "ml.c6gn.16xlarge" + ProductionVariantInstanceType_ml_r6g_large ProductionVariantInstanceType = "ml.r6g.large" + ProductionVariantInstanceType_ml_r6g_xlarge ProductionVariantInstanceType = "ml.r6g.xlarge" + ProductionVariantInstanceType_ml_r6g_2xlarge ProductionVariantInstanceType = "ml.r6g.2xlarge" + ProductionVariantInstanceType_ml_r6g_4xlarge ProductionVariantInstanceType = "ml.r6g.4xlarge" + ProductionVariantInstanceType_ml_r6g_8xlarge ProductionVariantInstanceType = "ml.r6g.8xlarge" + ProductionVariantInstanceType_ml_r6g_12xlarge ProductionVariantInstanceType = "ml.r6g.12xlarge" + ProductionVariantInstanceType_ml_r6g_16xlarge ProductionVariantInstanceType = "ml.r6g.16xlarge" + ProductionVariantInstanceType_ml_r6gd_large ProductionVariantInstanceType = "ml.r6gd.large" + ProductionVariantInstanceType_ml_r6gd_xlarge ProductionVariantInstanceType = "ml.r6gd.xlarge" + ProductionVariantInstanceType_ml_r6gd_2xlarge ProductionVariantInstanceType = "ml.r6gd.2xlarge" + ProductionVariantInstanceType_ml_r6gd_4xlarge ProductionVariantInstanceType = "ml.r6gd.4xlarge" + ProductionVariantInstanceType_ml_r6gd_8xlarge ProductionVariantInstanceType = "ml.r6gd.8xlarge" + ProductionVariantInstanceType_ml_r6gd_12xlarge ProductionVariantInstanceType = "ml.r6gd.12xlarge" + ProductionVariantInstanceType_ml_r6gd_16xlarge ProductionVariantInstanceType = "ml.r6gd.16xlarge" ) type ProfilingStatus string @@ -1974,6 +2127,15 @@ const ( VariantStatus_Baking VariantStatus = "Baking" ) +type WarmPoolResourceStatus string + +const ( + WarmPoolResourceStatus_Available WarmPoolResourceStatus = "Available" + WarmPoolResourceStatus_Terminated WarmPoolResourceStatus = "Terminated" + WarmPoolResourceStatus_Reused WarmPoolResourceStatus = "Reused" + WarmPoolResourceStatus_InUse WarmPoolResourceStatus = "InUse" +) + type WorkforceStatus string const ( diff --git 
a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index f012d5ff..7830ea5a 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -887,3 +887,6 @@ ignore: - InstanceMetadataServiceConfiguration - InstanceGroups - InstanceGroupNames + - CanvasAppSettings + - ExplainerConfig + - HyperParameterTuningJobStrategyConfig diff --git a/apis/v1alpha1/training_job.go b/apis/v1alpha1/training_job.go index 20fa4702..b9e1a264 100644 --- a/apis/v1alpha1/training_job.go +++ b/apis/v1alpha1/training_job.go @@ -76,6 +76,11 @@ type TrainingJobSpec struct { // You can specify a maximum of 100 hyperparameters. Each hyperparameter is // a key-value pair. Each key and value is limited to 256 characters, as specified // by the Length Constraint. + // + // Do not include any security-sensitive information including account access + // IDs, secrets or tokens in any hyperparameter field. If the use of security-sensitive + // credentials are detected, SageMaker will reject your training job request + // and return an exception error. HyperParameters map[string]*string `json:"hyperParameters,omitempty"` // An array of Channel objects. Each channel is a named input source. InputDataConfig // describes the input data and its location. diff --git a/apis/v1alpha1/types.go b/apis/v1alpha1/types.go index fc7ba883..f7f52040 100644 --- a/apis/v1alpha1/types.go +++ b/apis/v1alpha1/types.go @@ -449,6 +449,20 @@ type ClarifyCheckStepMetadata struct { ViolationReport *string `json:"violationReport,omitempty"` } +// The configuration for the SHAP baseline (https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-feature-attribute-shap-baselines.html) +// (also called the background or reference dataset) of the Kernal SHAP algorithm. +// +// * The number of records in the baseline data determines the size of the +// synthetic dataset, which has an impact on latency of explainability requests. 
+// For more information, see the Synthetic data of Configure and create an +// endpoint (https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-online-explainability-create-endpoint.html). +// +// * ShapBaseline and ShapBaselineUri are mutually exclusive parameters. +// One or the either is required to configure a SHAP baseline. +type ClarifyShapBaselineConfig struct { + ShapBaselineURI *string `json:"shapBaselineURI,omitempty"` +} + // Specifies summary information about a Git repository. type CodeRepositorySummary struct { CodeRepositoryName *string `json:"codeRepositoryName,omitempty"` @@ -1261,8 +1275,8 @@ type HyperParameterTuningJobObjective struct { Type *string `json:"type_,omitempty"` } -// An entity having characteristics over which a user can search for a hyperparameter -// tuning job. +// An entity returned by the SearchRecord (https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_SearchRecord.html) +// API containing the properties of a hyperparameter tuning job. type HyperParameterTuningJobSearchEntity struct { // The container for the summary information about a training job. BestTrainingJob *HyperParameterTrainingJobSummary `json:"bestTrainingJob,omitempty"` @@ -2665,10 +2679,17 @@ type RepositoryAuthConfig struct { // Describes the resources, including ML compute instances and ML storage volumes, // to use for model training. 
type ResourceConfig struct { - InstanceCount *int64 `json:"instanceCount,omitempty"` - InstanceType *string `json:"instanceType,omitempty"` - VolumeKMSKeyID *string `json:"volumeKMSKeyID,omitempty"` - VolumeSizeInGB *int64 `json:"volumeSizeInGB,omitempty"` + InstanceCount *int64 `json:"instanceCount,omitempty"` + InstanceType *string `json:"instanceType,omitempty"` + KeepAlivePeriodInSeconds *int64 `json:"keepAlivePeriodInSeconds,omitempty"` + VolumeKMSKeyID *string `json:"volumeKMSKeyID,omitempty"` + VolumeSizeInGB *int64 `json:"volumeSizeInGB,omitempty"` +} + +// The ResourceConfig to update KeepAlivePeriodInSeconds. Other fields in the +// ResourceConfig cannot be updated. +type ResourceConfigForUpdate struct { + KeepAlivePeriodInSeconds *int64 `json:"keepAlivePeriodInSeconds,omitempty"` } // Specifies the maximum number of training jobs and parallel training jobs @@ -2837,6 +2858,11 @@ type TensorBoardOutputConfig struct { S3OutputPath *string `json:"s3OutputPath,omitempty"` } +// Time series forecast settings for the SageMaker Canvas app. +type TimeSeriesForecastingSettings struct { + AmazonForecastRoleARN *string `json:"amazonForecastRoleARN,omitempty"` +} + // Defines the input needed to run a training job using the algorithm. type TrainingJobDefinition struct { HyperParameters map[string]*string `json:"hyperParameters,omitempty"` @@ -2928,6 +2954,8 @@ type TrainingJobSummary struct { TrainingJobARN *string `json:"trainingJobARN,omitempty"` TrainingJobName *string `json:"trainingJobName,omitempty"` TrainingJobStatus *string `json:"trainingJobStatus,omitempty"` + // Status and billing information about the warm pool. + WarmPoolStatus *WarmPoolStatus `json:"warmPoolStatus,omitempty"` } // Contains information about a training job. @@ -3316,6 +3344,13 @@ type VariantProperty struct { VariantPropertyType *string `json:"variantPropertyType,omitempty"` } +// Status and billing information about the warm pool. 
+type WarmPoolStatus struct { + ResourceRetainedBillableTimeInSeconds *int64 `json:"resourceRetainedBillableTimeInSeconds,omitempty"` + ReusedByJob *string `json:"reusedByJob,omitempty"` + Status *string `json:"status,omitempty"` +} + // A single private workforce, which is automatically created when you create // your first private work team. You can create one private work force in each // Amazon Web Services Region. By default, any workforce-related API operation diff --git a/apis/v1alpha1/user_profile.go b/apis/v1alpha1/user_profile.go index 33f699fa..89dcf346 100644 --- a/apis/v1alpha1/user_profile.go +++ b/apis/v1alpha1/user_profile.go @@ -26,15 +26,15 @@ type UserProfileSpec struct { // +kubebuilder:validation:Required DomainID *string `json:"domainID"` // A specifier for the type of value specified in SingleSignOnUserValue. Currently, - // the only supported value is "UserName". If the Domain's AuthMode is Amazon - // Web Services SSO, this field is required. If the Domain's AuthMode is not - // Amazon Web Services SSO, this field cannot be specified. + // the only supported value is "UserName". If the Domain's AuthMode is IAM Identity + // Center, this field is required. If the Domain's AuthMode is not IAM Identity + // Center, this field cannot be specified. SingleSignOnUserIdentifier *string `json:"singleSignOnUserIdentifier,omitempty"` // The username of the associated Amazon Web Services Single Sign-On User for - // this UserProfile. If the Domain's AuthMode is Amazon Web Services SSO, this - // field is required, and must match a valid username of a user in your directory. - // If the Domain's AuthMode is not Amazon Web Services SSO, this field cannot - // be specified. + // this UserProfile. If the Domain's AuthMode is IAM Identity Center, this field + // is required, and must match a valid username of a user in your directory. + // If the Domain's AuthMode is not IAM Identity Center, this field cannot be + // specified. 
SingleSignOnUserValue *string `json:"singleSignOnUserValue,omitempty"` // Each tag consists of a key and an optional value. Tag keys must be unique // per resource. diff --git a/apis/v1alpha1/zz_generated.deepcopy.go b/apis/v1alpha1/zz_generated.deepcopy.go index 2f8b88a0..ff05bc14 100644 --- a/apis/v1alpha1/zz_generated.deepcopy.go +++ b/apis/v1alpha1/zz_generated.deepcopy.go @@ -1453,6 +1453,26 @@ func (in *ClarifyCheckStepMetadata) DeepCopy() *ClarifyCheckStepMetadata { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClarifyShapBaselineConfig) DeepCopyInto(out *ClarifyShapBaselineConfig) { + *out = *in + if in.ShapBaselineURI != nil { + in, out := &in.ShapBaselineURI, &out.ShapBaselineURI + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClarifyShapBaselineConfig. +func (in *ClarifyShapBaselineConfig) DeepCopy() *ClarifyShapBaselineConfig { + if in == nil { + return nil + } + out := new(ClarifyShapBaselineConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *CodeRepositorySummary) DeepCopyInto(out *CodeRepositorySummary) { *out = *in @@ -11873,6 +11893,11 @@ func (in *ResourceConfig) DeepCopyInto(out *ResourceConfig) { *out = new(string) **out = **in } + if in.KeepAlivePeriodInSeconds != nil { + in, out := &in.KeepAlivePeriodInSeconds, &out.KeepAlivePeriodInSeconds + *out = new(int64) + **out = **in + } if in.VolumeKMSKeyID != nil { in, out := &in.VolumeKMSKeyID, &out.VolumeKMSKeyID *out = new(string) @@ -11895,6 +11920,26 @@ func (in *ResourceConfig) DeepCopy() *ResourceConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ResourceConfigForUpdate) DeepCopyInto(out *ResourceConfigForUpdate) { + *out = *in + if in.KeepAlivePeriodInSeconds != nil { + in, out := &in.KeepAlivePeriodInSeconds, &out.KeepAlivePeriodInSeconds + *out = new(int64) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceConfigForUpdate. +func (in *ResourceConfigForUpdate) DeepCopy() *ResourceConfigForUpdate { + if in == nil { + return nil + } + out := new(ResourceConfigForUpdate) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ResourceLimits) DeepCopyInto(out *ResourceLimits) { *out = *in @@ -12348,6 +12393,26 @@ func (in *TensorBoardOutputConfig) DeepCopy() *TensorBoardOutputConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TimeSeriesForecastingSettings) DeepCopyInto(out *TimeSeriesForecastingSettings) { + *out = *in + if in.AmazonForecastRoleARN != nil { + in, out := &in.AmazonForecastRoleARN, &out.AmazonForecastRoleARN + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TimeSeriesForecastingSettings. +func (in *TimeSeriesForecastingSettings) DeepCopy() *TimeSeriesForecastingSettings { + if in == nil { + return nil + } + out := new(TimeSeriesForecastingSettings) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *TrainingJob) DeepCopyInto(out *TrainingJob) { *out = *in @@ -12795,6 +12860,11 @@ func (in *TrainingJobSummary) DeepCopyInto(out *TrainingJobSummary) { *out = new(string) **out = **in } + if in.WarmPoolStatus != nil { + in, out := &in.WarmPoolStatus, &out.WarmPoolStatus + *out = new(WarmPoolStatus) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobSummary. @@ -14351,6 +14421,36 @@ func (in *VariantProperty) DeepCopy() *VariantProperty { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WarmPoolStatus) DeepCopyInto(out *WarmPoolStatus) { + *out = *in + if in.ResourceRetainedBillableTimeInSeconds != nil { + in, out := &in.ResourceRetainedBillableTimeInSeconds, &out.ResourceRetainedBillableTimeInSeconds + *out = new(int64) + **out = **in + } + if in.ReusedByJob != nil { + in, out := &in.ReusedByJob, &out.ReusedByJob + *out = new(string) + **out = **in + } + if in.Status != nil { + in, out := &in.Status, &out.Status + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WarmPoolStatus. +func (in *WarmPoolStatus) DeepCopy() *WarmPoolStatus { + if in == nil { + return nil + } + out := new(WarmPoolStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Workforce) DeepCopyInto(out *Workforce) { *out = *in diff --git a/config/crd/bases/sagemaker.services.k8s.aws_apps.yaml b/config/crd/bases/sagemaker.services.k8s.aws_apps.yaml index 06e370d8..24cec98b 100644 --- a/config/crd/bases/sagemaker.services.k8s.aws_apps.yaml +++ b/config/crd/bases/sagemaker.services.k8s.aws_apps.yaml @@ -43,8 +43,7 @@ spec: description: The name of the app. type: string appType: - description: The type of app. 
Supported apps are JupyterServer and - KernelGateway. TensorBoard is not supported. + description: The type of app. type: string domainID: description: The domain ID. diff --git a/config/crd/bases/sagemaker.services.k8s.aws_hyperparametertuningjobs.yaml b/config/crd/bases/sagemaker.services.k8s.aws_hyperparametertuningjobs.yaml index 44c89e58..310f1f50 100644 --- a/config/crd/bases/sagemaker.services.k8s.aws_hyperparametertuningjobs.yaml +++ b/config/crd/bases/sagemaker.services.k8s.aws_hyperparametertuningjobs.yaml @@ -431,6 +431,9 @@ spec: type: integer instanceType: type: string + keepAlivePeriodInSeconds: + format: int64 + type: integer volumeKMSKeyID: type: string volumeSizeInGB: @@ -760,6 +763,9 @@ spec: type: integer instanceType: type: string + keepAlivePeriodInSeconds: + format: int64 + type: integer volumeKMSKeyID: type: string volumeSizeInGB: diff --git a/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml b/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml index 93bafd0a..729b1b38 100644 --- a/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -227,7 +227,11 @@ spec: provided by SageMaker, see Algorithms (https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html). \n You can specify a maximum of 100 hyperparameters. Each hyperparameter is a key-value pair. Each key and value is limited to 256 characters, - as specified by the Length Constraint." + as specified by the Length Constraint. \n Do not include any security-sensitive + information including account access IDs, secrets or tokens in any + hyperparameter field. If the use of security-sensitive credentials + are detected, SageMaker will reject your training job request and + return an exception error." type: object inputDataConfig: description: "An array of Channel objects. 
Each channel is a named @@ -402,6 +406,9 @@ spec: type: integer instanceType: type: string + keepAlivePeriodInSeconds: + format: int64 + type: integer volumeKMSKeyID: type: string volumeSizeInGB: diff --git a/config/crd/bases/sagemaker.services.k8s.aws_userprofiles.yaml b/config/crd/bases/sagemaker.services.k8s.aws_userprofiles.yaml index 4d786053..17c7fd47 100644 --- a/config/crd/bases/sagemaker.services.k8s.aws_userprofiles.yaml +++ b/config/crd/bases/sagemaker.services.k8s.aws_userprofiles.yaml @@ -45,16 +45,16 @@ spec: singleSignOnUserIdentifier: description: A specifier for the type of value specified in SingleSignOnUserValue. Currently, the only supported value is "UserName". If the Domain's - AuthMode is Amazon Web Services SSO, this field is required. If - the Domain's AuthMode is not Amazon Web Services SSO, this field - cannot be specified. + AuthMode is IAM Identity Center, this field is required. If the + Domain's AuthMode is not IAM Identity Center, this field cannot + be specified. type: string singleSignOnUserValue: description: The username of the associated Amazon Web Services Single - Sign-On User for this UserProfile. If the Domain's AuthMode is Amazon - Web Services SSO, this field is required, and must match a valid + Sign-On User for this UserProfile. If the Domain's AuthMode is IAM + Identity Center, this field is required, and must match a valid username of a user in your directory. If the Domain's AuthMode is - not Amazon Web Services SSO, this field cannot be specified. + not IAM Identity Center, this field cannot be specified. type: string tags: description: "Each tag consists of a key and an optional value. 
Tag diff --git a/generator.yaml b/generator.yaml index f012d5ff..7830ea5a 100644 --- a/generator.yaml +++ b/generator.yaml @@ -887,3 +887,6 @@ ignore: - InstanceMetadataServiceConfiguration - InstanceGroups - InstanceGroupNames + - CanvasAppSettings + - ExplainerConfig + - HyperParameterTuningJobStrategyConfig diff --git a/go.mod b/go.mod index 96750da2..d8036f96 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.17 require ( github.com/aws-controllers-k8s/runtime v0.20.1 - github.com/aws/aws-sdk-go v1.44.93 + github.com/aws/aws-sdk-go v1.44.117 github.com/ghodss/yaml v1.0.0 github.com/go-logr/logr v1.2.0 github.com/google/go-cmp v0.5.5 diff --git a/go.sum b/go.sum index 395a546f..e6d6865b 100644 --- a/go.sum +++ b/go.sum @@ -66,8 +66,9 @@ github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/aws-controllers-k8s/runtime v0.20.1 h1:L/Huf1shRahx5BqJBCSS5u+vYg3f0Rotsq1jutORpdI= github.com/aws-controllers-k8s/runtime v0.20.1/go.mod h1:k7z4qlf6aK1Kzd4ff49wzcyhDKHjWaUpqxrwgl4uS1o= -github.com/aws/aws-sdk-go v1.44.93 h1:hAgd9fuaptBatSft27/5eBMdcA8+cIMqo96/tZ6rKl8= github.com/aws/aws-sdk-go v1.44.93/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo= +github.com/aws/aws-sdk-go v1.44.117 h1:mZuODB3Y4soG9QWAXyGb2po+6Easa/enifpj4MnZ91s= +github.com/aws/aws-sdk-go v1.44.117/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo= github.com/benbjohnson/clock v1.0.3/go.mod h1:bGMdMPoPVvcYyt1gHDf4J2KE153Yf9BuiUKYMaxlTDM= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= diff --git a/helm/crds/sagemaker.services.k8s.aws_apps.yaml b/helm/crds/sagemaker.services.k8s.aws_apps.yaml index 06e370d8..24cec98b 100644 --- a/helm/crds/sagemaker.services.k8s.aws_apps.yaml +++ 
b/helm/crds/sagemaker.services.k8s.aws_apps.yaml @@ -43,8 +43,7 @@ spec: description: The name of the app. type: string appType: - description: The type of app. Supported apps are JupyterServer and - KernelGateway. TensorBoard is not supported. + description: The type of app. type: string domainID: description: The domain ID. diff --git a/helm/crds/sagemaker.services.k8s.aws_hyperparametertuningjobs.yaml b/helm/crds/sagemaker.services.k8s.aws_hyperparametertuningjobs.yaml index 44c89e58..310f1f50 100644 --- a/helm/crds/sagemaker.services.k8s.aws_hyperparametertuningjobs.yaml +++ b/helm/crds/sagemaker.services.k8s.aws_hyperparametertuningjobs.yaml @@ -431,6 +431,9 @@ spec: type: integer instanceType: type: string + keepAlivePeriodInSeconds: + format: int64 + type: integer volumeKMSKeyID: type: string volumeSizeInGB: @@ -760,6 +763,9 @@ spec: type: integer instanceType: type: string + keepAlivePeriodInSeconds: + format: int64 + type: integer volumeKMSKeyID: type: string volumeSizeInGB: diff --git a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml index 93bafd0a..729b1b38 100644 --- a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -227,7 +227,11 @@ spec: provided by SageMaker, see Algorithms (https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html). \n You can specify a maximum of 100 hyperparameters. Each hyperparameter is a key-value pair. Each key and value is limited to 256 characters, - as specified by the Length Constraint." + as specified by the Length Constraint. \n Do not include any security-sensitive + information including account access IDs, secrets or tokens in any + hyperparameter field. If the use of security-sensitive credentials + are detected, SageMaker will reject your training job request and + return an exception error." type: object inputDataConfig: description: "An array of Channel objects. 
Each channel is a named @@ -402,6 +406,9 @@ spec: type: integer instanceType: type: string + keepAlivePeriodInSeconds: + format: int64 + type: integer volumeKMSKeyID: type: string volumeSizeInGB: diff --git a/helm/crds/sagemaker.services.k8s.aws_userprofiles.yaml b/helm/crds/sagemaker.services.k8s.aws_userprofiles.yaml index 4d786053..17c7fd47 100644 --- a/helm/crds/sagemaker.services.k8s.aws_userprofiles.yaml +++ b/helm/crds/sagemaker.services.k8s.aws_userprofiles.yaml @@ -45,16 +45,16 @@ spec: singleSignOnUserIdentifier: description: A specifier for the type of value specified in SingleSignOnUserValue. Currently, the only supported value is "UserName". If the Domain's - AuthMode is Amazon Web Services SSO, this field is required. If - the Domain's AuthMode is not Amazon Web Services SSO, this field - cannot be specified. + AuthMode is IAM Identity Center, this field is required. If the + Domain's AuthMode is not IAM Identity Center, this field cannot + be specified. type: string singleSignOnUserValue: description: The username of the associated Amazon Web Services Single - Sign-On User for this UserProfile. If the Domain's AuthMode is Amazon - Web Services SSO, this field is required, and must match a valid + Sign-On User for this UserProfile. If the Domain's AuthMode is IAM + Identity Center, this field is required, and must match a valid username of a user in your directory. If the Domain's AuthMode is - not Amazon Web Services SSO, this field cannot be specified. + not IAM Identity Center, this field cannot be specified. type: string tags: description: "Each tag consists of a key and an optional value. 
Tag diff --git a/pkg/resource/hyper_parameter_tuning_job/delta.go b/pkg/resource/hyper_parameter_tuning_job/delta.go index b81f6164..717cd447 100644 --- a/pkg/resource/hyper_parameter_tuning_job/delta.go +++ b/pkg/resource/hyper_parameter_tuning_job/delta.go @@ -255,6 +255,13 @@ func newResourceDelta( delta.Add("Spec.TrainingJobDefinition.ResourceConfig.InstanceType", a.ko.Spec.TrainingJobDefinition.ResourceConfig.InstanceType, b.ko.Spec.TrainingJobDefinition.ResourceConfig.InstanceType) } } + if ackcompare.HasNilDifference(a.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds, b.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds) { + delta.Add("Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds", a.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds, b.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds) + } else if a.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds != nil && b.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds != nil { + if *a.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds != *b.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds { + delta.Add("Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds", a.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds, b.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds) + } + } if ackcompare.HasNilDifference(a.ko.Spec.TrainingJobDefinition.ResourceConfig.VolumeKMSKeyID, b.ko.Spec.TrainingJobDefinition.ResourceConfig.VolumeKMSKeyID) { delta.Add("Spec.TrainingJobDefinition.ResourceConfig.VolumeKMSKeyID", a.ko.Spec.TrainingJobDefinition.ResourceConfig.VolumeKMSKeyID, b.ko.Spec.TrainingJobDefinition.ResourceConfig.VolumeKMSKeyID) } else if a.ko.Spec.TrainingJobDefinition.ResourceConfig.VolumeKMSKeyID != nil && b.ko.Spec.TrainingJobDefinition.ResourceConfig.VolumeKMSKeyID != nil { diff --git 
a/pkg/resource/hyper_parameter_tuning_job/sdk.go b/pkg/resource/hyper_parameter_tuning_job/sdk.go index 3650f85e..271f1f91 100644 --- a/pkg/resource/hyper_parameter_tuning_job/sdk.go +++ b/pkg/resource/hyper_parameter_tuning_job/sdk.go @@ -533,6 +533,9 @@ func (rm *resourceManager) sdkFind( if resp.TrainingJobDefinition.ResourceConfig.InstanceType != nil { f11f9.InstanceType = resp.TrainingJobDefinition.ResourceConfig.InstanceType } + if resp.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds != nil { + f11f9.KeepAlivePeriodInSeconds = resp.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds + } if resp.TrainingJobDefinition.ResourceConfig.VolumeKmsKeyId != nil { f11f9.VolumeKMSKeyID = resp.TrainingJobDefinition.ResourceConfig.VolumeKmsKeyId } @@ -806,6 +809,9 @@ func (rm *resourceManager) sdkFind( if f12iter.ResourceConfig.InstanceType != nil { f12elemf9.InstanceType = f12iter.ResourceConfig.InstanceType } + if f12iter.ResourceConfig.KeepAlivePeriodInSeconds != nil { + f12elemf9.KeepAlivePeriodInSeconds = f12iter.ResourceConfig.KeepAlivePeriodInSeconds + } if f12iter.ResourceConfig.VolumeKmsKeyId != nil { f12elemf9.VolumeKMSKeyID = f12iter.ResourceConfig.VolumeKmsKeyId } @@ -1295,6 +1301,9 @@ func (rm *resourceManager) newCreateRequestPayload( if r.ko.Spec.TrainingJobDefinition.ResourceConfig.InstanceType != nil { f3f9.SetInstanceType(*r.ko.Spec.TrainingJobDefinition.ResourceConfig.InstanceType) } + if r.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds != nil { + f3f9.SetKeepAlivePeriodInSeconds(*r.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds) + } if r.ko.Spec.TrainingJobDefinition.ResourceConfig.VolumeKMSKeyID != nil { f3f9.SetVolumeKmsKeyId(*r.ko.Spec.TrainingJobDefinition.ResourceConfig.VolumeKMSKeyID) } @@ -1566,6 +1575,9 @@ func (rm *resourceManager) newCreateRequestPayload( if f4iter.ResourceConfig.InstanceType != nil { f4elemf9.SetInstanceType(*f4iter.ResourceConfig.InstanceType) } + if 
f4iter.ResourceConfig.KeepAlivePeriodInSeconds != nil { + f4elemf9.SetKeepAlivePeriodInSeconds(*f4iter.ResourceConfig.KeepAlivePeriodInSeconds) + } if f4iter.ResourceConfig.VolumeKMSKeyID != nil { f4elemf9.SetVolumeKmsKeyId(*f4iter.ResourceConfig.VolumeKMSKeyID) } diff --git a/pkg/resource/training_job/delta.go b/pkg/resource/training_job/delta.go index 094073b4..251c2246 100644 --- a/pkg/resource/training_job/delta.go +++ b/pkg/resource/training_job/delta.go @@ -249,6 +249,13 @@ func newResourceDelta( delta.Add("Spec.ResourceConfig.InstanceType", a.ko.Spec.ResourceConfig.InstanceType, b.ko.Spec.ResourceConfig.InstanceType) } } + if ackcompare.HasNilDifference(a.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds, b.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) { + delta.Add("Spec.ResourceConfig.KeepAlivePeriodInSeconds", a.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds, b.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) + } else if a.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds != nil && b.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds != nil { + if *a.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds != *b.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds { + delta.Add("Spec.ResourceConfig.KeepAlivePeriodInSeconds", a.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds, b.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) + } + } if ackcompare.HasNilDifference(a.ko.Spec.ResourceConfig.VolumeKMSKeyID, b.ko.Spec.ResourceConfig.VolumeKMSKeyID) { delta.Add("Spec.ResourceConfig.VolumeKMSKeyID", a.ko.Spec.ResourceConfig.VolumeKMSKeyID, b.ko.Spec.ResourceConfig.VolumeKMSKeyID) } else if a.ko.Spec.ResourceConfig.VolumeKMSKeyID != nil && b.ko.Spec.ResourceConfig.VolumeKMSKeyID != nil { diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index d456257a..5147c6fe 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -477,6 +477,9 @@ func (rm *resourceManager) sdkFind( if resp.ResourceConfig.InstanceType != nil { 
f25.InstanceType = resp.ResourceConfig.InstanceType } + if resp.ResourceConfig.KeepAlivePeriodInSeconds != nil { + f25.KeepAlivePeriodInSeconds = resp.ResourceConfig.KeepAlivePeriodInSeconds + } if resp.ResourceConfig.VolumeKmsKeyId != nil { f25.VolumeKMSKeyID = resp.ResourceConfig.VolumeKmsKeyId } @@ -938,6 +941,9 @@ func (rm *resourceManager) newCreateRequestPayload( if r.ko.Spec.ResourceConfig.InstanceType != nil { f14.SetInstanceType(*r.ko.Spec.ResourceConfig.InstanceType) } + if r.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds != nil { + f14.SetKeepAlivePeriodInSeconds(*r.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) + } if r.ko.Spec.ResourceConfig.VolumeKMSKeyID != nil { f14.SetVolumeKmsKeyId(*r.ko.Spec.ResourceConfig.VolumeKMSKeyID) } From abf0ae60f017ae1299d270ddf9ede0dc96d3e78c Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 21 Oct 2022 22:11:33 +0000 Subject: [PATCH 02/71] added custom code --- apis/v1alpha1/ack-generate-metadata.yaml | 6 +++--- apis/v1alpha1/generator.yaml | 11 +++++++--- apis/v1alpha1/training_job.go | 3 +++ apis/v1alpha1/zz_generated.deepcopy.go | 5 +++++ ...gemaker.services.k8s.aws_trainingjobs.yaml | 12 +++++++++++ generator.yaml | 11 +++++++--- ...gemaker.services.k8s.aws_trainingjobs.yaml | 12 +++++++++++ pkg/resource/training_job/hooks.go | 20 +++++++++++++++++++ pkg/resource/training_job/sdk.go | 19 ++++++++++++++++++ .../sdk_read_one_post_set_output.go.tpl | 5 +++++ 10 files changed, 95 insertions(+), 9 deletions(-) create mode 100644 templates/training_job/sdk_read_one_post_set_output.go.tpl diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 99198692..b6cff421 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,13 +1,13 @@ ack_generate_info: - build_date: "2022-10-21T18:02:34Z" + build_date: "2022-10-21T22:09:47Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc 
-api_directory_checksum: 255bd4888fa527d059365a19726d1848c42177e5 +api_directory_checksum: ec00d2d3297b1fdca95beb0ee9c7751e9e83573a api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: 86528885e7155246332cc5d9d40a017285b34456 + file_checksum: 53eb44592068bb0332abf7fa99b385f9b560ff32 original_file_name: generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index 7830ea5a..b4933566 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -127,7 +127,7 @@ resources: delta_pre_compare: code: customSetDefaults(a, b) sdk_read_one_post_set_output: - code: rm.customSetOutput(&resource{ko}) + template_path: training_job/sdk_read_one_post_set_output.go.tpl sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: @@ -169,7 +169,12 @@ resources: priority: 1 from: operation: DescribeTrainingJob - path: FailureReason + path: FailureReason + WarmPoolStatus: + is_read_only: true + from: + operation: DescribeTrainingJob + path: WarmPoolStatus AlgorithmSpecification.MetricDefinitions: compare: is_ignored: true @@ -889,4 +894,4 @@ ignore: - InstanceGroupNames - CanvasAppSettings - ExplainerConfig - - HyperParameterTuningJobStrategyConfig + - HyperParameterTuningJobStrategyConfig \ No newline at end of file diff --git a/apis/v1alpha1/training_job.go b/apis/v1alpha1/training_job.go index b9e1a264..cf349c11 100644 --- a/apis/v1alpha1/training_job.go +++ b/apis/v1alpha1/training_job.go @@ -260,6 +260,9 @@ type TrainingJobStatus struct { // For more detailed information, see SecondaryStatus. // +kubebuilder:validation:Optional TrainingJobStatus *string `json:"trainingJobStatus,omitempty"` + // The status of the warm pool associated with the training job. 
+ // +kubebuilder:validation:Optional + WarmPoolStatus *WarmPoolStatus `json:"warmPoolStatus,omitempty"` } // TrainingJob is the Schema for the TrainingJobs API diff --git a/apis/v1alpha1/zz_generated.deepcopy.go b/apis/v1alpha1/zz_generated.deepcopy.go index ff05bc14..8b272a64 100644 --- a/apis/v1alpha1/zz_generated.deepcopy.go +++ b/apis/v1alpha1/zz_generated.deepcopy.go @@ -12758,6 +12758,11 @@ func (in *TrainingJobStatus) DeepCopyInto(out *TrainingJobStatus) { *out = new(string) **out = **in } + if in.WarmPoolStatus != nil { + in, out := &in.WarmPoolStatus, &out.WarmPoolStatus + *out = new(WarmPoolStatus) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobStatus. diff --git a/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml b/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml index 729b1b38..6f60c60d 100644 --- a/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -652,6 +652,18 @@ spec: - The training job has stopped. \n For more detailed information, see SecondaryStatus." type: string + warmPoolStatus: + description: The status of the warm pool associated with the training + job. 
+ properties: + resourceRetainedBillableTimeInSeconds: + format: int64 + type: integer + reusedByJob: + type: string + status: + type: string + type: object type: object type: object served: true diff --git a/generator.yaml b/generator.yaml index 7830ea5a..b4933566 100644 --- a/generator.yaml +++ b/generator.yaml @@ -127,7 +127,7 @@ resources: delta_pre_compare: code: customSetDefaults(a, b) sdk_read_one_post_set_output: - code: rm.customSetOutput(&resource{ko}) + template_path: training_job/sdk_read_one_post_set_output.go.tpl sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: @@ -169,7 +169,12 @@ resources: priority: 1 from: operation: DescribeTrainingJob - path: FailureReason + path: FailureReason + WarmPoolStatus: + is_read_only: true + from: + operation: DescribeTrainingJob + path: WarmPoolStatus AlgorithmSpecification.MetricDefinitions: compare: is_ignored: true @@ -889,4 +894,4 @@ ignore: - InstanceGroupNames - CanvasAppSettings - ExplainerConfig - - HyperParameterTuningJobStrategyConfig + - HyperParameterTuningJobStrategyConfig \ No newline at end of file diff --git a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml index 729b1b38..6f60c60d 100644 --- a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -652,6 +652,18 @@ spec: - The training job has stopped. \n For more detailed information, see SecondaryStatus." type: string + warmPoolStatus: + description: The status of the warm pool associated with the training + job. 
+ properties: + resourceRetainedBillableTimeInSeconds: + format: int64 + type: integer + reusedByJob: + type: string + status: + type: string + type: object type: object type: object served: true diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 6efd865f..78cbd635 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -16,6 +16,7 @@ package training_job import ( "errors" + ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" ackrequeue "github.com/aws-controllers-k8s/runtime/pkg/requeue" svccommon "github.com/aws-controllers-k8s/sagemaker-controller/pkg/common" "github.com/aws/aws-sdk-go/aws" @@ -31,12 +32,21 @@ var ( svcsdk.RuleEvaluationStatusInProgress, svcsdk.RuleEvaluationStatusStopping, } + WarmPoolModifyingStatuses = []string{ + svcsdk.WarmPoolResourceStatusAvailable, + svcsdk.WarmPoolResourceStatusInUse, + } resourceName = GroupKind.Kind requeueWaitWhileDeleting = ackrequeue.NeededAfter( errors.New(resourceName+" is Stopping."), ackrequeue.DefaultRequeueAfterDuration, ) + + requeueWaitWhileWarmPoolInUse = ackrequeue.NeededAfter( + errors.New("Warm Pool Cluster is still active."), + ackrequeue.DefaultRequeueAfterDuration, + ) ) // customSetOutput sets the resource ResourceSynced condition to False if @@ -65,3 +75,13 @@ func (rm *resourceManager) customSetOutput(r *resource) { svccommon.SetSyncedCondition(r, trainingJobStatus, &resourceName, &trainingJobModifyingStatuses) } + +func (rm *resourceManager) customSetWarmPoolOutput(r *resource) error { + if ackcompare.IsNil(r.ko.Status.WarmPoolStatus) { + return nil + } + if svccommon.IsModifyingStatus(r.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) { + return requeueWaitWhileWarmPoolInUse + } + return nil +} diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 5147c6fe..0817e995 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -565,9 
+565,28 @@ func (rm *resourceManager) sdkFind( } else { ko.Spec.VPCConfig = nil } + if resp.WarmPoolStatus != nil { + f39 := &svcapitypes.WarmPoolStatus{} + if resp.WarmPoolStatus.ResourceRetainedBillableTimeInSeconds != nil { + f39.ResourceRetainedBillableTimeInSeconds = resp.WarmPoolStatus.ResourceRetainedBillableTimeInSeconds + } + if resp.WarmPoolStatus.ReusedByJob != nil { + f39.ReusedByJob = resp.WarmPoolStatus.ReusedByJob + } + if resp.WarmPoolStatus.Status != nil { + f39.Status = resp.WarmPoolStatus.Status + } + ko.Status.WarmPoolStatus = f39 + } else { + ko.Status.WarmPoolStatus = nil + } rm.setStatusDefaults(ko) rm.customSetOutput(&resource{ko}) + wp_err := rm.customSetWarmPoolOutput(&resource{ko}) + if wp_err != nil { + return &resource{ko}, wp_err + } return &resource{ko}, nil } diff --git a/templates/training_job/sdk_read_one_post_set_output.go.tpl b/templates/training_job/sdk_read_one_post_set_output.go.tpl new file mode 100644 index 00000000..0467b2b7 --- /dev/null +++ b/templates/training_job/sdk_read_one_post_set_output.go.tpl @@ -0,0 +1,5 @@ +rm.customSetOutput(&resource{ko}) +wp_err := rm.customSetWarmPoolOutput(&resource{ko}) +if wp_err != nil{ + return &resource{ko}, wp_err +} \ No newline at end of file From e9458605416e418572ede2a31639ca9a066c56a4 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 24 Oct 2022 20:54:43 +0000 Subject: [PATCH 03/71] test: unit tests --- pkg/resource/training_job/hooks.go | 1 + .../describe/warmpool_describe_available.json | 191 +++++++++++++++++ .../warmpool_describe_terminated.json | 192 ++++++++++++++++++ .../training_job/testdata/test_suite.yaml | 20 ++ .../readone/desired/warmpool_available.yaml | 59 ++++++ .../readone/desired/warmpool_terminated.yaml | 59 ++++++ .../readone/observed/warmpool_available.yaml | 116 +++++++++++ .../readone/observed/warmpool_terminated.yaml | 113 +++++++++++ 8 files changed, 751 insertions(+) create mode 100644 
pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_available.json create mode 100644 pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_terminated.json create mode 100644 pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_available.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_terminated.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 78cbd635..4d1375fc 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -76,6 +76,7 @@ func (rm *resourceManager) customSetOutput(r *resource) { svccommon.SetSyncedCondition(r, trainingJobStatus, &resourceName, &trainingJobModifyingStatuses) } +// Requeue whenever Warmpool cluster is in Available or Inuse state. 
func (rm *resourceManager) customSetWarmPoolOutput(r *resource) error { if ackcompare.IsNil(r.ko.Status.WarmPoolStatus) { return nil diff --git a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_available.json b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_available.json new file mode 100644 index 00000000..d7e0d1f9 --- /dev/null +++ b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_available.json @@ -0,0 +1,191 @@ +{ + "AlgorithmSpecification": { + "AlgorithmName": null, + "EnableSageMakerMetricsTimeSeries": false, + "MetricDefinitions": [ + { + "Name": "train:mae", + "Regex": ".*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:auc", + "Regex": ".*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:merror", + "Regex": ".*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:auc", + "Regex": ".*\\[[0-9]+\\].*#011train-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mae", + "Regex": ".*\\[[0-9]+\\].*#011validation-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:error", + "Regex": ".*\\[[0-9]+\\].*#011validation-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:merror", + "Regex": ".*\\[[0-9]+\\].*#011validation-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:logloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:rmse", + "Regex": ".*\\[[0-9]+\\].*#011train-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:logloss", + "Regex": ".*\\[[0-9]+\\].*#011train-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:mlogloss", + "Regex": 
".*\\[[0-9]+\\].*#011train-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:rmse", + "Regex": ".*\\[[0-9]+\\].*#011validation-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011validation-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:error", + "Regex": ".*\\[[0-9]+\\].*#011train-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011train-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:map", + "Regex": ".*\\[[0-9]+\\].*#011train-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:map", + "Regex": ".*\\[[0-9]+\\].*#011validation-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + } + ], + "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "TrainingInputMode": "File" + }, + "AutoMLJobArn": null, + "BillableTimeInSeconds": null, + "CheckpointConfig": null, + "CreationTime": "2021-10-12T05:49:40.493Z", + "DebugHookConfig": null, + "DebugRuleConfigurations": null, + "DebugRuleEvaluationStatuses": null, + "EnableInterContainerTrafficEncryption": false, + "EnableManagedSpotTraining": false, + "EnableNetworkIsolation": false, + "Environment": null, + "ExperimentConfig": null, + "FailureReason": null, + "FinalMetricDataList": null, + "HyperParameters": { + "eta": "0.2", + "gamma": "4", + "max_depth": "5", + "min_child_weight": "6", + "num_class": "10", + "num_round": "10", + "objective": "multi:softmax", + "silent": "0" + }, + "InputDataConfig": [ + { + "ChannelName": "train", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "FileSystemDataSource": null, + "S3DataSource": { + "AttributeNames": null, + 
"S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train" + } + }, + "InputMode": null, + "RecordWrapperType": "None", + "ShuffleConfig": null + }, + { + "ChannelName": "validation", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "FileSystemDataSource": null, + "S3DataSource": { + "AttributeNames": null, + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation" + } + }, + "InputMode": null, + "RecordWrapperType": "None", + "ShuffleConfig": null + } + ], + "LabelingJobArn": null, + "LastModifiedTime": "2021-10-12T05:52:46.108Z", + "ModelArtifacts": null, + "OutputDataConfig": { + "KmsKeyId": "", + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output" + }, + "ProfilerConfig": null, + "ProfilerRuleConfigurations": null, + "ProfilerRuleEvaluationStatuses": null, + "ProfilingStatus": "Disabled", + "ResourceConfig": { + "InstanceCount": 1, + "InstanceType": "ml.m4.xlarge", + "VolumeKmsKeyId": null, + "VolumeSizeInGB": 5 + }, + "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker", + "SecondaryStatus": "Completed", + "SecondaryStatusTransitions": [ + { + "EndTime": "2021-10-12T05:52:46.108Z", + "StartTime": "2021-10-12T05:49:40.493Z", + "Status": "Starting", + "StatusMessage": "Preparing the instances for training" + }, + { + "EndTime": null, + "StartTime": "2021-10-12T05:52:46.108Z", + "Status": "Downloading", + "StatusMessage": "Downloading input data" + } + ], + "StoppingCondition": { + "MaxRuntimeInSeconds": 86400, + "MaxWaitTimeInSeconds": null + }, + "WarmPoolStatus": { + "Status":"Available" + }, + "TensorBoardOutputConfig": null, + "TrainingEndTime": null, + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job", + "TrainingJobName": 
"xgboost-training-job", + "TrainingJobStatus": "Completed", + "TrainingStartTime": "2021-10-12T05:52:46.108Z", + "TrainingTimeInSeconds": 31, + "TuningJobArn": null, + "VpcConfig": null +} \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_terminated.json b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_terminated.json new file mode 100644 index 00000000..a9da8b31 --- /dev/null +++ b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_terminated.json @@ -0,0 +1,192 @@ +{ + "AlgorithmSpecification": { + "AlgorithmName": null, + "EnableSageMakerMetricsTimeSeries": false, + "MetricDefinitions": [ + { + "Name": "train:mae", + "Regex": ".*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:auc", + "Regex": ".*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:merror", + "Regex": ".*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:auc", + "Regex": ".*\\[[0-9]+\\].*#011train-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mae", + "Regex": ".*\\[[0-9]+\\].*#011validation-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:error", + "Regex": ".*\\[[0-9]+\\].*#011validation-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:merror", + "Regex": ".*\\[[0-9]+\\].*#011validation-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:logloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:rmse", + "Regex": ".*\\[[0-9]+\\].*#011train-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:logloss", + "Regex": ".*\\[[0-9]+\\].*#011train-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + 
"Name": "train:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011train-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:rmse", + "Regex": ".*\\[[0-9]+\\].*#011validation-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011validation-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:error", + "Regex": ".*\\[[0-9]+\\].*#011train-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011train-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:map", + "Regex": ".*\\[[0-9]+\\].*#011train-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:map", + "Regex": ".*\\[[0-9]+\\].*#011validation-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + } + ], + "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "TrainingInputMode": "File" + }, + "AutoMLJobArn": null, + "BillableTimeInSeconds": null, + "CheckpointConfig": null, + "CreationTime": "2021-10-12T05:49:40.493Z", + "DebugHookConfig": null, + "DebugRuleConfigurations": null, + "DebugRuleEvaluationStatuses": null, + "EnableInterContainerTrafficEncryption": false, + "EnableManagedSpotTraining": false, + "EnableNetworkIsolation": false, + "Environment": null, + "ExperimentConfig": null, + "FailureReason": null, + "FinalMetricDataList": null, + "HyperParameters": { + "eta": "0.2", + "gamma": "4", + "max_depth": "5", + "min_child_weight": "6", + "num_class": "10", + "num_round": "10", + "objective": "multi:softmax", + "silent": "0" + }, + "InputDataConfig": [ + { + "ChannelName": "train", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "FileSystemDataSource": null, + "S3DataSource": { + 
"AttributeNames": null, + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train" + } + }, + "InputMode": null, + "RecordWrapperType": "None", + "ShuffleConfig": null + }, + { + "ChannelName": "validation", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "FileSystemDataSource": null, + "S3DataSource": { + "AttributeNames": null, + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation" + } + }, + "InputMode": null, + "RecordWrapperType": "None", + "ShuffleConfig": null + } + ], + "LabelingJobArn": null, + "LastModifiedTime": "2021-10-12T05:52:46.108Z", + "ModelArtifacts": null, + "OutputDataConfig": { + "KmsKeyId": "", + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output" + }, + "ProfilerConfig": null, + "ProfilerRuleConfigurations": null, + "ProfilerRuleEvaluationStatuses": null, + "ProfilingStatus": "Disabled", + "ResourceConfig": { + "InstanceCount": 1, + "InstanceType": "ml.m4.xlarge", + "VolumeKmsKeyId": null, + "VolumeSizeInGB": 5 + }, + "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker", + "SecondaryStatus": "Completed", + "SecondaryStatusTransitions": [ + { + "EndTime": "2021-10-12T05:52:46.108Z", + "StartTime": "2021-10-12T05:49:40.493Z", + "Status": "Starting", + "StatusMessage": "Preparing the instances for training" + }, + { + "EndTime": null, + "StartTime": "2021-10-12T05:52:46.108Z", + "Status": "Downloading", + "StatusMessage": "Downloading input data" + } + ], + "StoppingCondition": { + "MaxRuntimeInSeconds": 86400, + "MaxWaitTimeInSeconds": null + }, + "WarmPoolStatus": { + "Status":"Terminated", + "ResourceRetainedBillableTimeInSeconds": 69 + }, + "TensorBoardOutputConfig": null, + "TrainingEndTime": null, + "TrainingJobArn": 
"arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job", + "TrainingJobName": "xgboost-training-job", + "TrainingJobStatus": "Completed", + "TrainingStartTime": "2021-10-12T05:52:46.108Z", + "TrainingTimeInSeconds": 31, + "TuningJobArn": null, + "VpcConfig": null +} \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index 82eeab46..87e63547 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -142,6 +142,26 @@ tests: invoke: ReadOne expect: latest_state: "v1alpha1/readone/observed/stopping_debugger_variation.yaml" + - name: "ReadOne=WarmPool" + description: "Test readOne after training job finishes when user specifies a warm pool" + given: + desired_state: "v1alpha1/readone/desired/warmpool_available.yaml" + svc_api: + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/describe/warmpool_describe_available.json" + invoke: ReadOne + expect: + latest_state: "v1alpha1/readone/observed/warmpool_available.yaml" + - name: "ReadOne=WarmPoolTerminated" + description: "Test readOne after training job and warm pool complete/terminate." 
+ given: + desired_state: "v1alpha1/readone/desired/warmpool_terminated.yaml" + svc_api: + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/describe/warmpool_describe_terminated.json" + invoke: ReadOne + expect: + latest_state: "v1alpha1/readone/observed/warmpool_terminated.yaml" - name: "Training job update tests" description: "Testing the Update operation" scenarios: diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_available.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_available.yaml new file mode 100644 index 00000000..bbf82578 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_available.yaml @@ -0,0 +1,59 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: unit-testing-training-job +spec: + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAliveInSeconds: 70 + roleARN: arn:aws:iam::123456789012:role/ack-sagemaker-execution-role + 
stoppingCondition: + maxRuntimeInSeconds: 86400 + tags: + - key: algorithm + value: xgboost + - key: environment + value: testing + - key: customer + value: test-user + trainingJobName: xgboost-training-job +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job + ownerAccountID: "" + region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_terminated.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_terminated.yaml new file mode 100644 index 00000000..bbf82578 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_terminated.yaml @@ -0,0 +1,59 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: unit-testing-training-job +spec: + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAliveInSeconds: 70 + roleARN: arn:aws:iam::123456789012:role/ack-sagemaker-execution-role + stoppingCondition: + 
maxRuntimeInSeconds: 86400 + tags: + - key: algorithm + value: xgboost + - key: environment + value: testing + - key: customer + value: test-user + trainingJobName: xgboost-training-job +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job + ownerAccountID: "" + region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml new file mode 100644 index 00000000..b0081201 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: unit-testing-training-job +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: 
.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAliveInSeconds: 70 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + tags: + - key: algorithm + value: xgboost + - key: environment + value: testing + - key: customer + value: test-user + trainingJobName: xgboost-training-job +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in Completed status. + status: "True" + type: ACK.ResourceSynced + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: Warm Pool Cluster is still active. + status: "True" + type: ACK.Recoverable + warmPoolStatus: + status: Available + secondaryStatus: Completed + trainingJobStatus: Completed + diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml new file mode 100644 index 00000000..793576d8 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml @@ -0,0 +1,113 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: unit-testing-training-job +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: 
.*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax 
+ silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAliveInSeconds: 70 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + tags: + - key: algorithm + value: xgboost + - key: environment + value: testing + - key: customer + value: test-user + trainingJobName: xgboost-training-job +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in Completed status. 
+ status: "True" + type: ACK.ResourceSynced + warmPoolStatus: + status: Terminated + resourceRetainedBillableTimeInSeconds: 69 + secondaryStatus: Completed + trainingJobStatus: Completed + From abece9a5cca5bcea3c5ec2a78a1b2b22a38623e1 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 25 Oct 2022 02:29:46 +0000 Subject: [PATCH 04/71] fix: hack for HPO and warmpool --- .../hyper_parameter_tuning_job/custom_delta.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pkg/resource/hyper_parameter_tuning_job/custom_delta.go b/pkg/resource/hyper_parameter_tuning_job/custom_delta.go index 2f5b5eec..3f704980 100644 --- a/pkg/resource/hyper_parameter_tuning_job/custom_delta.go +++ b/pkg/resource/hyper_parameter_tuning_job/custom_delta.go @@ -31,5 +31,18 @@ func customSetDefaults( } } } + // TODO: Remove the block below. + // The server side default of KeepAlivePeriodInSeconds is nil, when launching a HPO job. + // The code generator currently cannot ignore the field path for resourceConfig.KeepAlivePeriodInSeconds + // without also ignoring Trainingjob. 
This block below should be removed once the code generator supports + // removing fields like resourceConfig.KeepAlivePeriodInSeconds + if ackcompare.IsNotNil(a.ko.Spec.TrainingJobDefinition) && ackcompare.IsNotNil(b.ko.Spec.TrainingJobDefinition) { + if ackcompare.IsNotNil(a.ko.Spec.TrainingJobDefinition.ResourceConfig) && ackcompare.IsNotNil(b.ko.Spec.TrainingJobDefinition.ResourceConfig) { + if ackcompare.IsNotNil(a.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds) && ackcompare.IsNil(b.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds) { + a.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds = nil + } + } + } } + } From cd58a3cd0a9eb2e90785e7092255ef8aa096e2b3 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 25 Oct 2022 20:32:48 +0000 Subject: [PATCH 05/71] fix: futureproofing --- pkg/resource/training_job/hooks.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 4d1375fc..0651c8bc 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -81,6 +81,13 @@ func (rm *resourceManager) customSetWarmPoolOutput(r *resource) error { if ackcompare.IsNil(r.ko.Status.WarmPoolStatus) { return nil } + + trainingJobStatus := r.ko.Status.TrainingJobStatus + // Currently the warm pool status does not appear in the api resonse, but that could change + // in the future, warm pool should only dealt with after the training job finishes. 
+ if trainingJobStatus != nil && *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { + return nil + } if svccommon.IsModifyingStatus(r.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) { return requeueWaitWhileWarmPoolInUse } From 362f7ff1ac5d3ba813e1d99eecdc1f75be66aa91 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 25 Oct 2022 20:56:55 +0000 Subject: [PATCH 06/71] format: reformat one file --- pkg/resource/hyper_parameter_tuning_job/custom_delta.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/resource/hyper_parameter_tuning_job/custom_delta.go b/pkg/resource/hyper_parameter_tuning_job/custom_delta.go index 3f704980..7c2bfb48 100644 --- a/pkg/resource/hyper_parameter_tuning_job/custom_delta.go +++ b/pkg/resource/hyper_parameter_tuning_job/custom_delta.go @@ -44,5 +44,4 @@ func customSetDefaults( } } } - } From 3318693576e4310b49ca4b9be8ce99251cb3b3a5 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Thu, 27 Oct 2022 22:00:21 +0000 Subject: [PATCH 07/71] fix: made pr changes --- apis/v1alpha1/ack-generate-metadata.yaml | 4 +- .../custom_delta.go | 12 ++ pkg/resource/training_job/hooks.go | 21 +-- pkg/resource/training_job/sdk.go | 3 +- .../describe/warmpool_describe_available.json | 39 +---- .../describe/warmpool_describe_inuse.json | 160 +++++++++++++++++ .../describe/warmpool_describe_reused.json | 162 ++++++++++++++++++ .../warmpool_describe_terminated.json | 39 +---- .../training_job/testdata/test_suite.yaml | 24 ++- ...{warmpool_available.yaml => warmpool.yaml} | 0 .../readone/desired/warmpool_terminated.yaml | 59 ------- .../readone/observed/warmpool_available.yaml | 5 +- .../readone/observed/warmpool_inuse.yaml | 111 ++++++++++++ .../readone/observed/warmpool_reused.yaml | 113 ++++++++++++ .../sdk_read_one_post_set_output.go.tpl | 3 +- 15 files changed, 601 insertions(+), 154 deletions(-) create mode 100644 pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_inuse.json create mode 100644 
pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_reused.json rename pkg/resource/training_job/testdata/v1alpha1/readone/desired/{warmpool_available.yaml => warmpool.yaml} (100%) delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_terminated.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index b6cff421..9bc022d8 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,9 +1,9 @@ ack_generate_info: - build_date: "2022-10-21T22:09:47Z" + build_date: "2022-10-27T21:59:13Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc -api_directory_checksum: ec00d2d3297b1fdca95beb0ee9c7751e9e83573a +api_directory_checksum: afd89795e3cceb09d028722ecbd9512b6197eb6a api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: diff --git a/pkg/resource/hyper_parameter_tuning_job/custom_delta.go b/pkg/resource/hyper_parameter_tuning_job/custom_delta.go index 7c2bfb48..82e89680 100644 --- a/pkg/resource/hyper_parameter_tuning_job/custom_delta.go +++ b/pkg/resource/hyper_parameter_tuning_job/custom_delta.go @@ -43,5 +43,17 @@ func customSetDefaults( } } } + // HPO does not support custom warm pool durations and the Server side default will + // always be nil. 
+ if ackcompare.IsNotNil(a.ko.Spec.TrainingJobDefinitions) { + for i, trainDefinition := range a.ko.Spec.TrainingJobDefinitions { + if ackcompare.IsNotNil(trainDefinition) { + if ackcompare.IsNotNil(trainDefinition.ResourceConfig) && ackcompare.IsNotNil(trainDefinition.ResourceConfig.KeepAlivePeriodInSeconds) { + a.ko.Spec.TrainingJobDefinitions[i].ResourceConfig.KeepAlivePeriodInSeconds = nil + } + } + } + } + } } diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 0651c8bc..f9428e85 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -44,52 +44,45 @@ var ( ) requeueWaitWhileWarmPoolInUse = ackrequeue.NeededAfter( - errors.New("Warm Pool Cluster is still active."), + errors.New("Provisioned infrastructure is still being retained."), ackrequeue.DefaultRequeueAfterDuration, ) ) // customSetOutput sets the resource ResourceSynced condition to False if // TrainingJob is being modified by AWS. It checks for debug and profiler rule status in addition to TrainingJobStatus -func (rm *resourceManager) customSetOutput(r *resource) { +func (rm *resourceManager) customSetOutput(r *resource) error { trainingJobStatus := r.ko.Status.TrainingJobStatus // early exit if training job is InProgress if trainingJobStatus != nil && *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { svccommon.SetSyncedCondition(r, trainingJobStatus, &resourceName, &trainingJobModifyingStatuses) - return + return nil } for _, rule := range r.ko.Status.DebugRuleEvaluationStatuses { if rule.RuleEvaluationStatus != nil && svccommon.IsModifyingStatus(rule.RuleEvaluationStatus, &ruleModifyingStatuses) { svccommon.SetSyncedCondition(r, rule.RuleEvaluationStatus, aws.String("DebugRule"), &ruleModifyingStatuses) - return + return nil } } for _, rule := range r.ko.Status.ProfilerRuleEvaluationStatuses { if rule.RuleEvaluationStatus != nil && svccommon.IsModifyingStatus(rule.RuleEvaluationStatus, &ruleModifyingStatuses) { 
svccommon.SetSyncedCondition(r, rule.RuleEvaluationStatus, aws.String("ProfilerRule"), &ruleModifyingStatuses) - return + return nil } } svccommon.SetSyncedCondition(r, trainingJobStatus, &resourceName, &trainingJobModifyingStatuses) -} -// Requeue whenever Warmpool cluster is in Available or Inuse state. -func (rm *resourceManager) customSetWarmPoolOutput(r *resource) error { + // Requeue whenever Warmpool cluster is in Available or Inuse state. if ackcompare.IsNil(r.ko.Status.WarmPoolStatus) { return nil } - trainingJobStatus := r.ko.Status.TrainingJobStatus - // Currently the warm pool status does not appear in the api resonse, but that could change - // in the future, warm pool should only dealt with after the training job finishes. - if trainingJobStatus != nil && *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { - return nil - } if svccommon.IsModifyingStatus(r.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) { return requeueWaitWhileWarmPoolInUse } return nil + } diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 0817e995..86505db6 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -582,8 +582,7 @@ func (rm *resourceManager) sdkFind( } rm.setStatusDefaults(ko) - rm.customSetOutput(&resource{ko}) - wp_err := rm.customSetWarmPoolOutput(&resource{ko}) + wp_err := rm.customSetOutput(&resource{ko}) if wp_err != nil { return &resource{ko}, wp_err } diff --git a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_available.json b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_available.json index d7e0d1f9..2bf332c1 100644 --- a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_available.json +++ b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_available.json @@ -1,6 +1,5 @@ { "AlgorithmSpecification": { - "AlgorithmName": null, "EnableSageMakerMetricsTimeSeries": false, "MetricDefinitions": [ { 
@@ -79,20 +78,10 @@ "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", "TrainingInputMode": "File" }, - "AutoMLJobArn": null, - "BillableTimeInSeconds": null, - "CheckpointConfig": null, "CreationTime": "2021-10-12T05:49:40.493Z", - "DebugHookConfig": null, - "DebugRuleConfigurations": null, - "DebugRuleEvaluationStatuses": null, "EnableInterContainerTrafficEncryption": false, "EnableManagedSpotTraining": false, "EnableNetworkIsolation": false, - "Environment": null, - "ExperimentConfig": null, - "FailureReason": null, - "FinalMetricDataList": null, "HyperParameters": { "eta": "0.2", "gamma": "4", @@ -109,51 +98,37 @@ "CompressionType": "None", "ContentType": "text/csv", "DataSource": { - "FileSystemDataSource": null, "S3DataSource": { - "AttributeNames": null, "S3DataDistributionType": "FullyReplicated", "S3DataType": "S3Prefix", "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train" } }, - "InputMode": null, - "RecordWrapperType": "None", - "ShuffleConfig": null + "RecordWrapperType": "None" }, { "ChannelName": "validation", "CompressionType": "None", "ContentType": "text/csv", "DataSource": { - "FileSystemDataSource": null, "S3DataSource": { - "AttributeNames": null, "S3DataDistributionType": "FullyReplicated", "S3DataType": "S3Prefix", "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation" } }, - "InputMode": null, - "RecordWrapperType": "None", - "ShuffleConfig": null + "RecordWrapperType": "None" } ], - "LabelingJobArn": null, "LastModifiedTime": "2021-10-12T05:52:46.108Z", - "ModelArtifacts": null, "OutputDataConfig": { "KmsKeyId": "", "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output" }, - "ProfilerConfig": null, - "ProfilerRuleConfigurations": null, - "ProfilerRuleEvaluationStatuses": null, "ProfilingStatus": "Disabled", "ResourceConfig": { "InstanceCount": 1, "InstanceType": "ml.m4.xlarge", - "VolumeKmsKeyId": null, 
"VolumeSizeInGB": 5 }, "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker", @@ -166,26 +141,20 @@ "StatusMessage": "Preparing the instances for training" }, { - "EndTime": null, "StartTime": "2021-10-12T05:52:46.108Z", "Status": "Downloading", "StatusMessage": "Downloading input data" } ], "StoppingCondition": { - "MaxRuntimeInSeconds": 86400, - "MaxWaitTimeInSeconds": null + "MaxRuntimeInSeconds": 86400 }, "WarmPoolStatus": { "Status":"Available" }, - "TensorBoardOutputConfig": null, - "TrainingEndTime": null, "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job", "TrainingJobName": "xgboost-training-job", "TrainingJobStatus": "Completed", "TrainingStartTime": "2021-10-12T05:52:46.108Z", - "TrainingTimeInSeconds": 31, - "TuningJobArn": null, - "VpcConfig": null + "TrainingTimeInSeconds": 31 } \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_inuse.json b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_inuse.json new file mode 100644 index 00000000..7ba81289 --- /dev/null +++ b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_inuse.json @@ -0,0 +1,160 @@ +{ + "AlgorithmSpecification": { + "EnableSageMakerMetricsTimeSeries": false, + "MetricDefinitions": [ + { + "Name": "train:mae", + "Regex": ".*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:auc", + "Regex": ".*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:merror", + "Regex": ".*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:auc", + "Regex": ".*\\[[0-9]+\\].*#011train-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mae", + "Regex": ".*\\[[0-9]+\\].*#011validation-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": 
"validation:error", + "Regex": ".*\\[[0-9]+\\].*#011validation-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:merror", + "Regex": ".*\\[[0-9]+\\].*#011validation-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:logloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:rmse", + "Regex": ".*\\[[0-9]+\\].*#011train-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:logloss", + "Regex": ".*\\[[0-9]+\\].*#011train-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011train-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:rmse", + "Regex": ".*\\[[0-9]+\\].*#011validation-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011validation-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:error", + "Regex": ".*\\[[0-9]+\\].*#011train-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011train-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:map", + "Regex": ".*\\[[0-9]+\\].*#011train-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:map", + "Regex": ".*\\[[0-9]+\\].*#011validation-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + } + ], + "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "TrainingInputMode": "File" + }, + "CreationTime": "2021-10-12T05:49:40.493Z", + "EnableInterContainerTrafficEncryption": false, + "EnableManagedSpotTraining": false, + "EnableNetworkIsolation": false, + "HyperParameters": { + 
"eta": "0.2", + "gamma": "4", + "max_depth": "5", + "min_child_weight": "6", + "num_class": "10", + "num_round": "10", + "objective": "multi:softmax", + "silent": "0" + }, + "InputDataConfig": [ + { + "ChannelName": "train", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "S3DataSource": { + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train" + } + }, + "RecordWrapperType": "None" + }, + { + "ChannelName": "validation", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "S3DataSource": { + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation" + } + }, + "RecordWrapperType": "None" + } + ], + "LastModifiedTime": "2021-10-12T05:52:46.108Z", + "OutputDataConfig": { + "KmsKeyId": "", + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output" + }, + "ProfilingStatus": "Disabled", + "ResourceConfig": { + "InstanceCount": 1, + "InstanceType": "ml.m4.xlarge", + "VolumeSizeInGB": 5 + }, + "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker", + "SecondaryStatus": "Starting", + "SecondaryStatusTransitions": [ + { + "EndTime": "2021-10-12T05:52:46.108Z", + "StartTime": "2021-10-12T05:49:40.493Z", + "Status": "Starting", + "StatusMessage": "Preparing the instances for training" + }, + { + "StartTime": "2021-10-12T05:52:46.108Z", + "Status": "Downloading", + "StatusMessage": "Downloading input data" + } + ], + "StoppingCondition": { + "MaxRuntimeInSeconds": 86400 + }, + "WarmPoolStatus": { + "Status": "InUse" + }, + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job", + "TrainingJobName": "xgboost-training-job", + "TrainingJobStatus": "InProgress", + "TrainingStartTime": "2021-10-12T05:52:46.108Z", + 
"TrainingTimeInSeconds": 31 +} \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_reused.json b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_reused.json new file mode 100644 index 00000000..b3adc836 --- /dev/null +++ b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_reused.json @@ -0,0 +1,162 @@ +{ + "AlgorithmSpecification": { + "EnableSageMakerMetricsTimeSeries": false, + "MetricDefinitions": [ + { + "Name": "train:mae", + "Regex": ".*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:auc", + "Regex": ".*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:merror", + "Regex": ".*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:auc", + "Regex": ".*\\[[0-9]+\\].*#011train-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mae", + "Regex": ".*\\[[0-9]+\\].*#011validation-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:error", + "Regex": ".*\\[[0-9]+\\].*#011validation-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:merror", + "Regex": ".*\\[[0-9]+\\].*#011validation-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:logloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:rmse", + "Regex": ".*\\[[0-9]+\\].*#011train-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:logloss", + "Regex": ".*\\[[0-9]+\\].*#011train-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011train-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:rmse", + "Regex": 
".*\\[[0-9]+\\].*#011validation-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011validation-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:error", + "Regex": ".*\\[[0-9]+\\].*#011train-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011train-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:map", + "Regex": ".*\\[[0-9]+\\].*#011train-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:map", + "Regex": ".*\\[[0-9]+\\].*#011validation-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + } + ], + "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "TrainingInputMode": "File" + }, + "CreationTime": "2021-10-12T05:49:40.493Z", + "EnableInterContainerTrafficEncryption": false, + "EnableManagedSpotTraining": false, + "EnableNetworkIsolation": false, + "HyperParameters": { + "eta": "0.2", + "gamma": "4", + "max_depth": "5", + "min_child_weight": "6", + "num_class": "10", + "num_round": "10", + "objective": "multi:softmax", + "silent": "0" + }, + "InputDataConfig": [ + { + "ChannelName": "train", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "S3DataSource": { + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train" + } + }, + "RecordWrapperType": "None" + }, + { + "ChannelName": "validation", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "S3DataSource": { + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation" + } + 
}, + "RecordWrapperType": "None" + } + ], + "LastModifiedTime": "2021-10-12T05:52:46.108Z", + "OutputDataConfig": { + "KmsKeyId": "", + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output" + }, + "ProfilingStatus": "Disabled", + "ResourceConfig": { + "InstanceCount": 1, + "InstanceType": "ml.m4.xlarge", + "VolumeSizeInGB": 5 + }, + "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker", + "SecondaryStatus": "Completed", + "SecondaryStatusTransitions": [ + { + "EndTime": "2021-10-12T05:52:46.108Z", + "StartTime": "2021-10-12T05:49:40.493Z", + "Status": "Starting", + "StatusMessage": "Preparing the instances for training" + }, + { + "StartTime": "2021-10-12T05:52:46.108Z", + "Status": "Downloading", + "StatusMessage": "Downloading input data" + } + ], + "StoppingCondition": { + "MaxRuntimeInSeconds": 86400 + }, + "WarmPoolStatus": { + "Status":"Reused", + "ResourceRetainedBillableTimeInSeconds": 69, + "ReusedByJob":"Trainingjob-ccsjjbdsjhhcsvdj" + }, + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job", + "TrainingJobName": "xgboost-training-job", + "TrainingJobStatus": "Completed", + "TrainingStartTime": "2021-10-12T05:52:46.108Z", + "TrainingTimeInSeconds": 31 +} \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_terminated.json b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_terminated.json index a9da8b31..d6dc7e5e 100644 --- a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_terminated.json +++ b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_terminated.json @@ -1,6 +1,5 @@ { "AlgorithmSpecification": { - "AlgorithmName": null, "EnableSageMakerMetricsTimeSeries": false, "MetricDefinitions": [ { @@ -79,20 +78,10 @@ "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", "TrainingInputMode": "File" }, - "AutoMLJobArn": null, - 
"BillableTimeInSeconds": null, - "CheckpointConfig": null, "CreationTime": "2021-10-12T05:49:40.493Z", - "DebugHookConfig": null, - "DebugRuleConfigurations": null, - "DebugRuleEvaluationStatuses": null, "EnableInterContainerTrafficEncryption": false, "EnableManagedSpotTraining": false, "EnableNetworkIsolation": false, - "Environment": null, - "ExperimentConfig": null, - "FailureReason": null, - "FinalMetricDataList": null, "HyperParameters": { "eta": "0.2", "gamma": "4", @@ -109,51 +98,37 @@ "CompressionType": "None", "ContentType": "text/csv", "DataSource": { - "FileSystemDataSource": null, "S3DataSource": { - "AttributeNames": null, "S3DataDistributionType": "FullyReplicated", "S3DataType": "S3Prefix", "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train" } }, - "InputMode": null, - "RecordWrapperType": "None", - "ShuffleConfig": null + "RecordWrapperType": "None" }, { "ChannelName": "validation", "CompressionType": "None", "ContentType": "text/csv", "DataSource": { - "FileSystemDataSource": null, "S3DataSource": { - "AttributeNames": null, "S3DataDistributionType": "FullyReplicated", "S3DataType": "S3Prefix", "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation" } }, - "InputMode": null, - "RecordWrapperType": "None", - "ShuffleConfig": null + "RecordWrapperType": "None" } ], - "LabelingJobArn": null, "LastModifiedTime": "2021-10-12T05:52:46.108Z", - "ModelArtifacts": null, "OutputDataConfig": { "KmsKeyId": "", "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output" }, - "ProfilerConfig": null, - "ProfilerRuleConfigurations": null, - "ProfilerRuleEvaluationStatuses": null, "ProfilingStatus": "Disabled", "ResourceConfig": { "InstanceCount": 1, "InstanceType": "ml.m4.xlarge", - "VolumeKmsKeyId": null, "VolumeSizeInGB": 5 }, "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker", @@ -166,27 +141,21 @@ "StatusMessage": "Preparing the instances for 
training" }, { - "EndTime": null, "StartTime": "2021-10-12T05:52:46.108Z", "Status": "Downloading", "StatusMessage": "Downloading input data" } ], "StoppingCondition": { - "MaxRuntimeInSeconds": 86400, - "MaxWaitTimeInSeconds": null + "MaxRuntimeInSeconds": 86400 }, "WarmPoolStatus": { "Status":"Terminated", "ResourceRetainedBillableTimeInSeconds": 69 }, - "TensorBoardOutputConfig": null, - "TrainingEndTime": null, "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job", "TrainingJobName": "xgboost-training-job", "TrainingJobStatus": "Completed", "TrainingStartTime": "2021-10-12T05:52:46.108Z", - "TrainingTimeInSeconds": 31, - "TuningJobArn": null, - "VpcConfig": null + "TrainingTimeInSeconds": 31 } \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index 87e63547..da6b4dd3 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -145,7 +145,7 @@ tests: - name: "ReadOne=WarmPool" description: "Test readOne after training job finishes when user specifies a warm pool" given: - desired_state: "v1alpha1/readone/desired/warmpool_available.yaml" + desired_state: "v1alpha1/readone/desired/warmpool.yaml" svc_api: - operation: DescribeTrainingJobWithContext output_fixture: "sdkapi/describe/warmpool_describe_available.json" @@ -155,13 +155,33 @@ tests: - name: "ReadOne=WarmPoolTerminated" description: "Test readOne after training job and warm pool complete/terminate." 
given: - desired_state: "v1alpha1/readone/desired/warmpool_terminated.yaml" + desired_state: "v1alpha1/readone/desired/warmpool.yaml" svc_api: - operation: DescribeTrainingJobWithContext output_fixture: "sdkapi/describe/warmpool_describe_terminated.json" invoke: ReadOne expect: latest_state: "v1alpha1/readone/observed/warmpool_terminated.yaml" + - name: "ReadOne=WarmPoolReused" + description: "Test Warmpool reused state" + given: + desired_state: "v1alpha1/readone/desired/warmpool.yaml" + svc_api: + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/describe/warmpool_describe_reused.json" + invoke: ReadOne + expect: + latest_state: "v1alpha1/readone/observed/warmpool_reused.yaml" + - name: "ReadOne=WarmPoolInUse" + description: "Test Warmpool inuse state" + given: + desired_state: "v1alpha1/readone/desired/warmpool.yaml" + svc_api: + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/describe/warmpool_describe_inuse.json" + invoke: ReadOne + expect: + latest_state: "v1alpha1/readone/observed/warmpool_inuse.yaml" - name: "Training job update tests" description: "Testing the Update operation" scenarios: diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_available.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool.yaml similarity index 100% rename from pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_available.yaml rename to pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool.yaml diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_terminated.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_terminated.yaml deleted file mode 100644 index bbf82578..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool_terminated.yaml +++ /dev/null @@ -1,59 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - 
name: unit-testing-training-job -spec: - algorithmSpecification: - trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - outputDataConfig: - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - keepAliveInSeconds: 70 - roleARN: arn:aws:iam::123456789012:role/ack-sagemaker-execution-role - stoppingCondition: - maxRuntimeInSeconds: 86400 - tags: - - key: algorithm - value: xgboost - - key: environment - value: testing - - key: customer - value: test-user - trainingJobName: xgboost-training-job -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job - ownerAccountID: "" - region: "" - conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml index b0081201..f8a0db77 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml @@ -106,11 +106,10 @@ status: status: "True" type: ACK.ResourceSynced - 
lastTransitionTime: "0001-01-01T00:00:00Z" - message: Warm Pool Cluster is still active. + message: Provisioned infrastructure is still being retained. status: "True" type: ACK.Recoverable warmPoolStatus: status: Available secondaryStatus: Completed - trainingJobStatus: Completed - + trainingJobStatus: Completed \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml new file mode 100644 index 00000000..619c0d3e --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml @@ -0,0 +1,111 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: unit-testing-training-job +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: 
.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAliveInSeconds: 70 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + tags: + - key: algorithm + value: xgboost + - key: environment + value: testing + - key: customer + value: test-user + trainingJobName: xgboost-training-job +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. + status: "False" + type: ACK.ResourceSynced + warmPoolStatus: + status: InUse + secondaryStatus: Starting + trainingJobStatus: InProgress \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml new file mode 100644 index 00000000..fd2b8590 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml @@ -0,0 +1,113 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: unit-testing-training-job +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: 
.*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + 
dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAliveInSeconds: 70 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + tags: + - key: algorithm + value: xgboost + - key: environment + value: testing + - key: customer + value: test-user + trainingJobName: xgboost-training-job +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in Completed status. 
+ status: "True" + type: ACK.ResourceSynced + warmPoolStatus: + status: Reused + reusedByJob: Trainingjob-ccsjjbdsjhhcsvdj + resourceRetainedBillableTimeInSeconds: 69 + secondaryStatus: Completed + trainingJobStatus: Completed \ No newline at end of file diff --git a/templates/training_job/sdk_read_one_post_set_output.go.tpl b/templates/training_job/sdk_read_one_post_set_output.go.tpl index 0467b2b7..f687f275 100644 --- a/templates/training_job/sdk_read_one_post_set_output.go.tpl +++ b/templates/training_job/sdk_read_one_post_set_output.go.tpl @@ -1,5 +1,4 @@ -rm.customSetOutput(&resource{ko}) -wp_err := rm.customSetWarmPoolOutput(&resource{ko}) +wp_err := rm.customSetOutput(&resource{ko}) if wp_err != nil{ return &resource{ko}, wp_err } \ No newline at end of file From ce4bf7823e87948fe93d309f888fb58063666f48 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 28 Oct 2022 21:22:29 +0000 Subject: [PATCH 08/71] added unit test --- .../testdata/test_suite.yaml | 10 +++ .../desired/warmpool_attempt_single.yaml | 76 +++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 pkg/resource/hyper_parameter_tuning_job/testdata/v1alpha1/readone/desired/warmpool_attempt_single.yaml diff --git a/pkg/resource/hyper_parameter_tuning_job/testdata/test_suite.yaml b/pkg/resource/hyper_parameter_tuning_job/testdata/test_suite.yaml index dfb61e2d..bd7a5586 100644 --- a/pkg/resource/hyper_parameter_tuning_job/testdata/test_suite.yaml +++ b/pkg/resource/hyper_parameter_tuning_job/testdata/test_suite.yaml @@ -153,6 +153,16 @@ invoke: ReadOne expect: latest_state: "v1alpha1/readone/observed/stopping.yaml" + - name: "ReadOne=Warmpool" + description: "Testing warm pool custom code" + given: + desired_state: "v1alpha1/readone/desired/warmpool_attempt_single.yaml" + svc_api: + - operation: DescribeHyperParameterTuningJobWithContext + output_fixture: "sdkapi/describe/inprogress_describe.json" + invoke: ReadOne + expect: + latest_state: "v1alpha1/readone/observed/inprogress.yaml" 
- name: "HyperParameter tuning job update tests" description: "Testing the Update operation" scenarios: diff --git a/pkg/resource/hyper_parameter_tuning_job/testdata/v1alpha1/readone/desired/warmpool_attempt_single.yaml b/pkg/resource/hyper_parameter_tuning_job/testdata/v1alpha1/readone/desired/warmpool_attempt_single.yaml new file mode 100644 index 00000000..a41cd9b2 --- /dev/null +++ b/pkg/resource/hyper_parameter_tuning_job/testdata/v1alpha1/readone/desired/warmpool_attempt_single.yaml @@ -0,0 +1,76 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: HyperParameterTuningJob +metadata: + name: unit-testing-hyper-parameter-tuning-job +spec: + hyperParameterTuningJobName: unit-testing-hpo-job + hyperParameterTuningJobConfig: + strategy: Bayesian + hyperParameterTuningJobObjective: + type_: Minimize + metricName: validation:error + resourceLimits: + maxNumberOfTrainingJobs: 2 + maxParallelTrainingJobs: 1 + parameterRanges: + integerParameterRanges: + - name: num_round + minValue: '10' + maxValue: '20' + scalingType: Linear + continuousParameterRanges: + - name: gamma + minValue: '0' + maxValue: '5' + scalingType: Linear + categoricalParameterRanges: + - name: category + values: + - test + trainingJobEarlyStoppingType: Auto + trainingJobDefinition: + staticHyperParameters: + base_score: '0.5' + algorithmSpecification: + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/ack-sagemaker-execution-role + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + s3DataDistributionType: FullyReplicated + contentType: text/csv + compressionType: None + recordWrapperType: None + inputMode: File + - channelName: validation + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation/ + 
s3DataDistributionType: FullyReplicated + contentType: text/csv + compressionType: None + recordWrapperType: None + inputMode: File + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/hpo/output + resourceConfig: + instanceType: ml.m4.xlarge + instanceCount: 1 + volumeSizeInGB: 25 + keepAlivePeriodInSeconds: 10 + stoppingCondition: + maxRuntimeInSeconds: 3600 + enableNetworkIsolation: true + enableInterContainerTrafficEncryption: false + tags: + - key: algorithm + value: xgboost + - key: environment + value: testing + - key: customer + value: test-user From 3f9b43cfa9a4a4293553bcd035d7315465947ce5 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Thu, 10 Nov 2022 17:47:20 +0000 Subject: [PATCH 09/71] fix: use conditions instead of requeue error --- apis/v1alpha1/ack-generate-metadata.yaml | 4 ++-- apis/v1alpha1/generator.yaml | 2 +- generator.yaml | 2 +- pkg/resource/training_job/hooks.go | 23 +++++++------------ pkg/resource/training_job/sdk.go | 5 +--- .../readone/observed/warmpool_available.yaml | 8 ++----- .../sdk_read_one_post_set_output.go.tpl | 4 ---- 7 files changed, 15 insertions(+), 33 deletions(-) delete mode 100644 templates/training_job/sdk_read_one_post_set_output.go.tpl diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 9bc022d8..4e625db6 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-10-27T21:59:13Z" + build_date: "2022-11-10T17:23:42Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc @@ -7,7 +7,7 @@ api_directory_checksum: afd89795e3cceb09d028722ecbd9512b6197eb6a api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: 53eb44592068bb0332abf7fa99b385f9b560ff32 + file_checksum: 7fb39ef5f630ff8286ff19472f423304aaa1c69c original_file_name: generator.yaml 
last_modification: reason: API generation diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index b4933566..d31c683d 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -127,7 +127,7 @@ resources: delta_pre_compare: code: customSetDefaults(a, b) sdk_read_one_post_set_output: - template_path: training_job/sdk_read_one_post_set_output.go.tpl + code: rm.customSetOutput(&resource{ko}) sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: diff --git a/generator.yaml b/generator.yaml index b4933566..d31c683d 100644 --- a/generator.yaml +++ b/generator.yaml @@ -127,7 +127,7 @@ resources: delta_pre_compare: code: customSetDefaults(a, b) sdk_read_one_post_set_output: - template_path: training_job/sdk_read_one_post_set_output.go.tpl + code: rm.customSetOutput(&resource{ko}) sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index f9428e85..0ffb2906 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -42,47 +42,40 @@ var ( errors.New(resourceName+" is Stopping."), ackrequeue.DefaultRequeueAfterDuration, ) - - requeueWaitWhileWarmPoolInUse = ackrequeue.NeededAfter( - errors.New("Provisioned infrastructure is still being retained."), - ackrequeue.DefaultRequeueAfterDuration, - ) ) // customSetOutput sets the resource ResourceSynced condition to False if // TrainingJob is being modified by AWS. 
It checks for debug and profiler rule status in addition to TrainingJobStatus -func (rm *resourceManager) customSetOutput(r *resource) error { +func (rm *resourceManager) customSetOutput(r *resource) { trainingJobStatus := r.ko.Status.TrainingJobStatus // early exit if training job is InProgress if trainingJobStatus != nil && *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { svccommon.SetSyncedCondition(r, trainingJobStatus, &resourceName, &trainingJobModifyingStatuses) - return nil + return } for _, rule := range r.ko.Status.DebugRuleEvaluationStatuses { if rule.RuleEvaluationStatus != nil && svccommon.IsModifyingStatus(rule.RuleEvaluationStatus, &ruleModifyingStatuses) { svccommon.SetSyncedCondition(r, rule.RuleEvaluationStatus, aws.String("DebugRule"), &ruleModifyingStatuses) - return nil + return } } for _, rule := range r.ko.Status.ProfilerRuleEvaluationStatuses { if rule.RuleEvaluationStatus != nil && svccommon.IsModifyingStatus(rule.RuleEvaluationStatus, &ruleModifyingStatuses) { svccommon.SetSyncedCondition(r, rule.RuleEvaluationStatus, aws.String("ProfilerRule"), &ruleModifyingStatuses) - return nil + return } } svccommon.SetSyncedCondition(r, trainingJobStatus, &resourceName, &trainingJobModifyingStatuses) - // Requeue whenever Warmpool cluster is in Available or Inuse state. 
- if ackcompare.IsNil(r.ko.Status.WarmPoolStatus) { - return nil + if ackcompare.IsNil(r.ko.Status.WarmPoolStatus) || ackcompare.IsNil(r.ko.Status.WarmPoolStatus.Status) { + return } - + // Set synced condition to False if Warm Pool is in Inuse or Available state if svccommon.IsModifyingStatus(r.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) { - return requeueWaitWhileWarmPoolInUse + svccommon.SetSyncedCondition(r, r.ko.Status.WarmPoolStatus.Status, aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) } - return nil } diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 86505db6..b00ecbfc 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -582,10 +582,7 @@ func (rm *resourceManager) sdkFind( } rm.setStatusDefaults(ko) - wp_err := rm.customSetOutput(&resource{ko}) - if wp_err != nil { - return &resource{ko}, wp_err - } + rm.customSetOutput(&resource{ko}) return &resource{ko}, nil } diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml index f8a0db77..4ea80b89 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml @@ -102,13 +102,9 @@ status: region: "" conditions: - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in Completed status. - status: "True" + message: Warm Pool Infrastructure is in Available status. + status: "False" type: ACK.ResourceSynced - - lastTransitionTime: "0001-01-01T00:00:00Z" - message: Provisioned infrastructure is still being retained. 
- status: "True" - type: ACK.Recoverable warmPoolStatus: status: Available secondaryStatus: Completed diff --git a/templates/training_job/sdk_read_one_post_set_output.go.tpl b/templates/training_job/sdk_read_one_post_set_output.go.tpl deleted file mode 100644 index f687f275..00000000 --- a/templates/training_job/sdk_read_one_post_set_output.go.tpl +++ /dev/null @@ -1,4 +0,0 @@ -wp_err := rm.customSetOutput(&resource{ko}) -if wp_err != nil{ - return &resource{ko}, wp_err -} \ No newline at end of file From ecb27ef0702113bb4b7b696d201a8810abd38a42 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 23 Nov 2022 22:05:12 +0000 Subject: [PATCH 10/71] unit test fixes --- apis/v1alpha1/ack-generate-metadata.yaml | 2 +- pkg/resource/training_job/hooks.go | 22 ++++++++++++++----- .../describe/warmpool_describe_available.json | 3 ++- .../describe/warmpool_describe_inuse.json | 3 ++- .../describe/warmpool_describe_reused.json | 3 ++- .../warmpool_describe_terminated.json | 3 ++- .../v1alpha1/readone/desired/warmpool.yaml | 2 +- .../readone/observed/warmpool_available.yaml | 2 +- .../readone/observed/warmpool_inuse.yaml | 2 +- .../readone/observed/warmpool_reused.yaml | 2 +- .../readone/observed/warmpool_terminated.yaml | 2 +- 11 files changed, 30 insertions(+), 16 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 4e625db6..25193da2 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-11-10T17:23:42Z" + build_date: "2022-11-23T21:48:31Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 0ffb2906..9fbff6bb 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -70,12 +70,22 @@ func (rm *resourceManager) customSetOutput(r *resource) { 
svccommon.SetSyncedCondition(r, trainingJobStatus, &resourceName, &trainingJobModifyingStatuses) - if ackcompare.IsNil(r.ko.Status.WarmPoolStatus) || ackcompare.IsNil(r.ko.Status.WarmPoolStatus.Status) { - return - } - // Set synced condition to False if Warm Pool is in Inuse or Available state - if svccommon.IsModifyingStatus(r.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) { - svccommon.SetSyncedCondition(r, r.ko.Status.WarmPoolStatus.Status, aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) + warmpoolUsed := ackcompare.IsNotNil(r.ko.Spec.ResourceConfig) && ackcompare.IsNotNil(r.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) + + // Only requeue when warm pool is being used and when training job is in the completed state. + // WP will always have terminated status on error(Training Job or Warmpool). + if ackcompare.IsNotNil(trainingJobStatus) && *trainingJobStatus == svcsdk.TrainingJobStatusCompleted && + warmpoolUsed { + + // Sometimes DescribeTrainingJob does not contain the warm pool status + // In this condition the only possible status is Available or Terminated. 
+ if ackcompare.IsNotNil(trainingJobStatus) && ackcompare.IsNil(r.ko.Status.WarmPoolStatus) { + svccommon.SetSyncedCondition(r, aws.String("Available"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) + } + + if svccommon.IsModifyingStatus(r.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) { + svccommon.SetSyncedCondition(r, r.ko.Status.WarmPoolStatus.Status, aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) + } } } diff --git a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_available.json b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_available.json index 2bf332c1..088764ba 100644 --- a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_available.json +++ b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_available.json @@ -129,7 +129,8 @@ "ResourceConfig": { "InstanceCount": 1, "InstanceType": "ml.m4.xlarge", - "VolumeSizeInGB": 5 + "VolumeSizeInGB": 5, + "KeepAlivePeriodInSeconds": 70 }, "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker", "SecondaryStatus": "Completed", diff --git a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_inuse.json b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_inuse.json index 7ba81289..00ce3e05 100644 --- a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_inuse.json +++ b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_inuse.json @@ -129,7 +129,8 @@ "ResourceConfig": { "InstanceCount": 1, "InstanceType": "ml.m4.xlarge", - "VolumeSizeInGB": 5 + "VolumeSizeInGB": 5, + "KeepAlivePeriodInSeconds": 70 }, "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker", "SecondaryStatus": "Starting", diff --git a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_reused.json b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_reused.json index 
b3adc836..ff039990 100644 --- a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_reused.json +++ b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_reused.json @@ -129,7 +129,8 @@ "ResourceConfig": { "InstanceCount": 1, "InstanceType": "ml.m4.xlarge", - "VolumeSizeInGB": 5 + "VolumeSizeInGB": 5, + "KeepAlivePeriodInSeconds": 70 }, "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker", "SecondaryStatus": "Completed", diff --git a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_terminated.json b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_terminated.json index d6dc7e5e..04d477ba 100644 --- a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_terminated.json +++ b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_terminated.json @@ -129,7 +129,8 @@ "ResourceConfig": { "InstanceCount": 1, "InstanceType": "ml.m4.xlarge", - "VolumeSizeInGB": 5 + "VolumeSizeInGB": 5, + "KeepAlivePeriodInSeconds": 70 }, "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker", "SecondaryStatus": "Completed", diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool.yaml index bbf82578..dc306ab6 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/desired/warmpool.yaml @@ -39,7 +39,7 @@ spec: instanceCount: 1 instanceType: ml.m4.xlarge volumeSizeInGB: 5 - keepAliveInSeconds: 70 + keepAlivePeriodInSeconds: 70 roleARN: arn:aws:iam::123456789012:role/ack-sagemaker-execution-role stoppingCondition: maxRuntimeInSeconds: 86400 diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml index 4ea80b89..9658b7ab 100644 --- 
a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml @@ -83,7 +83,7 @@ spec: instanceCount: 1 instanceType: ml.m4.xlarge volumeSizeInGB: 5 - keepAliveInSeconds: 70 + keepAlivePeriodInSeconds: 70 roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker stoppingCondition: maxRuntimeInSeconds: 86400 diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml index 619c0d3e..d5bfe159 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml @@ -83,7 +83,7 @@ spec: instanceCount: 1 instanceType: ml.m4.xlarge volumeSizeInGB: 5 - keepAliveInSeconds: 70 + keepAlivePeriodInSeconds: 70 roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker stoppingCondition: maxRuntimeInSeconds: 86400 diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml index fd2b8590..115764af 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml @@ -83,7 +83,7 @@ spec: instanceCount: 1 instanceType: ml.m4.xlarge volumeSizeInGB: 5 - keepAliveInSeconds: 70 + keepAlivePeriodInSeconds: 70 roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker stoppingCondition: maxRuntimeInSeconds: 86400 diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml index 793576d8..99829bad 100644 --- 
a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml @@ -83,7 +83,7 @@ spec: instanceCount: 1 instanceType: ml.m4.xlarge volumeSizeInGB: 5 - keepAliveInSeconds: 70 + keepAlivePeriodInSeconds: 70 roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker stoppingCondition: maxRuntimeInSeconds: 86400 From 4b1b3793030ad71db903143a1f11e8989291af82 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 18 Nov 2022 17:35:48 +0000 Subject: [PATCH 11/71] Feature: Update TrainingJob --- apis/v1alpha1/ack-generate-metadata.yaml | 6 +- apis/v1alpha1/generator.yaml | 16 +- apis/v1alpha1/training_job.go | 3 + apis/v1alpha1/zz_generated.deepcopy.go | 5 + ...gemaker.services.k8s.aws_trainingjobs.yaml | 3 + generator.yaml | 16 +- ...gemaker.services.k8s.aws_trainingjobs.yaml | 3 + pkg/resource/training_job/custom_delta.go | 53 ++++++ .../training_job/custom_set_update_input.go | 125 +++++++++++++++ .../training_job/custom_update_conditions.go | 53 ++++++ pkg/resource/training_job/delta.go | 1 + pkg/resource/training_job/hooks.go | 93 +++++++++++ pkg/resource/training_job/sdk.go | 151 +++++++++++++++++- .../training_job/testdata/test_suite.yaml | 12 -- .../v1alpha1/readone/observed/completed.yaml | 1 + .../completed_debugger_variation.yaml | 3 +- .../observed/conditions_clear_on_success.yaml | 1 + .../v1alpha1/readone/observed/created.yaml | 1 + .../observed/created_debugger_variation.yaml | 1 + .../readone/observed/late_initialize.yaml | 1 + .../v1alpha1/readone/observed/stopping.yaml | 1 + .../observed/stopping_debugger_variation.yaml | 1 + .../readone/observed/warmpool_available.yaml | 3 +- .../readone/observed/warmpool_inuse.yaml | 3 +- .../readone/observed/warmpool_reused.yaml | 3 +- .../readone/observed/warmpool_terminated.yaml | 3 +- .../sdk_update_post_build_request.go.tpl | 31 ++++ 27 files changed, 565 insertions(+), 28 deletions(-) create 
mode 100644 pkg/resource/training_job/custom_set_update_input.go create mode 100644 pkg/resource/training_job/custom_update_conditions.go create mode 100644 templates/training_job/sdk_update_post_build_request.go.tpl diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 25193da2..1bb00153 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,13 +1,13 @@ ack_generate_info: - build_date: "2022-11-23T21:48:31Z" + build_date: "2022-11-18T16:40:15Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc -api_directory_checksum: afd89795e3cceb09d028722ecbd9512b6197eb6a +api_directory_checksum: a11209e81e188afecb6812ec3080cead7ce995b1 api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: 7fb39ef5f630ff8286ff19472f423304aaa1c69c + file_checksum: 9678da88ff1b4061cfae104e44116cf0d87ebe09 original_file_name: generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index d31c683d..eacb3dce 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -123,11 +123,18 @@ resources: - InvalidParameterCombination - InvalidParameterValue - MissingParameter + update_conditions_custom_method_name: CustomUpdateConditions hooks: delta_pre_compare: code: customSetDefaults(a, b) + delta_post_compare: + code: customPostCompare(b, a, delta) sdk_read_one_post_set_output: code: rm.customSetOutput(&resource{ko}) + sdk_update_post_build_request: + template_path: training_job/sdk_update_post_build_request.go.tpl + sdk_update_post_set_output: + code: rm.customSetOutput(&resource{ko}) sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: @@ -175,6 +182,11 @@ resources: from: operation: DescribeTrainingJob path: WarmPoolStatus + ProfilingStatus: + 
is_read_only: true + from: + operation: DescribeTrainingJob + path: ProfilingStatus AlgorithmSpecification.MetricDefinitions: compare: is_ignored: true @@ -879,7 +891,6 @@ ignore: - Workforce - Workteam operations: - - UpdateTrainingJob - UpdateFeatureGroup shape_names: # RSessionAppSettings is an empty struct that causes generation errors @@ -894,4 +905,5 @@ ignore: - InstanceGroupNames - CanvasAppSettings - ExplainerConfig - - HyperParameterTuningJobStrategyConfig \ No newline at end of file + - HyperParameterTuningJobStrategyConfig + - DisableProfiler \ No newline at end of file diff --git a/apis/v1alpha1/training_job.go b/apis/v1alpha1/training_job.go index cf349c11..31b3a351 100644 --- a/apis/v1alpha1/training_job.go +++ b/apis/v1alpha1/training_job.go @@ -184,6 +184,9 @@ type TrainingJobStatus struct { // Evaluation status of Debugger rules for profiling on a training job. // +kubebuilder:validation:Optional ProfilerRuleEvaluationStatuses []*ProfilerRuleEvaluationStatus `json:"profilerRuleEvaluationStatuses,omitempty"` + // Profiling status of a training job. + // +kubebuilder:validation:Optional + ProfilingStatus *string `json:"profilingStatus,omitempty"` // Provides detailed information about the state of the training job. For detailed // information on the secondary status of the training job, see StatusMessage // under SecondaryStatusTransition. 
diff --git a/apis/v1alpha1/zz_generated.deepcopy.go b/apis/v1alpha1/zz_generated.deepcopy.go index 8b272a64..df2c8444 100644 --- a/apis/v1alpha1/zz_generated.deepcopy.go +++ b/apis/v1alpha1/zz_generated.deepcopy.go @@ -12748,6 +12748,11 @@ func (in *TrainingJobStatus) DeepCopyInto(out *TrainingJobStatus) { } } } + if in.ProfilingStatus != nil { + in, out := &in.ProfilingStatus, &out.ProfilingStatus + *out = new(string) + **out = **in + } if in.SecondaryStatus != nil { in, out := &in.SecondaryStatus, &out.SecondaryStatus *out = new(string) diff --git a/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml b/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml index 6f60c60d..d0f4ca28 100644 --- a/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -618,6 +618,9 @@ spec: type: string type: object type: array + profilingStatus: + description: Profiling status of a training job. + type: string secondaryStatus: description: "Provides detailed information about the state of the training job. 
For detailed information on the secondary status of diff --git a/generator.yaml b/generator.yaml index d31c683d..eacb3dce 100644 --- a/generator.yaml +++ b/generator.yaml @@ -123,11 +123,18 @@ resources: - InvalidParameterCombination - InvalidParameterValue - MissingParameter + update_conditions_custom_method_name: CustomUpdateConditions hooks: delta_pre_compare: code: customSetDefaults(a, b) + delta_post_compare: + code: customPostCompare(b, a, delta) sdk_read_one_post_set_output: code: rm.customSetOutput(&resource{ko}) + sdk_update_post_build_request: + template_path: training_job/sdk_update_post_build_request.go.tpl + sdk_update_post_set_output: + code: rm.customSetOutput(&resource{ko}) sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: @@ -175,6 +182,11 @@ resources: from: operation: DescribeTrainingJob path: WarmPoolStatus + ProfilingStatus: + is_read_only: true + from: + operation: DescribeTrainingJob + path: ProfilingStatus AlgorithmSpecification.MetricDefinitions: compare: is_ignored: true @@ -879,7 +891,6 @@ ignore: - Workforce - Workteam operations: - - UpdateTrainingJob - UpdateFeatureGroup shape_names: # RSessionAppSettings is an empty struct that causes generation errors @@ -894,4 +905,5 @@ ignore: - InstanceGroupNames - CanvasAppSettings - ExplainerConfig - - HyperParameterTuningJobStrategyConfig \ No newline at end of file + - HyperParameterTuningJobStrategyConfig + - DisableProfiler \ No newline at end of file diff --git a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml index 6f60c60d..d0f4ca28 100644 --- a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -618,6 +618,9 @@ spec: type: string type: object type: array + profilingStatus: + description: Profiling status of a training job. 
+ type: string secondaryStatus: description: "Provides detailed information about the state of the training job. For detailed information on the secondary status of diff --git a/pkg/resource/training_job/custom_delta.go b/pkg/resource/training_job/custom_delta.go index 7653044c..a67c3490 100644 --- a/pkg/resource/training_job/custom_delta.go +++ b/pkg/resource/training_job/custom_delta.go @@ -29,6 +29,10 @@ func customSetDefaults( if ackcompare.IsNotNil(a.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(b.ko.Spec.ProfilerRuleConfigurations) { for index := range a.ko.Spec.ProfilerRuleConfigurations { + // Prevent out of bounds panics. + if index == len(a.ko.Spec.ProfilerRuleConfigurations) || index == len(b.ko.Spec.ProfilerRuleConfigurations) { + break + } if ackcompare.IsNil(a.ko.Spec.ProfilerRuleConfigurations[index].VolumeSizeInGB) && ackcompare.IsNotNil(b.ko.Spec.ProfilerRuleConfigurations[index].VolumeSizeInGB) { a.ko.Spec.ProfilerRuleConfigurations[index].VolumeSizeInGB = defaultVolumeSizeInGB } @@ -60,3 +64,52 @@ func customSetDefaults( } } } + +// SM returns profiler related objects even if the user disables the profiler +// this function detects if there is a diff +func customPostCompare(latest *resource, desired *resource, delta *ackcompare.Delta) { + profilerConfigDiff := delta.DifferentAt("Spec.ProfilerConfig") + profilerRuleDiff := delta.DifferentAt("Spec.ProfilerRuleConfigurations") + if !profilerConfigDiff && !profilerRuleDiff { + return + } + profilerStatus := latest.ko.Status.ProfilingStatus + profilerDisabled := false + + if ackcompare.IsNotNil(profilerStatus) { + //Do not remove profiler if user wants to enable it + if *profilerStatus == "Disabled" && !userInitiatesProfilerCheck(desired) { + profilerDisabled = true + } else { + return + } + } else { + return + } + // TODO: Replace remove delta with an ack version when its natively supported + if profilerConfigDiff && profilerDisabled { + removeDelta(delta, "Spec.ProfilerConfig") + } + if 
profilerRuleDiff { + removeDelta(delta, "Spec.ProfilerRuleConfigurations") + } +} + +func userInitiatesProfilerCheck(desired *resource) bool { + profilerConfigPresent := ackcompare.IsNotNil(desired.ko.Spec.ProfilerConfig) + profilerRuleConfigPresent := ackcompare.IsNotNil(desired.ko.Spec.ProfilerRuleConfigurations) + return profilerConfigPresent && profilerRuleConfigPresent +} + +// Removes fieldName from the delta slice. +// TODO: Replace when ack runtime can do this. +func removeDelta(delta *ackcompare.Delta, fieldName string) { + differences := delta.Differences + for index, diff := range differences { + if diff.Path.Contains(fieldName) { + differences = append(differences[:index], differences[index+1:]...) + delta.Differences = differences + return + } + } +} diff --git a/pkg/resource/training_job/custom_set_update_input.go b/pkg/resource/training_job/custom_set_update_input.go new file mode 100644 index 00000000..cede7eed --- /dev/null +++ b/pkg/resource/training_job/custom_set_update_input.go @@ -0,0 +1,125 @@ +// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. 
+ +// Use this file if conditions need to be updated based on the latest status +// of training job which is not evident from API response + +package training_job + +import ( + "errors" + + ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" + smv1alpha "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" + svcsdk "github.com/aws/aws-sdk-go/service/sagemaker" +) + +// Three conditions: +// 1. Customer updates both profiler parameters: Recreate the input for profiler Rule. +// 2. Customer only updates Profiler Config: Set the profiler rule configuration to nil to avoid validation error. +// 3. Customer only updates Rule Configurations: Recreate the input for profiler Rule and set Profiler config to nil. +// safer to do this because the "only add" behavior might reappear. + +func customSetUpdateInput(desired *resource, latest *resource, delta *ackcompare.Delta, input *svcsdk.UpdateTrainingJobInput) error { + if delta.DifferentAt("Spec.ProfilerConfig") && delta.DifferentAt("Spec.ProfilerRuleConfigurations") { + err := handleProfilerRuleConfig(desired, latest, input) + return err + } + if delta.DifferentAt("Spec.ProfilerConfig") && !delta.DifferentAt("Spec.ProfilerRuleConfigurations") { + input.SetProfilerRuleConfigurations(nil) + return nil + } + if delta.DifferentAt("Spec.ProfilerRuleConfigurations") && !delta.DifferentAt("Spec.ProfilerConfig") { + err := handleProfilerRuleConfig(desired, latest, input) + input.SetProfilerConfig(nil) // SM still assumes the profiler config is the same. + return err + } + return nil +} + +// Update training job is post operation wrt to the profiler parameters. +// Because of this only NEW rules can be specified. +// In this function we check to see if any new profiler configurstions have been added. 
+func handleProfilerRuleConfig(desired *resource, latest *resource, input *svcsdk.UpdateTrainingJobInput) error { + profilerRuleDesired := desired.ko.Spec.ProfilerRuleConfigurations + profilerRuleLatest := latest.ko.Spec.ProfilerRuleConfigurations + + if ackcompare.IsNil(profilerRuleDesired) { + return errors.New("[ACK_SM] Cannot remove a profiler rule.") + } + if ackcompare.IsNil(profilerRuleLatest) { + return nil + } + if len(profilerRuleDesired) < len(profilerRuleLatest) { + return errors.New("[ACK_SM] Cannot remove a profiler rule.") + } + + ruleMap := map[string]int{} + profilerRuleInput := []*svcsdk.ProfilerRuleConfiguration{} + for _, rule := range profilerRuleLatest { + if ackcompare.IsNotNil(rule) && ackcompare.IsNotNil(rule.RuleConfigurationName) { + ruleMap[*rule.RuleConfigurationName] = 1 + } + } + for _, rule := range profilerRuleDesired { + if ackcompare.IsNotNil(rule) && ackcompare.IsNotNil(rule.RuleConfigurationName) { + _, present := ruleMap[*rule.RuleConfigurationName] + if !present { + profilerRuleInput = append(profilerRuleInput, convertProfileRuleType(rule)) + } + } + } + input.SetProfilerRuleConfigurations(profilerRuleInput) + return nil +} + +// Recreates input and sets disable profiler to true +func handleProfilerRemoval(input *svcsdk.UpdateTrainingJobInput) { + input.SetProfilerRuleConfigurations(nil) + profilerConfig := svcsdk.ProfilerConfigForUpdate{} + profilerConfig.SetDisableProfiler(true) + input.SetProfilerConfig(&profilerConfig) +} + +// Sagemaker and kubernetes types are not the same so the input has to be reconstructed. 
+func convertProfileRuleType(rule *smv1alpha.ProfilerRuleConfiguration) *svcsdk.ProfilerRuleConfiguration { + smRule := &svcsdk.ProfilerRuleConfiguration{} + if rule.InstanceType != nil { + smRule.SetInstanceType(*rule.InstanceType) + } + if rule.LocalPath != nil { + smRule.SetLocalPath(*rule.LocalPath) + } + if rule.RuleConfigurationName != nil { + smRule.SetRuleConfigurationName(*rule.RuleConfigurationName) + } + if rule.RuleEvaluatorImage != nil { + smRule.SetRuleEvaluatorImage(*rule.RuleEvaluatorImage) + } + if rule.RuleParameters != nil { + f1elemf4 := map[string]*string{} + for f1elemf4key, f1elemf4valiter := range rule.RuleParameters { + var f1elemf4val string + f1elemf4val = *f1elemf4valiter + f1elemf4[f1elemf4key] = &f1elemf4val + } + smRule.SetRuleParameters(f1elemf4) + } + if rule.S3OutputPath != nil { + smRule.SetS3OutputPath(*rule.S3OutputPath) + } + if rule.VolumeSizeInGB != nil { + smRule.SetVolumeSizeInGB(*rule.VolumeSizeInGB) + } + return smRule +} diff --git a/pkg/resource/training_job/custom_update_conditions.go b/pkg/resource/training_job/custom_update_conditions.go new file mode 100644 index 00000000..c94cfcf0 --- /dev/null +++ b/pkg/resource/training_job/custom_update_conditions.go @@ -0,0 +1,53 @@ +// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. 
+ +// Use this file if conditions need to be updated based on the latest status +// of training job which is not evident from API response + +package training_job + +import ( + "strings" + + ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" + ackcondition "github.com/aws-controllers-k8s/runtime/pkg/condition" + svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" + corev1 "k8s.io/api/core/v1" +) + +var ( + terminalCode string = "[ACK_SM]" +) + +// If the controller runs into an error that contains "[ACK_SM]" +// it will set the resource to a terminal state because it is an unrecoverable error. +func (rm *resourceManager) CustomUpdateConditions( + ko *svcapitypes.TrainingJob, + r *resource, + err error, +) bool { + + if ackcompare.IsNil(err) { + return false + } + + if strings.Contains(err.Error(), terminalCode) { + conditionManager := &resource{ko} + exception := err.Error() + ackcondition.SetTerminal(conditionManager, corev1.ConditionTrue, &exception, nil) + return true + } + + return false + +} diff --git a/pkg/resource/training_job/delta.go b/pkg/resource/training_job/delta.go index 251c2246..bbe75e10 100644 --- a/pkg/resource/training_job/delta.go +++ b/pkg/resource/training_job/delta.go @@ -332,5 +332,6 @@ func newResourceDelta( } } + customPostCompare(b, a, delta) return delta } diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 9fbff6bb..a4e39980 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -36,12 +36,27 @@ var ( svcsdk.WarmPoolResourceStatusAvailable, svcsdk.WarmPoolResourceStatusInUse, } + TrainingJobTerminalProfiler = []string{ + svcsdk.TrainingJobStatusCompleted, + svcsdk.TrainingJobStatusFailed, + svcsdk.TrainingJobStatusStopping, + svcsdk.TrainingJobStatusStopped, + } resourceName = GroupKind.Kind requeueWaitWhileDeleting = ackrequeue.NeededAfter( errors.New(resourceName+" is Stopping."), 
ackrequeue.DefaultRequeueAfterDuration, ) + + requeueBeforeUpdate = ackrequeue.NeededAfter( + errors.New("Warm pool cannot be updated in InProgress state requeuing until TrainingJob reaches completed state."), + ackrequeue.DefaultRequeueAfterDuration, + ) + requeueBeforeUpdateStarting = ackrequeue.NeededAfter( + errors.New("Controller cannot update while secondary status is in Starting state."), + ackrequeue.DefaultRequeueAfterDuration, + ) ) // customSetOutput sets the resource ResourceSynced condition to False if @@ -62,6 +77,12 @@ func (rm *resourceManager) customSetOutput(r *resource) { } for _, rule := range r.ko.Status.ProfilerRuleEvaluationStatuses { + if ackcompare.IsNotNil(r.ko.Status.ProfilingStatus) { + // Sometimes rule evaluation status will stay in InProgress state. + if *r.ko.Status.ProfilingStatus == "Disabled" { + break + } + } if rule.RuleEvaluationStatus != nil && svccommon.IsModifyingStatus(rule.RuleEvaluationStatus, &ruleModifyingStatuses) { svccommon.SetSyncedCondition(r, rule.RuleEvaluationStatus, aws.String("ProfilerRule"), &ruleModifyingStatuses) return @@ -89,3 +110,75 @@ func (rm *resourceManager) customSetOutput(r *resource) { } } + +// This function makes the controller requeue if there is an update and +// the training job is still in InProgress +func customSetOutputUpdateWarmpool(r *resource) error { + trainingJobStatus := r.ko.Status.TrainingJobStatus + if ackcompare.IsNotNil(trainingJobStatus) && *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { + return requeueBeforeUpdate + } + return nil +} + +// Check if warm pool has reached a state where it is not updateable +func warmPoolTerminalCheck(latest *resource) bool { + trainingJobStatus := latest.ko.Status.TrainingJobStatus + if ackcompare.IsNotNil(latest.ko.Spec.ResourceConfig) { + if ackcompare.IsNil(latest.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) { + return true // Warm pool can only be updated iff there is a provisioned cluster. 
+ } + } else { + return false + } + + if ackcompare.IsNotNil(trainingJobStatus) { + if *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { + return false + } + if *trainingJobStatus == svcsdk.TrainingJobStatusCompleted { + if ackcompare.IsNotNil(latest.ko.Status.WarmPoolStatus) { + wp_modifying := svccommon.IsModifyingStatus(latest.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) + return !wp_modifying + } else { + return false // Sometimes the API (briefly) does not return the WP status even if it completes. + } + } else { + // Training Job is in 'Failed'|'Stopping'|'Stopped' (Terminal) + return true + } + } + + // ACK OIDC is misconfigured (Terminal) + return true +} + +// Profiler cannot be updated at certain statuses. +func customSetOutputUpdateProfiler(r *resource) error { + trainingSecondaryStatus := r.ko.Status.SecondaryStatus + trainingJobStatus := r.ko.Status.TrainingJobStatus + if ackcompare.IsNotNil(trainingSecondaryStatus) && *trainingSecondaryStatus == svcsdk.SecondaryStatusStarting { + return requeueBeforeUpdateStarting + } + if ackcompare.IsNotNil(trainingJobStatus) { + for _, terminalStatus := range TrainingJobTerminalProfiler { + if terminalStatus == *trainingJobStatus { + return errors.New("[ACK_SM] Profiler can only be updated when Training Job is in InProgress state") + } + } + } + return nil +} + +// Checks if the profiler was removed. 
+func profilerRemovalCheck(desired *resource, latest *resource) bool { + if ackcompare.IsNotNil(desired.ko.Spec) && ackcompare.IsNotNil(latest.ko.Spec) { + if ackcompare.IsNil(desired.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerRuleConfigurations) { + return true + } + if ackcompare.IsNil(desired.ko.Spec.ProfilerConfig) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerConfig) { + return true + } + } + return false +} diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index b00ecbfc..023f3e63 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -469,6 +469,11 @@ func (rm *resourceManager) sdkFind( } else { ko.Status.ProfilerRuleEvaluationStatuses = nil } + if resp.ProfilingStatus != nil { + ko.Status.ProfilingStatus = resp.ProfilingStatus + } else { + ko.Status.ProfilingStatus = nil + } if resp.ResourceConfig != nil { f25 := &svcapitypes.ResourceConfig{} if resp.ResourceConfig.InstanceCount != nil { @@ -1040,9 +1045,145 @@ func (rm *resourceManager) sdkUpdate( desired *resource, latest *resource, delta *ackcompare.Delta, -) (*resource, error) { - // TODO(jaypipes): Figure this out... 
- return nil, ackerr.NotImplemented +) (updated *resource, err error) { + rlog := ackrtlog.FromContext(ctx) + exit := rlog.Trace("rm.sdkUpdate") + defer func() { + exit(err) + }() + input, err := rm.newUpdateRequestPayload(ctx, desired) + if err != nil { + return nil, err + } + warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") + profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") + if warmpool_diff && profiler_diff { + return latest, errors.New("[ACK_SM] Cannot update Warm pool and Profiler at the same time.") + } + if warmpool_diff { + input.SetProfilerConfig(nil) + input.SetProfilerRuleConfigurations(nil) + warmpool_terminal := warmPoolTerminalCheck(latest) + if warmpool_terminal { + return latest, errors.New("[ACK_SM] Warm pool either does not exist or has reached a non updatable state.") + } + if err := customSetOutputUpdateWarmpool(latest); err != nil { + return nil, err + } + } + if profiler_diff { + if up_err := customSetOutputUpdateProfiler(latest); up_err != nil { + return nil, up_err + } + input.SetResourceConfig(nil) + if profilerRemovalCheck(desired, latest) { + handleProfilerRemoval(input) + } else { + inp_err := customSetUpdateInput(desired, latest, delta, input) + if inp_err != nil { + return nil, err + } + } + } + + var resp *svcsdk.UpdateTrainingJobOutput + _ = resp + resp, err = rm.sdkapi.UpdateTrainingJobWithContext(ctx, input) + rm.metrics.RecordAPICall("UPDATE", "UpdateTrainingJob", err) + if err != nil { + return nil, err + } + // Merge in the information we read from the API call above to the copy of + // the original Kubernetes object we passed to the function + ko := desired.ko.DeepCopy() + + if ko.Status.ACKResourceMetadata == nil { + ko.Status.ACKResourceMetadata = &ackv1alpha1.ResourceMetadata{} + } + if resp.TrainingJobArn != nil { + arn := ackv1alpha1.AWSResourceName(*resp.TrainingJobArn) + ko.Status.ACKResourceMetadata.ARN = &arn + } + + 
rm.setStatusDefaults(ko) + rm.customSetOutput(&resource{ko}) + return &resource{ko}, nil +} + +// newUpdateRequestPayload returns an SDK-specific struct for the HTTP request +// payload of the Update API call for the resource +func (rm *resourceManager) newUpdateRequestPayload( + ctx context.Context, + r *resource, +) (*svcsdk.UpdateTrainingJobInput, error) { + res := &svcsdk.UpdateTrainingJobInput{} + + if r.ko.Spec.ProfilerConfig != nil { + f0 := &svcsdk.ProfilerConfigForUpdate{} + if r.ko.Spec.ProfilerConfig.ProfilingIntervalInMilliseconds != nil { + f0.SetProfilingIntervalInMilliseconds(*r.ko.Spec.ProfilerConfig.ProfilingIntervalInMilliseconds) + } + if r.ko.Spec.ProfilerConfig.ProfilingParameters != nil { + f0f1 := map[string]*string{} + for f0f1key, f0f1valiter := range r.ko.Spec.ProfilerConfig.ProfilingParameters { + var f0f1val string + f0f1val = *f0f1valiter + f0f1[f0f1key] = &f0f1val + } + f0.SetProfilingParameters(f0f1) + } + if r.ko.Spec.ProfilerConfig.S3OutputPath != nil { + f0.SetS3OutputPath(*r.ko.Spec.ProfilerConfig.S3OutputPath) + } + res.SetProfilerConfig(f0) + } + if r.ko.Spec.ProfilerRuleConfigurations != nil { + f1 := []*svcsdk.ProfilerRuleConfiguration{} + for _, f1iter := range r.ko.Spec.ProfilerRuleConfigurations { + f1elem := &svcsdk.ProfilerRuleConfiguration{} + if f1iter.InstanceType != nil { + f1elem.SetInstanceType(*f1iter.InstanceType) + } + if f1iter.LocalPath != nil { + f1elem.SetLocalPath(*f1iter.LocalPath) + } + if f1iter.RuleConfigurationName != nil { + f1elem.SetRuleConfigurationName(*f1iter.RuleConfigurationName) + } + if f1iter.RuleEvaluatorImage != nil { + f1elem.SetRuleEvaluatorImage(*f1iter.RuleEvaluatorImage) + } + if f1iter.RuleParameters != nil { + f1elemf4 := map[string]*string{} + for f1elemf4key, f1elemf4valiter := range f1iter.RuleParameters { + var f1elemf4val string + f1elemf4val = *f1elemf4valiter + f1elemf4[f1elemf4key] = &f1elemf4val + } + f1elem.SetRuleParameters(f1elemf4) + } + if f1iter.S3OutputPath != nil { + 
f1elem.SetS3OutputPath(*f1iter.S3OutputPath) + } + if f1iter.VolumeSizeInGB != nil { + f1elem.SetVolumeSizeInGB(*f1iter.VolumeSizeInGB) + } + f1 = append(f1, f1elem) + } + res.SetProfilerRuleConfigurations(f1) + } + if r.ko.Spec.ResourceConfig != nil { + f2 := &svcsdk.ResourceConfigForUpdate{} + if r.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds != nil { + f2.SetKeepAlivePeriodInSeconds(*r.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) + } + res.SetResourceConfig(f2) + } + if r.ko.Spec.TrainingJobName != nil { + res.SetTrainingJobName(*r.ko.Spec.TrainingJobName) + } + + return res, nil } // sdkDelete deletes the supplied resource in the backend AWS service API @@ -1192,7 +1333,9 @@ func (rm *resourceManager) updateConditions( } // Required to avoid the "declared but not used" error in the default case _ = syncCondition - if terminalCondition != nil || recoverableCondition != nil || syncCondition != nil { + // custom update conditions + customUpdate := rm.CustomUpdateConditions(ko, r, err) + if terminalCondition != nil || recoverableCondition != nil || syncCondition != nil || customUpdate { return &resource{ko}, true // updated } return nil, false // not updated diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index da6b4dd3..5e370bf8 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -182,18 +182,6 @@ tests: invoke: ReadOne expect: latest_state: "v1alpha1/readone/observed/warmpool_inuse.yaml" - - name: "Training job update tests" - description: "Testing the Update operation" - scenarios: - - name: "Update=NotSupported" - description: "This test checks if the controller throws error for update" - given: - desired_state: "v1alpha1/update/desired/updated_base.yaml" - latest_state: "v1alpha1/create/observed/success_after_create.yaml" - invoke: Update - expect: - latest_state: "v1alpha1/update/observed/error_on_update.yaml" - 
error: "not implemented" - name: "Training job delete tests" description: "Testing the delete operation" scenarios: diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed.yaml index 9b3e4bca..c086d9a3 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed.yaml @@ -108,3 +108,4 @@ status: s3ModelArtifacts: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output/model.tar.gz secondaryStatus: Completed trainingJobStatus: Completed + profilingStatus: Disabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed_debugger_variation.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed_debugger_variation.yaml index 827f165b..6b78b3d2 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed_debugger_variation.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed_debugger_variation.yaml @@ -149,4 +149,5 @@ status: ruleConfigurationName: ProfilerReport ruleEvaluationStatus: Completed secondaryStatus: Completed - trainingJobStatus: Completed \ No newline at end of file + trainingJobStatus: Completed + profilingStatus: Enabled \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/conditions_clear_on_success.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/conditions_clear_on_success.yaml index 21abd8ed..b4431657 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/conditions_clear_on_success.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/conditions_clear_on_success.yaml @@ -108,3 +108,4 @@ status: type: ACK.ResourceSynced secondaryStatus: Downloading trainingJobStatus: InProgress + profilingStatus: Disabled diff --git 
a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created.yaml index 6bd29e20..ace5bc9c 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created.yaml @@ -106,3 +106,4 @@ status: type: ACK.ResourceSynced secondaryStatus: Downloading trainingJobStatus: InProgress + profilingStatus: Disabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_debugger_variation.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_debugger_variation.yaml index 3432a708..006d0382 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_debugger_variation.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_debugger_variation.yaml @@ -150,3 +150,4 @@ status: ruleEvaluationStatus: InProgress secondaryStatus: Starting trainingJobStatus: InProgress + profilingStatus: Enabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/late_initialize.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/late_initialize.yaml index a08bbe9f..7d41233d 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/late_initialize.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/late_initialize.yaml @@ -110,3 +110,4 @@ status: type: ACK.LateInitialized secondaryStatus: Downloading trainingJobStatus: InProgress + profilingStatus: Disabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping.yaml index c90f2114..bbd786d2 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping.yaml @@ -106,3 +106,4 @@ status: type: ACK.ResourceSynced 
secondaryStatus: Starting trainingJobStatus: Stopping + profilingStatus: Disabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping_debugger_variation.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping_debugger_variation.yaml index a06b365c..0558f186 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping_debugger_variation.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping_debugger_variation.yaml @@ -152,3 +152,4 @@ status: ruleEvaluationStatus: InProgress secondaryStatus: Starting trainingJobStatus: Stopping + profilingStatus: Enabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml index 9658b7ab..9819560f 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml @@ -108,4 +108,5 @@ status: warmPoolStatus: status: Available secondaryStatus: Completed - trainingJobStatus: Completed \ No newline at end of file + trainingJobStatus: Completed + profilingStatus: Disabled \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml index d5bfe159..f7acc768 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml @@ -108,4 +108,5 @@ status: warmPoolStatus: status: InUse secondaryStatus: Starting - trainingJobStatus: InProgress \ No newline at end of file + trainingJobStatus: InProgress + profilingStatus: Disabled \ No newline at end of file diff --git 
a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml index 115764af..2cd053eb 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml @@ -110,4 +110,5 @@ status: reusedByJob: Trainingjob-ccsjjbdsjhhcsvdj resourceRetainedBillableTimeInSeconds: 69 secondaryStatus: Completed - trainingJobStatus: Completed \ No newline at end of file + trainingJobStatus: Completed + profilingStatus: Disabled \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml index 99829bad..b0d56c11 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml @@ -110,4 +110,5 @@ status: resourceRetainedBillableTimeInSeconds: 69 secondaryStatus: Completed trainingJobStatus: Completed - + profilingStatus: Disabled + diff --git a/templates/training_job/sdk_update_post_build_request.go.tpl b/templates/training_job/sdk_update_post_build_request.go.tpl new file mode 100644 index 00000000..68984daf --- /dev/null +++ b/templates/training_job/sdk_update_post_build_request.go.tpl @@ -0,0 +1,31 @@ +warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") +profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") +if warmpool_diff && profiler_diff{ + return latest, errors.New("[ACK_SM] Cannot update Warm pool and Profiler at the same time.") +} +if warmpool_diff { + input.SetProfilerConfig(nil) + input.SetProfilerRuleConfigurations(nil) + warmpool_terminal := warmPoolTerminalCheck(latest) + if 
warmpool_terminal { + return latest, errors.New("[ACK_SM] Warm pool either does not exist or has reached a non updatable state.") + } + //Requeue if TrainingJob is in InProgress state + if err := customSetOutputUpdateWarmpool(latest); err != nil { + return nil,err + } +} +if profiler_diff { + if up_err := customSetOutputUpdateProfiler(latest); up_err != nil { + return nil, up_err + } + input.SetResourceConfig(nil) + if profilerRemovalCheck(desired, latest) { + handleProfilerRemoval(input) + } else{ + inp_err := customSetUpdateInput(desired, latest, delta, input) + if inp_err != nil { + return nil, err + } + } +} From b28e26f0e3e1093158aee3fa5d0a281d6a528639 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 18 Nov 2022 20:26:11 +0000 Subject: [PATCH 12/71] test: added integration test --- test/e2e/tests/test_trainingjob_debugger.py | 91 ++++++++++++++++++--- 1 file changed, 80 insertions(+), 11 deletions(-) diff --git a/test/e2e/tests/test_trainingjob_debugger.py b/test/e2e/tests/test_trainingjob_debugger.py index 5bccd3bc..efd1a3bc 100644 --- a/test/e2e/tests/test_trainingjob_debugger.py +++ b/test/e2e/tests/test_trainingjob_debugger.py @@ -30,6 +30,7 @@ from e2e.common import config as cfg RESOURCE_PLURAL = "trainingjobs" +NEW_PROFILER_INTERVAL = 200 @pytest.fixture(scope="function") @@ -37,7 +38,7 @@ def xgboost_training_job_debugger(): resource_name = random_suffix_name("xgboost-trainingjob-debugger", 50) replacements = REPLACEMENT_VALUES.copy() replacements["TRAINING_JOB_NAME"] = resource_name - reference, _, resource = create_sagemaker_resource( + reference, spec, resource = create_sagemaker_resource( resource_plural=RESOURCE_PLURAL, resource_name=resource_name, spec_file="xgboost_trainingjob_debugger", @@ -45,7 +46,7 @@ def xgboost_training_job_debugger(): ) assert resource is not None - yield (reference, resource) + yield (reference, resource, spec) if k8s.get_resource_exists(reference): _, deleted = k8s.delete_custom_resource(reference, 3, 10) @@ -54,17 
+55,20 @@ def xgboost_training_job_debugger(): def get_training_rule_eval_sagemaker_status(training_job_name: str, rule_type: str): training_sm_desc = get_sagemaker_training_job(training_job_name) - return training_sm_desc[rule_type+"EvaluationStatuses"][0]["RuleEvaluationStatus"] + return training_sm_desc[rule_type + "EvaluationStatuses"][0]["RuleEvaluationStatus"] -def get_training_rule_eval_resource_status(reference: k8s.CustomResourceReference, rule_type: str): +def get_training_rule_eval_resource_status( + reference: k8s.CustomResourceReference, rule_type: str +): resource = k8s.get_resource(reference) - resource_status = resource["status"][rule_type+"EvaluationStatuses"][0][ + resource_status = resource["status"][rule_type + "EvaluationStatuses"][0][ "ruleEvaluationStatus" ] assert resource_status is not None return resource_status + @service_marker class TestTrainingDebuggerJob: def _wait_sagemaker_training_rule_eval_status( @@ -107,14 +111,18 @@ def _assert_training_rule_eval_status_in_sync( resource_rule_type = sagemaker_rule_type[0].lower() + sagemaker_rule_type[1:] assert ( self._wait_sagemaker_training_rule_eval_status( - training_job_name, sagemaker_rule_type, expected_status, + training_job_name, + sagemaker_rule_type, + expected_status, + ) + == self._wait_resource_training_rule_eval_status( + reference, resource_rule_type, expected_status ) - == self._wait_resource_training_rule_eval_status(reference, resource_rule_type, expected_status) == expected_status ) def test_completed(self, xgboost_training_job_debugger): - (reference, resource) = xgboost_training_job_debugger + (reference, resource, _) = xgboost_training_job_debugger assert k8s.get_resource_exists(reference) training_job_name = resource["spec"].get("trainingJobName", None) @@ -122,7 +130,7 @@ def test_completed(self, xgboost_training_job_debugger): training_job_desc = get_sagemaker_training_job(training_job_name) training_job_arn = training_job_desc["TrainingJobArn"] - + resource_arn = 
k8s.get_resource_arn(resource) if resource_arn is None: logging.error( @@ -142,7 +150,7 @@ def test_completed(self, xgboost_training_job_debugger): self._assert_training_rule_eval_status_in_sync( training_job_name, "DebugRule", reference, cfg.RULE_STATUS_COMPLETED ) - + # Assert profiler rule evaluation completed self._assert_training_rule_eval_status_in_sync( training_job_name, "ProfilerRule", reference, cfg.RULE_STATUS_COMPLETED @@ -153,5 +161,66 @@ def test_completed(self, xgboost_training_job_debugger): assert_tags_in_sync(training_job_arn, resource_tags) # Check that you can delete a completed resource from k8s - _, deleted = k8s.delete_custom_resource(reference, cfg.JOB_DELETE_WAIT_PERIODS, cfg.JOB_DELETE_WAIT_LENGTH) + _, deleted = k8s.delete_custom_resource( + reference, cfg.JOB_DELETE_WAIT_PERIODS, cfg.JOB_DELETE_WAIT_LENGTH + ) + assert deleted is True + + def test_update(self, xgboost_training_job_debugger): + (reference, resource, spec) = xgboost_training_job_debugger + assert k8s.get_resource_exists(reference) + + training_job_name = resource["spec"].get("trainingJobName", None) + assert training_job_name is not None + + training_job_desc = get_sagemaker_training_job(training_job_name) + training_job_arn = training_job_desc["TrainingJobArn"] + + resource_arn = k8s.get_resource_arn(resource) + if resource_arn is None: + logging.error( + f"ARN for this resource is None, resource status is: {resource['status']}" + ) + assert resource_arn == training_job_arn + + assert training_job_desc["TrainingJobStatus"] == cfg.JOB_STATUS_INPROGRESS + assert k8s.wait_on_condition(reference, "ACK.ResourceSynced", "False") + + spec["spec"]["profilerConfig"][ + "profilingIntervalInMilliseconds" + ] = NEW_PROFILER_INTERVAL + k8s.patch_custom_resource(reference, spec) + + assert_training_status_in_sync( + training_job_name, reference, cfg.JOB_STATUS_COMPLETED + ) + assert k8s.wait_on_condition(reference, "ACK.ResourceSynced", "False") + + # Assert debugger rule evaluation 
completed + self._assert_training_rule_eval_status_in_sync( + training_job_name, "DebugRule", reference, cfg.RULE_STATUS_COMPLETED + ) + + # Assert profiler rule evaluation completed + self._assert_training_rule_eval_status_in_sync( + training_job_name, "ProfilerRule", reference, cfg.RULE_STATUS_COMPLETED + ) + assert k8s.wait_on_condition(reference, "ACK.ResourceSynced", "True") + + # Check if the update worked. + training_sm_desc = get_sagemaker_training_job(training_job_name) + assert ( + training_sm_desc["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] + == NEW_PROFILER_INTERVAL + ) + resource = k8s.get_resource(reference) + assert ( + resource["spec"]["profilerConfig"]["profilingIntervalInMilliseconds"] + == NEW_PROFILER_INTERVAL + ) + + # Check that you can delete a completed resource from k8s + _, deleted = k8s.delete_custom_resource( + reference, cfg.JOB_DELETE_WAIT_PERIODS, cfg.JOB_DELETE_WAIT_LENGTH + ) assert deleted is True From 8c79de8088b7be98a153e4dd68576bc1c4c225fd Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 21 Nov 2022 06:12:23 +0000 Subject: [PATCH 13/71] fix: corrected hook --- apis/v1alpha1/ack-generate-metadata.yaml | 4 ++-- apis/v1alpha1/generator.yaml | 2 +- generator.yaml | 2 +- pkg/resource/training_job/hooks.go | 14 ++++++++++++++ pkg/resource/training_job/sdk.go | 3 ++- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 1bb00153..ccc0e77e 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-11-18T16:40:15Z" + build_date: "2022-11-21T06:07:29Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc @@ -7,7 +7,7 @@ api_directory_checksum: a11209e81e188afecb6812ec3080cead7ce995b1 api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: 
9678da88ff1b4061cfae104e44116cf0d87ebe09 + file_checksum: ac5bdf0ea0d52467b65d65438608bbb1eb0ee571 original_file_name: generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index eacb3dce..485ffd82 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -134,7 +134,7 @@ resources: sdk_update_post_build_request: template_path: training_job/sdk_update_post_build_request.go.tpl sdk_update_post_set_output: - code: rm.customSetOutput(&resource{ko}) + code: customSetOutputPostUpdate(ko, delta) sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: diff --git a/generator.yaml b/generator.yaml index eacb3dce..485ffd82 100644 --- a/generator.yaml +++ b/generator.yaml @@ -134,7 +134,7 @@ resources: sdk_update_post_build_request: template_path: training_job/sdk_update_post_build_request.go.tpl sdk_update_post_set_output: - code: rm.customSetOutput(&resource{ko}) + code: customSetOutputPostUpdate(ko, delta) sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index a4e39980..a2d830cd 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -18,6 +18,7 @@ import ( ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" ackrequeue "github.com/aws-controllers-k8s/runtime/pkg/requeue" + svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" svccommon "github.com/aws-controllers-k8s/sagemaker-controller/pkg/common" "github.com/aws/aws-sdk-go/aws" svcsdk "github.com/aws/aws-sdk-go/service/sagemaker" @@ -182,3 +183,16 @@ func profilerRemovalCheck(desired *resource, latest *resource) bool { } return false } + +// The statuses in ko object in the end of update are empty, using customSetOutput wont work. 
+func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.Delta) { + warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") + profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") + if profiler_diff { + svccommon.SetSyncedCondition(&resource{ko}, aws.string("InProgress"), &resourceName, &trainingJobModifyingStatuses) + } + if warmpool_diff { + svccommon.SetSyncedCondition(&resource{ko}, aws.string("Availible"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) + } + +} diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 023f3e63..ec5a0146 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1067,6 +1067,7 @@ func (rm *resourceManager) sdkUpdate( if warmpool_terminal { return latest, errors.New("[ACK_SM] Warm pool either does not exist or has reached a non updatable state.") } + //Requeue if TrainingJob is in InProgress state if err := customSetOutputUpdateWarmpool(latest); err != nil { return nil, err } @@ -1106,7 +1107,7 @@ func (rm *resourceManager) sdkUpdate( } rm.setStatusDefaults(ko) - rm.customSetOutput(&resource{ko}) + customSetOutputPostUpdate(ko, delta) return &resource{ko}, nil } From 60747ea96cc4ae3197442b2f66e78d1269ed6ad4 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 21 Nov 2022 06:17:43 +0000 Subject: [PATCH 14/71] fix: small typo --- pkg/resource/training_job/hooks.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index a2d830cd..54274979 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -189,10 +189,10 @@ func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.De warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") profiler_diff := 
delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") if profiler_diff { - svccommon.SetSyncedCondition(&resource{ko}, aws.string("InProgress"), &resourceName, &trainingJobModifyingStatuses) + svccommon.SetSyncedCondition(&resource{ko}, aws.String("InProgress"), &resourceName, &trainingJobModifyingStatuses) } if warmpool_diff { - svccommon.SetSyncedCondition(&resource{ko}, aws.string("Availible"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) + svccommon.SetSyncedCondition(&resource{ko}, aws.String("Availible"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) } } From 5a5a178ef992bc64e093b9e1f009225e46bd40f5 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 21 Nov 2022 07:14:57 +0000 Subject: [PATCH 15/71] fix: corrected spelling/grammar erorrs --- pkg/resource/training_job/hooks.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 54274979..c0563c02 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -51,7 +51,7 @@ var ( ) requeueBeforeUpdate = ackrequeue.NeededAfter( - errors.New("Warm pool cannot be updated in InProgress state requeuing until TrainingJob reaches completed state."), + errors.New("Warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state."), ackrequeue.DefaultRequeueAfterDuration, ) requeueBeforeUpdateStarting = ackrequeue.NeededAfter( @@ -192,7 +192,7 @@ func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.De svccommon.SetSyncedCondition(&resource{ko}, aws.String("InProgress"), &resourceName, &trainingJobModifyingStatuses) } if warmpool_diff { - svccommon.SetSyncedCondition(&resource{ko}, aws.String("Availible"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) + svccommon.SetSyncedCondition(&resource{ko}, aws.String("Available"), 
aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) } } From 22a72b01feb6a8f15e2cfb0787196e60d005b1b7 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 22 Nov 2022 18:02:52 +0000 Subject: [PATCH 16/71] fix: handle invalid update --- apis/v1alpha1/ack-generate-metadata.yaml | 2 +- pkg/resource/training_job/sdk.go | 3 +++ templates/training_job/sdk_update_post_build_request.go.tpl | 5 ++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index ccc0e77e..791bf8d3 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-11-21T06:07:29Z" + build_date: "2022-11-22T17:54:48Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index ec5a0146..5358b78b 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1060,6 +1060,9 @@ func (rm *resourceManager) sdkUpdate( if warmpool_diff && profiler_diff { return latest, errors.New("[ACK_SM] Cannot update Warm pool and Profiler at the same time.") } + if !warmpool_diff && !profiler_diff { + return latest, errors.New("[ACK_SM] Only Warm Pool or Profiler can be updated") + } if warmpool_diff { input.SetProfilerConfig(nil) input.SetProfilerRuleConfigurations(nil) diff --git a/templates/training_job/sdk_update_post_build_request.go.tpl b/templates/training_job/sdk_update_post_build_request.go.tpl index 68984daf..38d60b0c 100644 --- a/templates/training_job/sdk_update_post_build_request.go.tpl +++ b/templates/training_job/sdk_update_post_build_request.go.tpl @@ -1,8 +1,11 @@ warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || 
delta.DifferentAt("Spec.ProfilerRuleConfigurations") -if warmpool_diff && profiler_diff{ +if warmpool_diff && profiler_diff { return latest, errors.New("[ACK_SM] Cannot update Warm pool and Profiler at the same time.") } +if !warmpool_diff && !profiler_diff { + return latest, errors.New("[ACK_SM] Only Warm Pool or Profiler can be updated") +} if warmpool_diff { input.SetProfilerConfig(nil) input.SetProfilerRuleConfigurations(nil) From baab0a6b4bbb589b7c189544b4dfd4916c09c4af Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 22 Nov 2022 19:00:13 +0000 Subject: [PATCH 17/71] refactor: changed comments/terminal condition --- apis/v1alpha1/ack-generate-metadata.yaml | 4 +- apis/v1alpha1/generator.yaml | 1 - generator.yaml | 1 - .../training_job/custom_set_update_input.go | 13 +++-- .../training_job/custom_update_conditions.go | 53 ------------------- pkg/resource/training_job/hooks.go | 18 ++++--- pkg/resource/training_job/sdk.go | 10 ++-- .../sdk_update_post_build_request.go.tpl | 6 +-- 8 files changed, 28 insertions(+), 78 deletions(-) delete mode 100644 pkg/resource/training_job/custom_update_conditions.go diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 791bf8d3..b95cdb0a 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-11-22T17:54:48Z" + build_date: "2022-11-22T18:47:47Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc @@ -7,7 +7,7 @@ api_directory_checksum: a11209e81e188afecb6812ec3080cead7ce995b1 api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: ac5bdf0ea0d52467b65d65438608bbb1eb0ee571 + file_checksum: ecbd3d6faa6352c2e9af3cbbe365a6d75c19c3ce original_file_name: generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index 
485ffd82..2b099c73 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -123,7 +123,6 @@ resources: - InvalidParameterCombination - InvalidParameterValue - MissingParameter - update_conditions_custom_method_name: CustomUpdateConditions hooks: delta_pre_compare: code: customSetDefaults(a, b) diff --git a/generator.yaml b/generator.yaml index 485ffd82..2b099c73 100644 --- a/generator.yaml +++ b/generator.yaml @@ -123,7 +123,6 @@ resources: - InvalidParameterCombination - InvalidParameterValue - MissingParameter - update_conditions_custom_method_name: CustomUpdateConditions hooks: delta_pre_compare: code: customSetDefaults(a, b) diff --git a/pkg/resource/training_job/custom_set_update_input.go b/pkg/resource/training_job/custom_set_update_input.go index cede7eed..2ea43247 100644 --- a/pkg/resource/training_job/custom_set_update_input.go +++ b/pkg/resource/training_job/custom_set_update_input.go @@ -20,16 +20,17 @@ import ( "errors" ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" + ackerr "github.com/aws-controllers-k8s/runtime/pkg/errors" smv1alpha "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" svcsdk "github.com/aws/aws-sdk-go/service/sagemaker" ) +// customSetUpdateInput modifies the input of UpdateTrainingJob. // Three conditions: // 1. Customer updates both profiler parameters: Recreate the input for profiler Rule. // 2. Customer only updates Profiler Config: Set the profiler rule configuration to nil to avoid validation error. // 3. Customer only updates Rule Configurations: Recreate the input for profiler Rule and set Profiler config to nil. // safer to do this because the "only add" behavior might reappear. 
- func customSetUpdateInput(desired *resource, latest *resource, delta *ackcompare.Delta, input *svcsdk.UpdateTrainingJobInput) error { if delta.DifferentAt("Spec.ProfilerConfig") && delta.DifferentAt("Spec.ProfilerRuleConfigurations") { err := handleProfilerRuleConfig(desired, latest, input) @@ -47,6 +48,8 @@ func customSetUpdateInput(desired *resource, latest *resource, delta *ackcompare return nil } +// handleProfilerRuleConfig sets the input of the ProfilerRuleConfiguration so that +// it is compatible with the sagemaker API. // Update training job is post operation wrt to the profiler parameters. // Because of this only NEW rules can be specified. // In this function we check to see if any new profiler configurstions have been added. @@ -55,13 +58,13 @@ func handleProfilerRuleConfig(desired *resource, latest *resource, input *svcsdk profilerRuleLatest := latest.ko.Spec.ProfilerRuleConfigurations if ackcompare.IsNil(profilerRuleDesired) { - return errors.New("[ACK_SM] Cannot remove a profiler rule.") + return ackerr.NewTerminalError(errors.New("cannot remove a profiler rule.")) } if ackcompare.IsNil(profilerRuleLatest) { return nil } if len(profilerRuleDesired) < len(profilerRuleLatest) { - return errors.New("[ACK_SM] Cannot remove a profiler rule.") + return ackerr.NewTerminalError(errors.New("cannot remove a profiler rule.")) } ruleMap := map[string]int{} @@ -83,7 +86,7 @@ func handleProfilerRuleConfig(desired *resource, latest *resource, input *svcsdk return nil } -// Recreates input and sets disable profiler to true +// handleProfilerRemoval sets the input parameters to disable the profiler. 
func handleProfilerRemoval(input *svcsdk.UpdateTrainingJobInput) { input.SetProfilerRuleConfigurations(nil) profilerConfig := svcsdk.ProfilerConfigForUpdate{} @@ -91,6 +94,8 @@ func handleProfilerRemoval(input *svcsdk.UpdateTrainingJobInput) { input.SetProfilerConfig(&profilerConfig) } +// convertProfileRuleType converts the kubernetes object ProfilerRuleConfiguration into +// a type that is compatible with the AWS API. // Sagemaker and kubernetes types are not the same so the input has to be reconstructed. func convertProfileRuleType(rule *smv1alpha.ProfilerRuleConfiguration) *svcsdk.ProfilerRuleConfiguration { smRule := &svcsdk.ProfilerRuleConfiguration{} diff --git a/pkg/resource/training_job/custom_update_conditions.go b/pkg/resource/training_job/custom_update_conditions.go deleted file mode 100644 index c94cfcf0..00000000 --- a/pkg/resource/training_job/custom_update_conditions.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"). You may -// not use this file except in compliance with the License. A copy of the -// License is located at -// -// http://aws.amazon.com/apache2.0/ -// -// or in the "license" file accompanying this file. This file is distributed -// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -// express or implied. See the License for the specific language governing -// permissions and limitations under the License. 
- -// Use this file if conditions need to be updated based on the latest status -// of training job which is not evident from API response - -package training_job - -import ( - "strings" - - ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" - ackcondition "github.com/aws-controllers-k8s/runtime/pkg/condition" - svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" - corev1 "k8s.io/api/core/v1" -) - -var ( - terminalCode string = "[ACK_SM]" -) - -// If the controller runs into an error that contains "[ACK_SM]" -// it will set the resource to a terminal state because it is an unrecoverable error. -func (rm *resourceManager) CustomUpdateConditions( - ko *svcapitypes.TrainingJob, - r *resource, - err error, -) bool { - - if ackcompare.IsNil(err) { - return false - } - - if strings.Contains(err.Error(), terminalCode) { - conditionManager := &resource{ko} - exception := err.Error() - ackcondition.SetTerminal(conditionManager, corev1.ConditionTrue, &exception, nil) - return true - } - - return false - -} diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index c0563c02..0df5b553 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -17,6 +17,7 @@ import ( "errors" ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" + ackerr "github.com/aws-controllers-k8s/runtime/pkg/errors" ackrequeue "github.com/aws-controllers-k8s/runtime/pkg/requeue" svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" svccommon "github.com/aws-controllers-k8s/sagemaker-controller/pkg/common" @@ -51,11 +52,11 @@ var ( ) requeueBeforeUpdate = ackrequeue.NeededAfter( - errors.New("Warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state."), + errors.New("warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state."), ackrequeue.DefaultRequeueAfterDuration, ) 
requeueBeforeUpdateStarting = ackrequeue.NeededAfter( - errors.New("Controller cannot update while secondary status is in Starting state."), + errors.New("controller cannot update while secondary status is in Starting state."), ackrequeue.DefaultRequeueAfterDuration, ) ) @@ -112,7 +113,7 @@ func (rm *resourceManager) customSetOutput(r *resource) { } -// This function makes the controller requeue if there is an update and +// customSetOutputUpdateWarmpool makes the controller requeue if there is an update and // the training job is still in InProgress func customSetOutputUpdateWarmpool(r *resource) error { trainingJobStatus := r.ko.Status.TrainingJobStatus @@ -122,7 +123,7 @@ func customSetOutputUpdateWarmpool(r *resource) error { return nil } -// Check if warm pool has reached a state where it is not updateable +// warmPoolTerminalCheck checks if warm pool has reached a state where it is not updateable func warmPoolTerminalCheck(latest *resource) bool { trainingJobStatus := latest.ko.Status.TrainingJobStatus if ackcompare.IsNotNil(latest.ko.Spec.ResourceConfig) { @@ -154,7 +155,8 @@ func warmPoolTerminalCheck(latest *resource) bool { return true } -// Profiler cannot be updated at certain statuses. +// customSetOutputUpdateProfiler decides whether the training job is ready/eligible for update +// depending on the status. 
func customSetOutputUpdateProfiler(r *resource) error { trainingSecondaryStatus := r.ko.Status.SecondaryStatus trainingJobStatus := r.ko.Status.TrainingJobStatus @@ -164,14 +166,14 @@ func customSetOutputUpdateProfiler(r *resource) error { if ackcompare.IsNotNil(trainingJobStatus) { for _, terminalStatus := range TrainingJobTerminalProfiler { if terminalStatus == *trainingJobStatus { - return errors.New("[ACK_SM] Profiler can only be updated when Training Job is in InProgress state") + return ackerr.NewTerminalError(errors.New("profiler can only be updated when Training Job is in InProgress state")) } } } return nil } -// Checks if the profiler was removed. +// profilerRemovalCheck checks if the profiler was removed. func profilerRemovalCheck(desired *resource, latest *resource) bool { if ackcompare.IsNotNil(desired.ko.Spec) && ackcompare.IsNotNil(latest.ko.Spec) { if ackcompare.IsNil(desired.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerRuleConfigurations) { @@ -184,7 +186,7 @@ func profilerRemovalCheck(desired *resource, latest *resource) bool { return false } -// The statuses in ko object in the end of update are empty, using customSetOutput wont work. +// customSetOutputPostUpdate sets the synced condition at the end of the update. 
func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.Delta) { warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 5358b78b..78eb3196 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1058,17 +1058,17 @@ func (rm *resourceManager) sdkUpdate( warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") if warmpool_diff && profiler_diff { - return latest, errors.New("[ACK_SM] Cannot update Warm pool and Profiler at the same time.") + return latest, ackerr.NewTerminalError(errors.New("cannot update Warm pool and Profiler at the same time")) } if !warmpool_diff && !profiler_diff { - return latest, errors.New("[ACK_SM] Only Warm Pool or Profiler can be updated") + return latest, ackerr.NewTerminalError(errors.New("only Warm Pool or Profiler can be updated")) } if warmpool_diff { input.SetProfilerConfig(nil) input.SetProfilerRuleConfigurations(nil) warmpool_terminal := warmPoolTerminalCheck(latest) if warmpool_terminal { - return latest, errors.New("[ACK_SM] Warm pool either does not exist or has reached a non updatable state.") + return latest, ackerr.NewTerminalError(errors.New("warm pool either does not exist or has reached a non updatable state")) } //Requeue if TrainingJob is in InProgress state if err := customSetOutputUpdateWarmpool(latest); err != nil { @@ -1337,9 +1337,7 @@ func (rm *resourceManager) updateConditions( } // Required to avoid the "declared but not used" error in the default case _ = syncCondition - // custom update conditions - customUpdate := rm.CustomUpdateConditions(ko, r, err) - if terminalCondition != nil || 
recoverableCondition != nil || syncCondition != nil || customUpdate { + if terminalCondition != nil || recoverableCondition != nil || syncCondition != nil { return &resource{ko}, true // updated } return nil, false // not updated diff --git a/templates/training_job/sdk_update_post_build_request.go.tpl b/templates/training_job/sdk_update_post_build_request.go.tpl index 38d60b0c..bbc63400 100644 --- a/templates/training_job/sdk_update_post_build_request.go.tpl +++ b/templates/training_job/sdk_update_post_build_request.go.tpl @@ -1,17 +1,17 @@ warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") if warmpool_diff && profiler_diff { - return latest, errors.New("[ACK_SM] Cannot update Warm pool and Profiler at the same time.") + return latest, ackerr.NewTerminalError(errors.New("cannot update Warm pool and Profiler at the same time")) } if !warmpool_diff && !profiler_diff { - return latest, errors.New("[ACK_SM] Only Warm Pool or Profiler can be updated") + return latest, ackerr.NewTerminalError(errors.New("only Warm Pool or Profiler can be updated")) } if warmpool_diff { input.SetProfilerConfig(nil) input.SetProfilerRuleConfigurations(nil) warmpool_terminal := warmPoolTerminalCheck(latest) if warmpool_terminal { - return latest, errors.New("[ACK_SM] Warm pool either does not exist or has reached a non updatable state.") + return latest, ackerr.NewTerminalError(errors.New("warm pool either does not exist or has reached a non updatable state")) } //Requeue if TrainingJob is in InProgress state if err := customSetOutputUpdateWarmpool(latest); err != nil { From 095c65d9adc910792fbfb418de99393a2c3c6516 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 18 Nov 2022 17:35:48 +0000 Subject: [PATCH 18/71] Feature: Update TrainingJob --- apis/v1alpha1/generator.yaml | 1 + .../training_job/custom_update_conditions.go | 53 ++++++ 
pkg/resource/training_job/hooks.go | 159 ++++++++++++++++++ pkg/resource/training_job/sdk.go | 4 +- 4 files changed, 216 insertions(+), 1 deletion(-) create mode 100644 pkg/resource/training_job/custom_update_conditions.go diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index 2b099c73..485ffd82 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -123,6 +123,7 @@ resources: - InvalidParameterCombination - InvalidParameterValue - MissingParameter + update_conditions_custom_method_name: CustomUpdateConditions hooks: delta_pre_compare: code: customSetDefaults(a, b) diff --git a/pkg/resource/training_job/custom_update_conditions.go b/pkg/resource/training_job/custom_update_conditions.go new file mode 100644 index 00000000..c94cfcf0 --- /dev/null +++ b/pkg/resource/training_job/custom_update_conditions.go @@ -0,0 +1,53 @@ +// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. 
+ +// Use this file if conditions need to be updated based on the latest status +// of training job which is not evident from API response + +package training_job + +import ( + "strings" + + ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" + ackcondition "github.com/aws-controllers-k8s/runtime/pkg/condition" + svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" + corev1 "k8s.io/api/core/v1" +) + +var ( + terminalCode string = "[ACK_SM]" +) + +// If the controller runs into an error that contains "[ACK_SM]" +// it will set the resource to a terminal state because it is an unrecoverable error. +func (rm *resourceManager) CustomUpdateConditions( + ko *svcapitypes.TrainingJob, + r *resource, + err error, +) bool { + + if ackcompare.IsNil(err) { + return false + } + + if strings.Contains(err.Error(), terminalCode) { + conditionManager := &resource{ko} + exception := err.Error() + ackcondition.SetTerminal(conditionManager, corev1.ConditionTrue, &exception, nil) + return true + } + + return false + +} diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 0df5b553..6929ab50 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -186,6 +186,93 @@ func profilerRemovalCheck(desired *resource, latest *resource) bool { return false } +// customSetOutputPostUpdate sets the synced condition at the end of the update. 
+func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.Delta) { + warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") + profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") + if profiler_diff { + svccommon.SetSyncedCondition(&resource{ko}, aws.String("InProgress"), &resourceName, &trainingJobModifyingStatuses) + } + if warmpool_diff { + svccommon.SetSyncedCondition(&resource{ko}, aws.String("Available"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) + } + } + +} + +// customSetOutputUpdateWarmpool makes the controller requeue if there is an update and +// the training job is still in InProgress +func customSetOutputUpdateWarmpool(r *resource) error { + trainingJobStatus := r.ko.Status.TrainingJobStatus + if ackcompare.IsNotNil(trainingJobStatus) && *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { + return requeueBeforeUpdate + } + return nil +} + +// warmPoolTerminalCheck checks if warm pool has reached a state where it is not updateable +func warmPoolTerminalCheck(latest *resource) bool { + trainingJobStatus := latest.ko.Status.TrainingJobStatus + if ackcompare.IsNotNil(latest.ko.Spec.ResourceConfig) { + if ackcompare.IsNil(latest.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) { + return true // Warm pool can only be updated iff there is a provisioned cluster. + } + } else { + return false + } + + if ackcompare.IsNotNil(trainingJobStatus) { + if *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { + return false + } + if *trainingJobStatus == svcsdk.TrainingJobStatusCompleted { + if ackcompare.IsNotNil(latest.ko.Status.WarmPoolStatus) { + wp_modifying := svccommon.IsModifyingStatus(latest.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) + return !wp_modifying + } else { + return false // Sometimes the API (briefly) does not return the WP status even if it completes. 
+ } + } else { + // Training Job is in 'Failed'|'Stopping'|'Stopped' (Terminal) + return true + } + } + + // ACK OIDC is misconfigured (Terminal) + return true +} + +// customSetOutputUpdateProfiler decides whether the training job is ready/eligible for update +// depending on the status. +func customSetOutputUpdateProfiler(r *resource) error { + trainingSecondaryStatus := r.ko.Status.SecondaryStatus + trainingJobStatus := r.ko.Status.TrainingJobStatus + if ackcompare.IsNotNil(trainingSecondaryStatus) && *trainingSecondaryStatus == svcsdk.SecondaryStatusStarting { + return requeueBeforeUpdateStarting + } + if ackcompare.IsNotNil(trainingJobStatus) { + for _, terminalStatus := range TrainingJobTerminalProfiler { + if terminalStatus == *trainingJobStatus { + return ackerr.NewTerminalError(errors.New("profiler can only be updated when Training Job is in InProgress state")) + } + } + } + return nil +} + +// profilerRemovalCheck checks if the profiler was removed. +func profilerRemovalCheck(desired *resource, latest *resource) bool { + if ackcompare.IsNotNil(desired.ko.Spec) && ackcompare.IsNotNil(latest.ko.Spec) { + if ackcompare.IsNil(desired.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerRuleConfigurations) { + return true + } + if ackcompare.IsNil(desired.ko.Spec.ProfilerConfig) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerConfig) { + return true + } + } + return false +} + // customSetOutputPostUpdate sets the synced condition at the end of the update. 
func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.Delta) { warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") @@ -198,3 +285,75 @@ func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.De } } + +// This function makes the controller requeue if there is an update and +// the training job is still in InProgress +func customSetOutputUpdateWarmpool(r *resource) error { + trainingJobStatus := r.ko.Status.TrainingJobStatus + if ackcompare.IsNotNil(trainingJobStatus) && *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { + return requeueBeforeUpdate + } + return nil +} + +// Check if warm pool has reached a state where it is not updateable +func warmPoolTerminalCheck(latest *resource) bool { + trainingJobStatus := latest.ko.Status.TrainingJobStatus + if ackcompare.IsNotNil(latest.ko.Spec.ResourceConfig) { + if ackcompare.IsNil(latest.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) { + return true // Warm pool can only be updated iff there is a provisioned cluster. + } + } else { + return false + } + + if ackcompare.IsNotNil(trainingJobStatus) { + if *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { + return false + } + if *trainingJobStatus == svcsdk.TrainingJobStatusCompleted { + if ackcompare.IsNotNil(latest.ko.Status.WarmPoolStatus) { + wp_modifying := svccommon.IsModifyingStatus(latest.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) + return !wp_modifying + } else { + return false // Sometimes the API (briefly) does not return the WP status even if it completes. + } + } else { + // Training Job is in 'Failed'|'Stopping'|'Stopped' (Terminal) + return true + } + } + + // ACK OIDC is misconfigured (Terminal) + return true +} + +// Profiler cannot be updated at certain statuses. 
+func customSetOutputUpdateProfiler(r *resource) error { + trainingSecondaryStatus := r.ko.Status.SecondaryStatus + trainingJobStatus := r.ko.Status.TrainingJobStatus + if ackcompare.IsNotNil(trainingSecondaryStatus) && *trainingSecondaryStatus == svcsdk.SecondaryStatusStarting { + return requeueBeforeUpdateStarting + } + if ackcompare.IsNotNil(trainingJobStatus) { + for _, terminalStatus := range TrainingJobTerminalProfiler { + if terminalStatus == *trainingJobStatus { + return errors.New("[ACK_SM] Profiler can only be updated when Training Job is in InProgress state") + } + } + } + return nil +} + +// Checks if the profiler was removed. +func profilerRemovalCheck(desired *resource, latest *resource) bool { + if ackcompare.IsNotNil(desired.ko.Spec) && ackcompare.IsNotNil(latest.ko.Spec) { + if ackcompare.IsNil(desired.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerRuleConfigurations) { + return true + } + if ackcompare.IsNil(desired.ko.Spec.ProfilerConfig) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerConfig) { + return true + } + } + return false +} diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 78eb3196..3ba81b89 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1337,7 +1337,9 @@ func (rm *resourceManager) updateConditions( } // Required to avoid the "declared but not used" error in the default case _ = syncCondition - if terminalCondition != nil || recoverableCondition != nil || syncCondition != nil { + // custom update conditions + customUpdate := rm.CustomUpdateConditions(ko, r, err) + if terminalCondition != nil || recoverableCondition != nil || syncCondition != nil || customUpdate { return &resource{ko}, true // updated } return nil, false // not updated From 0d70417eaa963efc6f68f9bc02a40b2bb7b3b007 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 21 Nov 2022 06:05:05 +0000 Subject: [PATCH 19/71] preliminary changes --- 
.../training_job/manager_test_suite_test.go | 3 + .../testdata/sdkapi/update/update_sucess.json | 3 + .../training_job/testdata/test_suite.yaml | 33 ++++- .../update/desired/invalidparameter.yaml | 49 +++++++ .../testdata/v1alpha1/update/desired/ob1.yaml | 49 +++++++ .../update/desired/removeProfilerConfig.yaml | 46 +++++++ .../update/desired/removeProfilerRule.yaml | 44 +++++++ .../update/desired/removeProfilerboth.yaml | 41 ++++++ .../desired/updateBothProfileParams.yaml | 53 ++++++++ .../update/desired/updateProfileConfig.yaml | 49 +++++++ .../update/desired/updateProfilerRule.yaml | 53 ++++++++ .../update/desired/updateWarmPool.yaml | 50 ++++++++ .../v1alpha1/update/latest/WPDownloading.yaml | 117 +++++++++++++++++ .../v1alpha1/update/latest/WPHappyBase.yaml | 119 +++++++++++++++++ .../v1alpha1/update/latest/WPTerminal.yaml | 119 +++++++++++++++++ .../v1alpha1/update/latest/base1.yaml | 116 +++++++++++++++++ .../v1alpha1/update/latest/base4.yaml | 116 +++++++++++++++++ .../v1alpha1/update/latest/base5.yaml | 116 +++++++++++++++++ .../v1alpha1/update/latest/base6.yaml | 116 +++++++++++++++++ .../v1alpha1/update/latest/noProfiler.yaml | 102 +++++++++++++++ .../testdata/v1alpha1/update/latest/noWP.yaml | 116 +++++++++++++++++ .../update/latest/profilerHappyBase.yaml | 116 +++++++++++++++++ .../update/latest/profilerStarting.yaml | 111 ++++++++++++++++ .../update/latest/profilerTerminal.yaml | 116 +++++++++++++++++ .../v1alpha1/update/observed/base1.yaml | 116 +++++++++++++++++ .../v1alpha1/update/observed/base10.yaml | 116 +++++++++++++++++ .../v1alpha1/update/observed/base11.yaml | 116 +++++++++++++++++ .../v1alpha1/update/observed/base12.yaml | 116 +++++++++++++++++ .../v1alpha1/update/observed/base13.yaml | 116 +++++++++++++++++ .../update/observed/removeProfilerBoth.yaml | 107 ++++++++++++++++ .../update/observed/removeProfilerConfig.yaml | 113 +++++++++++++++++ .../update/observed/removeProfilerRule.yaml | 110 ++++++++++++++++ 
.../update/observed/updateProfilerBoth.yaml | 120 ++++++++++++++++++ .../update/observed/updateProfilerConfig.yaml | 116 +++++++++++++++++ .../update/observed/updateProfilerRule.yaml | 120 ++++++++++++++++++ .../v1alpha1/update/observed/updateWP.yaml | 120 ++++++++++++++++++ 36 files changed, 3238 insertions(+), 1 deletion(-) create mode 100644 pkg/resource/training_job/testdata/sdkapi/update/update_sucess.json create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/invalidparameter.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/ob1.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/removeProfilerConfig.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/removeProfilerRule.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/removeProfilerboth.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/updateBothProfileParams.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfileConfig.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfilerRule.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/updateWarmPool.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/WPDownloading.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/WPHappyBase.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/WPTerminal.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/base1.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/base4.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/base5.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/base6.yaml create mode 100644 
pkg/resource/training_job/testdata/v1alpha1/update/latest/noProfiler.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/noWP.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerHappyBase.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerStarting.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerTerminal.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/base1.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/base10.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/base11.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/base12.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/base13.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml diff --git a/pkg/resource/training_job/manager_test_suite_test.go b/pkg/resource/training_job/manager_test_suite_test.go index 27b86c88..c8c01dd1 100644 --- a/pkg/resource/training_job/manager_test_suite_test.go +++ b/pkg/resource/training_job/manager_test_suite_test.go @@ -94,6 +94,9 @@ func (d *testRunnerDelegate) EmptyServiceAPIOutput(apiName string) (interface{}, 
case "StopTrainingJobWithContext": var output svcsdk.StopTrainingJobOutput return &output, nil + case "UpdateTrainingJobWithContext": + var output svcsdk.UpdateTrainingJobOutput + return &output, nil } return nil, errors.New(fmt.Sprintf("no matching API name found for: %s", apiName)) } diff --git a/pkg/resource/training_job/testdata/sdkapi/update/update_sucess.json b/pkg/resource/training_job/testdata/sdkapi/update/update_sucess.json new file mode 100644 index 00000000..f2290acb --- /dev/null +++ b/pkg/resource/training_job/testdata/sdkapi/update/update_sucess.json @@ -0,0 +1,3 @@ +{ + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test" +} \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index 5e370bf8..dd5d02ff 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -228,4 +228,35 @@ tests: - operation: StopTrainingJobWithContext invoke: Delete expect: - error: nil \ No newline at end of file + error: nil + - name: "Training job Update tests" + description: "Tests for update operation" + scenarios: + - name: "Update=RemoveProfilerBothParams" + desciption: "This test checks if the Controller can remove the profiler properly" + given: + desired_state: "v1alpha1/update/desired/removeProfilerboth.yaml" + latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" + svc_api: + - operation: UpdateTrainingJobWithContext + output_fixture: "sdkapi/update/update_sucess.json" + invoke: Update + expect: + latest_state: "v1alpha1/update/observed/removeProfilerBoth.yaml" + error: nil + # - name: "Update=RemoveProfilerRule" + # - name: "Update=RemoveProfilerConfig" + # - name: "Update=BothProfiler" + # - name: "Update=ProfilerConfig" + # - name: "Update=ProfilerStarting" + # - name: "Update=ProfilerTerminal" + # - name: "Update=ProfilerInvalid" + # - name: "Update=WarmPool" + # - 
name: "Update=WarmPoolInProgress" + # - name: "Update=WarmPoolTerminal" + # - name: "Update=AddProfiler" + # - name: "Update=AddWarmPool" + + + + diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/invalidparameter.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/invalidparameter.yaml new file mode 100644 index 00000000..83b4c7c5 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/invalidparameter.yaml @@ -0,0 +1,49 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None + profilerConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilingIntervalInMilliseconds: 70 + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: ProfilerReport +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + 
region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/ob1.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/ob1.yaml new file mode 100644 index 00000000..59c84d9a --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/ob1.yaml @@ -0,0 +1,49 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None + profilerConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilingIntervalInMilliseconds: 500 + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: ProfilerReport +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeProfilerConfig.yaml 
b/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeProfilerConfig.yaml new file mode 100644 index 00000000..7dfd05ec --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeProfilerConfig.yaml @@ -0,0 +1,46 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: ProfilerReport +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeProfilerRule.yaml new file mode 100644 index 00000000..6937dc9f --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeProfilerRule.yaml @@ -0,0 +1,44 @@ 
+apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None + profilerConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilingIntervalInMilliseconds: 500 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeProfilerboth.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeProfilerboth.yaml new file mode 100644 index 00000000..f443c032 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeProfilerboth.yaml @@ -0,0 +1,41 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + 
trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateBothProfileParams.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateBothProfileParams.yaml new file mode 100644 index 00000000..57f69ac3 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateBothProfileParams.yaml @@ -0,0 +1,53 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train 
+ dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None + profilerConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilingIntervalInMilliseconds: 200 + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: ProfilerReport + - ruleConfigurationName: CPUBottleneck + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: CPUBottleneck +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfileConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfileConfig.yaml new file mode 100644 index 00000000..7fdbafed --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfileConfig.yaml @@ -0,0 +1,49 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + 
stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None + profilerConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilingIntervalInMilliseconds: 200 + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: ProfilerReport +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfilerRule.yaml new file mode 100644 index 00000000..d75ee8e7 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfilerRule.yaml @@ -0,0 +1,53 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + 
s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None + profilerConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilingIntervalInMilliseconds: 500 + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: ProfilerReport + - ruleConfigurationName: CPUBottleneck + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: CPUBottleneck +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateWarmPool.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateWarmPool.yaml new file mode 100644 index 00000000..140d2946 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateWarmPool.yaml @@ -0,0 +1,50 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAlivePeriodInSeconds: 69 + 
stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None + profilerConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilingIntervalInMilliseconds: 500 + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: ProfilerReport +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPDownloading.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPDownloading.yaml new file mode 100644 index 00000000..e1a8d653 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPDownloading.yaml @@ -0,0 +1,117 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: 
.*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAlivePeriodInSeconds: 100 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Disabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPHappyBase.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPHappyBase.yaml new file mode 100644 index 00000000..5a1bebc0 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPHappyBase.yaml @@ -0,0 +1,119 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: 
.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAlivePeriodInSeconds: 100 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: Warm Pool Infrastructure is in Available status. + status: "False" + type: ACK.ResourceSynced + secondaryStatus: Completed + trainingJobStatus: Completed + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: NoIssuesFound + warmPoolStatus: + status: Available \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPTerminal.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPTerminal.yaml new file mode 100644 index 00000000..f231e6b2 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPTerminal.yaml @@ -0,0 +1,119 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: 
false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: 
.*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAlivePeriodInSeconds: 100 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: 
"0001-01-01T00:00:00Z" + message: TrainingJob is in Completed status. + status: "True" + type: ACK.ResourceSynced + secondaryStatus: Completed + trainingJobStatus: Completed + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: NoIssuesFound + warmPoolStatus: + status: Terminated \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/base1.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/base1.yaml new file mode 100644 index 00000000..3178f2b1 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/base1.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + 
regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/base4.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/base4.yaml new file mode 100644 index 00000000..49ff0cde --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/base4.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: 
.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. + status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Disabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/base5.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/base5.yaml new file mode 100644 index 00000000..49ff0cde --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/base5.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: 
.*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 
433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Disabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/base6.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/base6.yaml new file mode 100644 index 00000000..49ff0cde --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/base6.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: 
.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. + status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Disabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/noProfiler.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/noProfiler.yaml new file mode 100644 index 00000000..6af913ab --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/noProfiler.yaml @@ -0,0 +1,102 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: 
.*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 
433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Disabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/noWP.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/noWP.yaml new file mode 100644 index 00000000..3178f2b1 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/noWP.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - 
name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + 
profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. + status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerHappyBase.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerHappyBase.yaml new file mode 100644 index 00000000..3178f2b1 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerHappyBase.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: 
.*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax 
+ silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerStarting.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerStarting.yaml new file mode 100644 index 00000000..7dbf53fb --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerStarting.yaml @@ -0,0 +1,111 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss 
+ regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. + status: "False" + type: ACK.ResourceSynced + secondaryStatus: Starting + trainingJobStatus: InProgress + profilingStatus: Enabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerTerminal.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerTerminal.yaml new file mode 100644 index 00000000..d432ac5f --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerTerminal.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: 
.*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax 
+ silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in Completed status. 
+ status: "True" + type: ACK.ResourceSynced + secondaryStatus: Completed + trainingJobStatus: Completed + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: NoIssuesFound diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base1.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base1.yaml new file mode 100644 index 00000000..3178f2b1 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base1.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: 
.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. + status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base10.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base10.yaml new file mode 100644 index 00000000..3178f2b1 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base10.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: 
.*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 
433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base11.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base11.yaml new file mode 100644 index 00000000..3178f2b1 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base11.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: 
.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. + status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base12.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base12.yaml new file mode 100644 index 00000000..3178f2b1 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base12.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: 
.*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 
433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base13.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base13.yaml new file mode 100644 index 00000000..3178f2b1 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base13.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: 
.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. + status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml new file mode 100644 index 00000000..5909ba1a --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml @@ -0,0 +1,107 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: 
.*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 
433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml new file mode 100644 index 00000000..63869078 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml @@ -0,0 +1,113 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - 
name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. + status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml new file mode 100644 index 00000000..d161c238 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml @@ -0,0 +1,110 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: 
.*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false 
+ enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml new file mode 100644 index 00000000..924fb563 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml @@ -0,0 +1,120 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: 
train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 200 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 + - ruleConfigurationName: CPUBottleneck + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: CPUBottleneck +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml new file mode 100644 index 00000000..f6b7e448 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml @@ -0,0 +1,116 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - 
name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 200 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. + status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml new file mode 100644 index 00000000..449757ba --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml @@ -0,0 +1,120 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: 
.*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 
433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 + - ruleConfigurationName: CPUBottleneck + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: CPUBottleneck +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: 
"" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. + status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml new file mode 100644 index 00000000..172382b6 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml @@ -0,0 +1,120 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: training-test +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse 
+ regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAlivePeriodInSeconds: 69 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + trainingJobName: training-test + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress + warmPoolStatus: + status: Available + From 2f07868370b4bbb28c6f534aacd7902e9b49cc93 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 21 Nov 2022 07:13:37 +0000 Subject: [PATCH 20/71] final test suite --- .../training_job/testdata/test_suite.yaml | 115 +++++++++++++++-- .../update/desired/invalidparameter.yaml | 49 -------- .../testdata/v1alpha1/update/desired/ob1.yaml | 49 -------- .../update/desired/updateWarmPool.yaml | 8 -- .../v1alpha1/update/latest/WPDownloading.yaml | 9 -- .../latest/{WPHappyBase.yaml => WPHappy.yaml} | 9 -- .../v1alpha1/update/latest/WPTerminal.yaml | 9 -- .../v1alpha1/update/latest/base1.yaml | 116 ----------------- .../v1alpha1/update/latest/base4.yaml | 116 ----------------- .../v1alpha1/update/latest/base5.yaml | 116 ----------------- .../v1alpha1/update/latest/base6.yaml | 116 ----------------- .../testdata/v1alpha1/update/latest/noWP.yaml | 116 ----------------- .../latest/{noProfiler.yaml => vanilla.yaml} | 7 +- .../v1alpha1/update/observed/base1.yaml | 116 ----------------- .../v1alpha1/update/observed/base10.yaml | 116 ----------------- .../v1alpha1/update/observed/base11.yaml | 116 ----------------- .../v1alpha1/update/observed/base12.yaml | 116 ----------------- .../v1alpha1/update/observed/base13.yaml | 116 ----------------- .../update/observed/error_on_update.yaml | 61 --------- .../update/observed/removeProfilerBoth.yaml | 102 +++------------ .../update/observed/removeProfilerConfig.yaml | 105 ++++------------ .../update/observed/removeProfilerRule.yaml | 104 ++++----------- 
.../update/observed/updateProfilerBoth.yaml | 111 ++++------------ .../update/observed/updateProfilerConfig.yaml | 105 ++++------------ .../update/observed/updateProfilerRule.yaml | 111 ++++------------ .../v1alpha1/update/observed/updateWP.yaml | 118 ++++-------------- 26 files changed, 262 insertions(+), 1970 deletions(-) delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/invalidparameter.yaml delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/ob1.yaml rename pkg/resource/training_job/testdata/v1alpha1/update/latest/{WPHappyBase.yaml => WPHappy.yaml} (91%) delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/base1.yaml delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/base4.yaml delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/base5.yaml delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/base6.yaml delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/noWP.yaml rename pkg/resource/training_job/testdata/v1alpha1/update/latest/{noProfiler.yaml => vanilla.yaml} (92%) delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/base1.yaml delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/base10.yaml delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/base11.yaml delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/base12.yaml delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/base13.yaml delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/observed/error_on_update.yaml diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index dd5d02ff..c5886330 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -244,18 +244,109 
@@ tests: expect: latest_state: "v1alpha1/update/observed/removeProfilerBoth.yaml" error: nil - # - name: "Update=RemoveProfilerRule" - # - name: "Update=RemoveProfilerConfig" - # - name: "Update=BothProfiler" - # - name: "Update=ProfilerConfig" - # - name: "Update=ProfilerStarting" - # - name: "Update=ProfilerTerminal" - # - name: "Update=ProfilerInvalid" - # - name: "Update=WarmPool" - # - name: "Update=WarmPoolInProgress" - # - name: "Update=WarmPoolTerminal" - # - name: "Update=AddProfiler" - # - name: "Update=AddWarmPool" + - name: "Update=RemoveProfilerRule" + description: "This test checks if the Controller can remove the profiler properly" + given: + desired_state: "v1alpha1/update/desired/removeProfilerRule.yaml" + latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" + svc_api: + - operation: UpdateTrainingJobWithContext + output_fixture: "sdkapi/update/update_sucess.json" + invoke: Update + expect: + latest_state: "v1alpha1/update/observed/removeProfilerRule.yaml" + error: nil + - name: "Update=RemoveProfilerConfig" + description: "This test checks if the Controller can remove the profiler properly" + given: + desired_state: "v1alpha1/update/desired/removeProfilerConfig.yaml" + latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" + svc_api: + - operation: UpdateTrainingJobWithContext + output_fixture: "sdkapi/update/update_sucess.json" + invoke: Update + expect: + latest_state: "v1alpha1/update/observed/removeProfilerConfig.yaml" + error: nil + - name: "Update=BothProfiler" + given: + desired_state: "v1alpha1/update/desired/updateBothProfileParams.yaml" + latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" + svc_api: + - operation: UpdateTrainingJobWithContext + output_fixture: "sdkapi/update/update_sucess.json" + invoke: Update + expect: + latest_state: "v1alpha1/update/observed/updateProfilerBoth.yaml" + error: nil + - name: "Update=ProfilerConfig" + given: + desired_state: "v1alpha1/update/desired/updateProfileConfig.yaml" +
latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" + svc_api: + - operation: UpdateTrainingJobWithContext + output_fixture: "sdkapi/update/update_sucess.json" + invoke: Update + expect: + latest_state: "v1alpha1/update/observed/updateProfilerConfig.yaml" + error: nil + - name: "Update=ProfilerStarting" + given: + desired_state: "v1alpha1/update/desired/updateProfileConfig.yaml" + latest_state: "v1alpha1/update/latest/profilerStarting.yaml" + invoke: Update + expect: + error: Controller cannot update while secondary status is in Starting state. + - name: "Update=ProfilerTerminal" + given: + desired_state: "v1alpha1/update/desired/updateProfileConfig.yaml" + latest_state: "v1alpha1/update/latest/profilerTerminal.yaml" + invoke: Update + expect: + error: "resource is in terminal condition" + - name: "Update=WarmPool" + given: + desired_state: "v1alpha1/update/desired/updateWarmPool.yaml" + latest_state: "v1alpha1/update/latest/WPHappy.yaml" + svc_api: + - operation: UpdateTrainingJobWithContext + output_fixture: "sdkapi/update/update_sucess.json" + invoke: Update + expect: + latest_state: "v1alpha1/update/observed/updateWP.yaml" + - name: "Update=WarmPoolInProgress" + given: + desired_state: "v1alpha1/update/desired/updateWarmPool.yaml" + latest_state: "v1alpha1/update/latest/WPDownloading.yaml" + invoke: Update + expect: + error: Warm pool cannot be updated in InProgress state requeuing until TrainingJob reaches completed state. 
+ - name: "Update=WarmPoolTerminal" + given: + desired_state: "v1alpha1/update/desired/updateWarmPool.yaml" + latest_state: "v1alpha1/update/latest/WPTerminal.yaml" + invoke: Update + expect: + error: "resource is in terminal condition" + - name: "Update=AddProfiler" + given: + desired_state: "v1alpha1/update/desired/updateProfileConfig.yaml" + latest_state: "v1alpha1/update/latest/vanilla.yaml" + svc_api: + - operation: UpdateTrainingJobWithContext + output_fixture: "sdkapi/update/update_sucess.json" + invoke: Update + expect: + latest_state: "v1alpha1/update/observed/updateProfilerConfig.yaml" + error: nil + - name: "Update=AddWarmPool" + given: + desired_state: "v1alpha1/update/desired/updateWarmPool.yaml" + latest_state: "v1alpha1/update/latest/vanilla.yaml" + invoke: Update + expect: + error: "resource is in terminal condition" + diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/invalidparameter.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/invalidparameter.yaml deleted file mode 100644 index 83b4c7c5..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/desired/invalidparameter.yaml +++ /dev/null @@ -1,49 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - name: training-test -spec: - trainingJobName: training-test - hyperParameters: - max_depth: "5" - gamma: "4" - eta: "0.2" - min_child_weight: "6" - objective: "reg:squarederror" - subsample: "0.7" - num_round: "51" - algorithmSpecification: - trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 - trainingInputMode: File - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker - outputDataConfig: - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - stoppingCondition: - maxRuntimeInSeconds: 86400 - inputDataConfig: - - channelName: train - dataSource: - 
s3DataSource: - s3DataType: S3Prefix - s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ - s3DataDistributionType: FullyReplicated - contentType: text/libsvm - compressionType: None - profilerConfig: - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilingIntervalInMilliseconds: 70 - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest - ruleParameters: - rule_to_invoke: ProfilerReport -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test - ownerAccountID: "" - region: "" - conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/ob1.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/ob1.yaml deleted file mode 100644 index 59c84d9a..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/desired/ob1.yaml +++ /dev/null @@ -1,49 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - name: training-test -spec: - trainingJobName: training-test - hyperParameters: - max_depth: "5" - gamma: "4" - eta: "0.2" - min_child_weight: "6" - objective: "reg:squarederror" - subsample: "0.7" - num_round: "51" - algorithmSpecification: - trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 - trainingInputMode: File - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker - outputDataConfig: - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - stoppingCondition: - maxRuntimeInSeconds: 86400 - inputDataConfig: - - channelName: train - dataSource: - s3DataSource: - s3DataType: S3Prefix - s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ - s3DataDistributionType: FullyReplicated - 
contentType: text/libsvm - compressionType: None - profilerConfig: - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilingIntervalInMilliseconds: 500 - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest - ruleParameters: - rule_to_invoke: ProfilerReport -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test - ownerAccountID: "" - region: "" - conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateWarmPool.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateWarmPool.yaml index 140d2946..d1874da9 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateWarmPool.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateWarmPool.yaml @@ -34,14 +34,6 @@ spec: s3DataDistributionType: FullyReplicated contentType: text/libsvm compressionType: None - profilerConfig: - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilingIntervalInMilliseconds: 500 - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest - ruleParameters: - rule_to_invoke: ProfilerReport status: ackResourceMetadata: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPDownloading.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPDownloading.yaml index e1a8d653..5e4dd1a4 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPDownloading.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPDownloading.yaml @@ -88,15 +88,6 @@ spec: stoppingCondition: maxRuntimeInSeconds: 86400 trainingJobName: 
training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 status: ackResourceMetadata: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPHappyBase.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPHappy.yaml similarity index 91% rename from pkg/resource/training_job/testdata/v1alpha1/update/latest/WPHappyBase.yaml rename to pkg/resource/training_job/testdata/v1alpha1/update/latest/WPHappy.yaml index 5a1bebc0..f5272fee 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPHappyBase.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPHappy.yaml @@ -88,15 +88,6 @@ spec: stoppingCondition: maxRuntimeInSeconds: 86400 trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 status: ackResourceMetadata: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPTerminal.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPTerminal.yaml index f231e6b2..f6270ea5 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPTerminal.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/WPTerminal.yaml @@ -88,15 
+88,6 @@ spec: stoppingCondition: maxRuntimeInSeconds: 86400 trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 status: ackResourceMetadata: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/base1.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/base1.yaml deleted file mode 100644 index 3178f2b1..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/latest/base1.yaml +++ /dev/null @@ -1,116 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - name: training-test -spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: 
.*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - 
s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None - outputDataConfig: - kmsKeyID: "" - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker - stoppingCondition: - maxRuntimeInSeconds: 86400 - trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test - ownerAccountID: "" - region: "" - conditions: - - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. 
- status: "False" - type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/base4.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/base4.yaml deleted file mode 100644 index 49ff0cde..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/latest/base4.yaml +++ /dev/null @@ -1,116 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - name: training-test -spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: 
.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None - outputDataConfig: - kmsKeyID: "" - s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker - stoppingCondition: - maxRuntimeInSeconds: 86400 - trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test - ownerAccountID: "" - region: "" - conditions: - - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. - status: "False" - type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Disabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/base5.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/base5.yaml deleted file mode 100644 index 49ff0cde..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/latest/base5.yaml +++ /dev/null @@ -1,116 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - name: training-test -spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: 
.*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 
433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None - outputDataConfig: - kmsKeyID: "" - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker - stoppingCondition: - maxRuntimeInSeconds: 86400 - trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test - ownerAccountID: "" - region: "" - conditions: - - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. 
- status: "False" - type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Disabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/base6.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/base6.yaml deleted file mode 100644 index 49ff0cde..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/latest/base6.yaml +++ /dev/null @@ -1,116 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - name: training-test -spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: 
.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None - outputDataConfig: - kmsKeyID: "" - s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker - stoppingCondition: - maxRuntimeInSeconds: 86400 - trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test - ownerAccountID: "" - region: "" - conditions: - - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. - status: "False" - type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Disabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/noWP.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/noWP.yaml deleted file mode 100644 index 3178f2b1..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/latest/noWP.yaml +++ /dev/null @@ -1,116 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - name: training-test -spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: 
.*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 
433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None - outputDataConfig: - kmsKeyID: "" - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker - stoppingCondition: - maxRuntimeInSeconds: 86400 - trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test - ownerAccountID: "" - region: "" - conditions: - - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. 
- status: "False" - type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/noProfiler.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/vanilla.yaml similarity index 92% rename from pkg/resource/training_job/testdata/v1alpha1/update/latest/noProfiler.yaml rename to pkg/resource/training_job/testdata/v1alpha1/update/latest/vanilla.yaml index 6af913ab..5909ba1a 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/latest/noProfiler.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/vanilla.yaml @@ -99,4 +99,9 @@ status: type: ACK.ResourceSynced secondaryStatus: Downloading trainingJobStatus: InProgress - profilingStatus: Disabled + profilingStatus: Enabled + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 + ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base1.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base1.yaml deleted file mode 100644 index 3178f2b1..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base1.yaml +++ /dev/null @@ -1,116 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - name: training-test -spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: 
.*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 
433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None - outputDataConfig: - kmsKeyID: "" - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker - stoppingCondition: - maxRuntimeInSeconds: 86400 - trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test - ownerAccountID: "" - region: "" - conditions: - - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. 
- status: "False" - type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base10.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base10.yaml deleted file mode 100644 index 3178f2b1..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base10.yaml +++ /dev/null @@ -1,116 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - name: training-test -spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: 
.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None - outputDataConfig: - kmsKeyID: "" - s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker - stoppingCondition: - maxRuntimeInSeconds: 86400 - trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test - ownerAccountID: "" - region: "" - conditions: - - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. - status: "False" - type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base11.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base11.yaml deleted file mode 100644 index 3178f2b1..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base11.yaml +++ /dev/null @@ -1,116 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - name: training-test -spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: 
.*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 
433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None - outputDataConfig: - kmsKeyID: "" - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker - stoppingCondition: - maxRuntimeInSeconds: 86400 - trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test - ownerAccountID: "" - region: "" - conditions: - - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. 
- status: "False" - type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base12.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base12.yaml deleted file mode 100644 index 3178f2b1..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base12.yaml +++ /dev/null @@ -1,116 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - name: training-test -spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: 
.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None - outputDataConfig: - kmsKeyID: "" - s3OutputPath: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker - stoppingCondition: - maxRuntimeInSeconds: 86400 - trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test - ownerAccountID: "" - region: "" - conditions: - - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. - status: "False" - type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base13.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/base13.yaml deleted file mode 100644 index 3178f2b1..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/base13.yaml +++ /dev/null @@ -1,116 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - name: training-test -spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: 
.*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 
433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None - outputDataConfig: - kmsKeyID: "" - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker - stoppingCondition: - maxRuntimeInSeconds: 86400 - trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test - ownerAccountID: "" - region: "" - conditions: - - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. 
- status: "False" - type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/error_on_update.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/error_on_update.yaml deleted file mode 100644 index 5eac090a..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/error_on_update.yaml +++ /dev/null @@ -1,61 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - name: unit-testing-training-job -spec: - algorithmSpecification: - trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com - trainingInputMode: File - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - outputDataConfig: - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/ack-sagemaker-execution-role - stoppingCondition: - 
maxRuntimeInSeconds: 86400 - tags: - - key: algorithm - value: xgboost - - key: environment - value: testing - - key: customer - value: test-user - trainingJobName: xgboost-training-job -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job - ownerAccountID: "" - region: "" - conditions: - - message: not implemented - status: "True" - type: ACK.Recoverable diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml index 5909ba1a..d4a26189 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml @@ -1,92 +1,38 @@ apiVersion: sagemaker.services.k8s.aws/v1alpha1 kind: TrainingJob metadata: - creationTimestamp: null name: training-test spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: 
.*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" + trainingJobName: training-test + hyperParameters: max_depth: "5" + gamma: "4" + eta: "0.2" min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker outputDataConfig: - kmsKeyID: "" s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output resourceConfig: instanceCount: 1 instanceType: ml.m4.xlarge volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker stoppingCondition: maxRuntimeInSeconds: 86400 - trainingJobName: training-test + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None status: ackResourceMetadata: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test @@ -97,11 +43,3 @@ status: message: TrainingJob is in InProgress status. 
status: "False" type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml index 63869078..190c8fb3 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml @@ -1,98 +1,43 @@ apiVersion: sagemaker.services.k8s.aws/v1alpha1 kind: TrainingJob metadata: - creationTimestamp: null name: training-test spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: 
.*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" + trainingJobName: training-test + hyperParameters: max_depth: "5" + gamma: "4" + eta: "0.2" min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker outputDataConfig: - kmsKeyID: "" s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output resourceConfig: instanceCount: 1 instanceType: ml.m4.xlarge volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker stoppingCondition: maxRuntimeInSeconds: 86400 - trainingJobName: training-test + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None profilerRuleConfigurations: - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest ruleParameters: rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 status: ackResourceMetadata: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test @@ -103,11 +48,3 @@ status: message: TrainingJob is in InProgress status. 
status: "False" type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml index d161c238..e532e87a 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml @@ -1,92 +1,38 @@ apiVersion: sagemaker.services.k8s.aws/v1alpha1 kind: TrainingJob metadata: - creationTimestamp: null name: training-test spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: 
.*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" + trainingJobName: training-test + hyperParameters: max_depth: "5" + gamma: "4" + eta: "0.2" min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker outputDataConfig: - kmsKeyID: "" s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output resourceConfig: instanceCount: 1 instanceType: ml.m4.xlarge volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker stoppingCondition: maxRuntimeInSeconds: 86400 - trainingJobName: training-test + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None profilerConfig: profilingIntervalInMilliseconds: 500 s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ @@ -99,12 +45,4 @@ status: - lastTransitionTime: "0001-01-01T00:00:00Z" message: TrainingJob is in InProgress status. 
status: "False" - type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress + type: ACK.ResourceSynced \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml index 924fb563..0283c287 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml @@ -1,105 +1,50 @@ apiVersion: sagemaker.services.k8s.aws/v1alpha1 kind: TrainingJob metadata: - creationTimestamp: null name: training-test spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - 
name: train:rmse - regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" + trainingJobName: training-test + hyperParameters: max_depth: "5" + gamma: "4" + eta: "0.2" min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: 
S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker outputDataConfig: - kmsKeyID: "" s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output resourceConfig: instanceCount: 1 instanceType: ml.m4.xlarge volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker stoppingCondition: maxRuntimeInSeconds: 86400 - trainingJobName: training-test + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None profilerConfig: profilingIntervalInMilliseconds: 200 s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ profilerRuleConfigurations: - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest ruleParameters: rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 - ruleConfigurationName: CPUBottleneck - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest - ruleParameters: - rule_to_invoke: CPUBottleneck + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: CPUBottleneck status: ackResourceMetadata: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test @@ -110,11 +55,3 @@ status: message: TrainingJob is in InProgress status. 
status: "False" type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml index f6b7e448..3f400c0a 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml @@ -1,101 +1,46 @@ apiVersion: sagemaker.services.k8s.aws/v1alpha1 kind: TrainingJob metadata: - creationTimestamp: null name: training-test spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: 
.*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" + trainingJobName: training-test + hyperParameters: max_depth: "5" + gamma: "4" + eta: "0.2" min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker outputDataConfig: - kmsKeyID: "" s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output resourceConfig: instanceCount: 1 instanceType: ml.m4.xlarge volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker stoppingCondition: maxRuntimeInSeconds: 86400 - trainingJobName: training-test + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None profilerConfig: profilingIntervalInMilliseconds: 200 s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ profilerRuleConfigurations: - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest ruleParameters: rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 status: ackResourceMetadata: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test @@ -106,11 +51,3 @@ status: message: TrainingJob is in InProgress status. 
status: "False" type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml index 449757ba..324c1d05 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml @@ -1,105 +1,50 @@ apiVersion: sagemaker.services.k8s.aws/v1alpha1 kind: TrainingJob metadata: - creationTimestamp: null name: training-test spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: 
.*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" + trainingJobName: training-test + hyperParameters: max_depth: "5" + gamma: "4" + eta: "0.2" min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker outputDataConfig: - kmsKeyID: "" s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output resourceConfig: instanceCount: 1 instanceType: ml.m4.xlarge volumeSizeInGB: 5 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker stoppingCondition: maxRuntimeInSeconds: 86400 - trainingJobName: training-test + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None profilerConfig: profilingIntervalInMilliseconds: 500 s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ profilerRuleConfigurations: - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest ruleParameters: rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 - ruleConfigurationName: CPUBottleneck - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest - ruleParameters: - rule_to_invoke: CPUBottleneck + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: CPUBottleneck status: ackResourceMetadata: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test @@ -110,11 +55,3 @@ status: message: TrainingJob is in InProgress status. 
status: "False" type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml index 172382b6..c3ab737c 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml @@ -1,102 +1,39 @@ apiVersion: sagemaker.services.k8s.aws/v1alpha1 kind: TrainingJob metadata: - creationTimestamp: null name: training-test spec: - algorithmSpecification: - enableSageMakerMetricsTimeSeries: false - metricDefinitions: - - name: train:mae - regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:auc - regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:merror - regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:auc - regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mae - regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:error - regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:merror - regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:logloss - regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:rmse - regex: 
.*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:logloss - regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:mlogloss - regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:rmse - regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:ndcg - regex: .*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:error - regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:mlogloss - regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:ndcg - regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: train:map - regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - - name: validation:map - regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* - trainingImage: 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 - trainingInputMode: File - enableInterContainerTrafficEncryption: false - enableManagedSpotTraining: false - enableNetworkIsolation: false - hyperParameters: - eta: "0.2" - gamma: "4" + trainingJobName: training-test + hyperParameters: max_depth: "5" + gamma: "4" + eta: "0.2" min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - recordWrapperType: None - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: 
s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - recordWrapperType: None + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker outputDataConfig: - kmsKeyID: "" s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output resourceConfig: instanceCount: 1 instanceType: ml.m4.xlarge volumeSizeInGB: 5 keepAlivePeriodInSeconds: 69 - roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker stoppingCondition: maxRuntimeInSeconds: 86400 - trainingJobName: training-test - profilerConfig: - profilingIntervalInMilliseconds: 500 - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ - profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 - ruleParameters: - rule_to_invoke: ProfilerReport - volumeSizeInGB: 0 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None status: ackResourceMetadata: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test @@ -104,17 +41,6 @@ status: region: "" conditions: - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. + message: Warm Pool Infrastructure is in Available status. 
status: "False" - type: ACK.ResourceSynced - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationJobARN: arn:aws:sagemaker:us-west-2:123456789012:processing-job/xgboost-training-debugger--profilerreport-f81883f8 - ruleEvaluationStatus: InProgress - warmPoolStatus: - status: Available - + type: ACK.ResourceSynced \ No newline at end of file From a8a5eaba48c5c8cf62f625bbc6a603f92c877c08 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 21 Nov 2022 07:16:24 +0000 Subject: [PATCH 21/71] fix: corrected grammar mistake --- pkg/resource/training_job/testdata/test_suite.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index c5886330..e272fe96 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -320,7 +320,7 @@ tests: latest_state: "v1alpha1/update/latest/WPDownloading.yaml" invoke: Update expect: - error: Warm pool cannot be updated in InProgress state requeuing until TrainingJob reaches completed state. + error: Warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state. - name: "Update=WarmPoolTerminal" given: desired_state: "v1alpha1/update/desired/updateWarmPool.yaml" From 585fbe3e760a43a66b6c4ff99fc6ddab5524c524 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 21 Nov 2022 17:43:26 +0000 Subject: [PATCH 22/71] test: added another test and enhanced descriptions. 
--- .../training_job/testdata/test_suite.yaml | 17 ++++++ .../update/desired/updateProfilerWP.yaml | 54 +++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfilerWP.yaml diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index e272fe96..68a87d1b 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -269,6 +269,7 @@ tests: latest_state: "v1alpha1/update/observed/removeProfilerConfig.yaml" error: nil - name: "Update=BothProfiler" + description: "Update Both Profiler parameters" given: desired_state: "v1alpha1/update/desired/updateBothProfileParams.yaml" latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" @@ -280,6 +281,7 @@ tests: latest_state: "v1alpha1/update/observed/updateProfilerBoth.yaml" error: nil - name: "Update=ProfilerConfig" + description: "Update just the profiler config" given: desired_state: "v1alpha1/update/desired/updateProfileConfig.yaml" latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" @@ -291,6 +293,7 @@ tests: latest_state: "v1alpha1/update/observed/updateProfilerConfig.yaml" error: nil - name: "Update=ProfilerStarting" + description: "Tests if profiler returns a requeue error when TJ is updated in Starting." given: desired_state: "v1alpha1/update/desired/updateProfileConfig.yaml" latest_state: "v1alpha1/update/latest/profilerStarting.yaml" @@ -298,6 +301,7 @@ tests: expect: error: Controller cannot update while secondary status is in Starting state. - name: "Update=ProfilerTerminal" + description: "Tests if profiler returns a terminal error when TJ is updated in Completed." 
given: desired_state: "v1alpha1/update/desired/updateProfileConfig.yaml" latest_state: "v1alpha1/update/latest/profilerTerminal.yaml" @@ -305,6 +309,7 @@ tests: expect: error: "resource is in terminal condition" - name: "Update=WarmPool" + description: "Update a warm pool" given: desired_state: "v1alpha1/update/desired/updateWarmPool.yaml" latest_state: "v1alpha1/update/latest/WPHappy.yaml" @@ -315,6 +320,7 @@ tests: expect: latest_state: "v1alpha1/update/observed/updateWP.yaml" - name: "Update=WarmPoolInProgress" + description: "Return a requeue error if trainingjob is in InProgress state when a warm pool is being updated." given: desired_state: "v1alpha1/update/desired/updateWarmPool.yaml" latest_state: "v1alpha1/update/latest/WPDownloading.yaml" @@ -322,6 +328,7 @@ tests: expect: error: Warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state. - name: "Update=WarmPoolTerminal" + description: "Check if controller behaves correctly when WarmPool cannot be updated." given: desired_state: "v1alpha1/update/desired/updateWarmPool.yaml" latest_state: "v1alpha1/update/latest/WPTerminal.yaml" @@ -329,6 +336,7 @@ tests: expect: error: "resource is in terminal condition" - name: "Update=AddProfiler" + description: "Add a Profiler when a previous one does not exist." given: desired_state: "v1alpha1/update/desired/updateProfileConfig.yaml" latest_state: "v1alpha1/update/latest/vanilla.yaml" @@ -340,12 +348,21 @@ tests: latest_state: "v1alpha1/update/observed/updateProfilerConfig.yaml" error: nil - name: "Update=AddWarmPool" + description: "Attempt to add Warm Pool when a previous one does not exist." given: desired_state: "v1alpha1/update/desired/updateWarmPool.yaml" latest_state: "v1alpha1/update/latest/vanilla.yaml" invoke: Update expect: error: "resource is in terminal condition" + - name: "Update=WarmPoolAndProfiler" + description: "Attempt to add Warm Pool and Profiler at the same time." 
+ given: + desired_state: "v1alpha1/update/desired/updateProfilerWP.yaml" + latest_state: "v1alpha1/update/latest/WPDownloading.yaml" + invoke: Update + expect: + error: "resource is in terminal condition" diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfilerWP.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfilerWP.yaml new file mode 100644 index 00000000..da1e81a4 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfilerWP.yaml @@ -0,0 +1,54 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAlivePeriodInSeconds: 69 + stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None + profilerConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilingIntervalInMilliseconds: 200 + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: ProfilerReport + - ruleConfigurationName: 
CPUBottleneck + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: CPUBottleneck +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: [] From ac99419df5503b6d05a888463cd262e3c2637c18 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 22 Nov 2022 19:09:10 +0000 Subject: [PATCH 23/71] updated unit test --- pkg/resource/training_job/testdata/test_suite.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index 68a87d1b..c2260ce4 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -299,7 +299,7 @@ tests: latest_state: "v1alpha1/update/latest/profilerStarting.yaml" invoke: Update expect: - error: Controller cannot update while secondary status is in Starting state. + error: controller cannot update while secondary status is in Starting state. - name: "Update=ProfilerTerminal" description: "Tests if profiler returns a terminal error when TJ is updated in Completed." given: @@ -326,7 +326,7 @@ tests: latest_state: "v1alpha1/update/latest/WPDownloading.yaml" invoke: Update expect: - error: Warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state. + error: warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state. - name: "Update=WarmPoolTerminal" description: "Check if controller behaves correctly when WarmPool cannot be updated."
given: From 84a11eef4454f88228e59d29affa4d67ef629160 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 28 Nov 2022 22:58:47 +0000 Subject: [PATCH 24/71] [test] corrected rebase errors --- apis/v1alpha1/ack-generate-metadata.yaml | 2 +- apis/v1alpha1/generator.yaml | 1 - pkg/resource/training_job/hooks.go | 159 ----------------------- pkg/resource/training_job/sdk.go | 4 +- 4 files changed, 2 insertions(+), 164 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index b95cdb0a..b8b1cb4b 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-11-22T18:47:47Z" + build_date: "2022-11-28T22:55:21Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index 485ffd82..2b099c73 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -123,7 +123,6 @@ resources: - InvalidParameterCombination - InvalidParameterValue - MissingParameter - update_conditions_custom_method_name: CustomUpdateConditions hooks: delta_pre_compare: code: customSetDefaults(a, b) diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 6929ab50..0df5b553 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -186,93 +186,6 @@ func profilerRemovalCheck(desired *resource, latest *resource) bool { return false } -// customSetOutputPostUpdate sets the synced condition at the end of the update. 
-func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.Delta) { - warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") - profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") - if profiler_diff { - svccommon.SetSyncedCondition(&resource{ko}, aws.String("InProgress"), &resourceName, &trainingJobModifyingStatuses) - } - if warmpool_diff { - svccommon.SetSyncedCondition(&resource{ko}, aws.String("Available"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) - } - } - -} - -// customSetOutputUpdateWarmpool makes the controller requeue if there is an update and -// the training job is still in InProgress -func customSetOutputUpdateWarmpool(r *resource) error { - trainingJobStatus := r.ko.Status.TrainingJobStatus - if ackcompare.IsNotNil(trainingJobStatus) && *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { - return requeueBeforeUpdate - } - return nil -} - -// warmPoolTerminalCheck checks if warm pool has reached a state where it is not updateable -func warmPoolTerminalCheck(latest *resource) bool { - trainingJobStatus := latest.ko.Status.TrainingJobStatus - if ackcompare.IsNotNil(latest.ko.Spec.ResourceConfig) { - if ackcompare.IsNil(latest.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) { - return true // Warm pool can only be updated iff there is a provisioned cluster. - } - } else { - return false - } - - if ackcompare.IsNotNil(trainingJobStatus) { - if *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { - return false - } - if *trainingJobStatus == svcsdk.TrainingJobStatusCompleted { - if ackcompare.IsNotNil(latest.ko.Status.WarmPoolStatus) { - wp_modifying := svccommon.IsModifyingStatus(latest.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) - return !wp_modifying - } else { - return false // Sometimes the API (briefly) does not return the WP status even if it completes. 
- } - } else { - // Training Job is in 'Failed'|'Stopping'|'Stopped' (Terminal) - return true - } - } - - // ACK OIDC is misconfigured (Terminal) - return true -} - -// customSetOutputUpdateProfiler decides whether the training job is ready/eligible for update -// depending on the status. -func customSetOutputUpdateProfiler(r *resource) error { - trainingSecondaryStatus := r.ko.Status.SecondaryStatus - trainingJobStatus := r.ko.Status.TrainingJobStatus - if ackcompare.IsNotNil(trainingSecondaryStatus) && *trainingSecondaryStatus == svcsdk.SecondaryStatusStarting { - return requeueBeforeUpdateStarting - } - if ackcompare.IsNotNil(trainingJobStatus) { - for _, terminalStatus := range TrainingJobTerminalProfiler { - if terminalStatus == *trainingJobStatus { - return ackerr.NewTerminalError(errors.New("profiler can only be updated when Training Job is in InProgress state")) - } - } - } - return nil -} - -// profilerRemovalCheck checks if the profiler was removed. -func profilerRemovalCheck(desired *resource, latest *resource) bool { - if ackcompare.IsNotNil(desired.ko.Spec) && ackcompare.IsNotNil(latest.ko.Spec) { - if ackcompare.IsNil(desired.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerRuleConfigurations) { - return true - } - if ackcompare.IsNil(desired.ko.Spec.ProfilerConfig) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerConfig) { - return true - } - } - return false -} - // customSetOutputPostUpdate sets the synced condition at the end of the update. 
func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.Delta) { warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") @@ -285,75 +198,3 @@ func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.De } } - -// This function makes the controller requeue if there is an update and -// the training job is still in InProgress -func customSetOutputUpdateWarmpool(r *resource) error { - trainingJobStatus := r.ko.Status.TrainingJobStatus - if ackcompare.IsNotNil(trainingJobStatus) && *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { - return requeueBeforeUpdate - } - return nil -} - -// Check if warm pool has reached a state where it is not updateable -func warmPoolTerminalCheck(latest *resource) bool { - trainingJobStatus := latest.ko.Status.TrainingJobStatus - if ackcompare.IsNotNil(latest.ko.Spec.ResourceConfig) { - if ackcompare.IsNil(latest.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) { - return true // Warm pool can only be updated iff there is a provisioned cluster. - } - } else { - return false - } - - if ackcompare.IsNotNil(trainingJobStatus) { - if *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { - return false - } - if *trainingJobStatus == svcsdk.TrainingJobStatusCompleted { - if ackcompare.IsNotNil(latest.ko.Status.WarmPoolStatus) { - wp_modifying := svccommon.IsModifyingStatus(latest.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) - return !wp_modifying - } else { - return false // Sometimes the API (briefly) does not return the WP status even if it completes. - } - } else { - // Training Job is in 'Failed'|'Stopping'|'Stopped' (Terminal) - return true - } - } - - // ACK OIDC is misconfigured (Terminal) - return true -} - -// Profiler cannot be updated at certain statuses. 
-func customSetOutputUpdateProfiler(r *resource) error { - trainingSecondaryStatus := r.ko.Status.SecondaryStatus - trainingJobStatus := r.ko.Status.TrainingJobStatus - if ackcompare.IsNotNil(trainingSecondaryStatus) && *trainingSecondaryStatus == svcsdk.SecondaryStatusStarting { - return requeueBeforeUpdateStarting - } - if ackcompare.IsNotNil(trainingJobStatus) { - for _, terminalStatus := range TrainingJobTerminalProfiler { - if terminalStatus == *trainingJobStatus { - return errors.New("[ACK_SM] Profiler can only be updated when Training Job is in InProgress state") - } - } - } - return nil -} - -// Checks if the profiler was removed. -func profilerRemovalCheck(desired *resource, latest *resource) bool { - if ackcompare.IsNotNil(desired.ko.Spec) && ackcompare.IsNotNil(latest.ko.Spec) { - if ackcompare.IsNil(desired.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerRuleConfigurations) { - return true - } - if ackcompare.IsNil(desired.ko.Spec.ProfilerConfig) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerConfig) { - return true - } - } - return false -} diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 3ba81b89..78eb3196 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1337,9 +1337,7 @@ func (rm *resourceManager) updateConditions( } // Required to avoid the "declared but not used" error in the default case _ = syncCondition - // custom update conditions - customUpdate := rm.CustomUpdateConditions(ko, r, err) - if terminalCondition != nil || recoverableCondition != nil || syncCondition != nil || customUpdate { + if terminalCondition != nil || recoverableCondition != nil || syncCondition != nil { return &resource{ko}, true // updated } return nil, false // not updated From 3e1f116b4371a8a49af8216b172d80c49cbf7311 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 28 Nov 2022 23:04:11 +0000 Subject: [PATCH 25/71] . 
--- .../training_job/custom_update_conditions.go | 53 ------------------- 1 file changed, 53 deletions(-) delete mode 100644 pkg/resource/training_job/custom_update_conditions.go diff --git a/pkg/resource/training_job/custom_update_conditions.go b/pkg/resource/training_job/custom_update_conditions.go deleted file mode 100644 index c94cfcf0..00000000 --- a/pkg/resource/training_job/custom_update_conditions.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"). You may -// not use this file except in compliance with the License. A copy of the -// License is located at -// -// http://aws.amazon.com/apache2.0/ -// -// or in the "license" file accompanying this file. This file is distributed -// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -// express or implied. See the License for the specific language governing -// permissions and limitations under the License. - -// Use this file if conditions need to be updated based on the latest status -// of training job which is not evident from API response - -package training_job - -import ( - "strings" - - ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" - ackcondition "github.com/aws-controllers-k8s/runtime/pkg/condition" - svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" - corev1 "k8s.io/api/core/v1" -) - -var ( - terminalCode string = "[ACK_SM]" -) - -// If the controller runs into an error that contains "[ACK_SM]" -// it will set the resource to a terminal state because it is an unrecoverable error. 
-func (rm *resourceManager) CustomUpdateConditions( - ko *svcapitypes.TrainingJob, - r *resource, - err error, -) bool { - - if ackcompare.IsNil(err) { - return false - } - - if strings.Contains(err.Error(), terminalCode) { - conditionManager := &resource{ko} - exception := err.Error() - ackcondition.SetTerminal(conditionManager, corev1.ConditionTrue, &exception, nil) - return true - } - - return false - -} From cc44a3b78e12cbc17fc3f5f27f25c72fb7c2b64b Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 28 Nov 2022 23:33:33 +0000 Subject: [PATCH 26/71] fix/test: added nil check and new unit test --- pkg/resource/training_job/hooks.go | 2 +- .../describe/warmpool_describe_no_status.json | 158 ++++++++++++++++++ .../training_job/testdata/test_suite.yaml | 11 ++ .../readone/observed/warmpool_nostatus.yaml | 109 ++++++++++++ 4 files changed, 279 insertions(+), 1 deletion(-) create mode 100644 pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_no_status.json create mode 100644 pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 9fbff6bb..54d0411f 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -83,7 +83,7 @@ func (rm *resourceManager) customSetOutput(r *resource) { svccommon.SetSyncedCondition(r, aws.String("Available"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) } - if svccommon.IsModifyingStatus(r.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) { + if ackcompare.IsNotNil(r.ko.Status.WarmPoolStatus) && svccommon.IsModifyingStatus(r.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) { svccommon.SetSyncedCondition(r, r.ko.Status.WarmPoolStatus.Status, aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) } } diff --git a/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_no_status.json 
b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_no_status.json new file mode 100644 index 00000000..3c78b803 --- /dev/null +++ b/pkg/resource/training_job/testdata/sdkapi/describe/warmpool_describe_no_status.json @@ -0,0 +1,158 @@ +{ + "AlgorithmSpecification": { + "EnableSageMakerMetricsTimeSeries": false, + "MetricDefinitions": [ + { + "Name": "train:mae", + "Regex": ".*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:auc", + "Regex": ".*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:merror", + "Regex": ".*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:auc", + "Regex": ".*\\[[0-9]+\\].*#011train-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mae", + "Regex": ".*\\[[0-9]+\\].*#011validation-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:error", + "Regex": ".*\\[[0-9]+\\].*#011validation-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:merror", + "Regex": ".*\\[[0-9]+\\].*#011validation-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:logloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:rmse", + "Regex": ".*\\[[0-9]+\\].*#011train-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:logloss", + "Regex": ".*\\[[0-9]+\\].*#011train-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011train-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:rmse", + "Regex": ".*\\[[0-9]+\\].*#011validation-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:ndcg", + "Regex": 
".*\\[[0-9]+\\].*#011validation-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:error", + "Regex": ".*\\[[0-9]+\\].*#011train-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011train-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:map", + "Regex": ".*\\[[0-9]+\\].*#011train-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:map", + "Regex": ".*\\[[0-9]+\\].*#011validation-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + } + ], + "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "TrainingInputMode": "File" + }, + "CreationTime": "2021-10-12T05:49:40.493Z", + "EnableInterContainerTrafficEncryption": false, + "EnableManagedSpotTraining": false, + "EnableNetworkIsolation": false, + "HyperParameters": { + "eta": "0.2", + "gamma": "4", + "max_depth": "5", + "min_child_weight": "6", + "num_class": "10", + "num_round": "10", + "objective": "multi:softmax", + "silent": "0" + }, + "InputDataConfig": [ + { + "ChannelName": "train", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "S3DataSource": { + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train" + } + }, + "RecordWrapperType": "None" + }, + { + "ChannelName": "validation", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "S3DataSource": { + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation" + } + }, + "RecordWrapperType": "None" + } + ], + "LastModifiedTime": "2021-10-12T05:52:46.108Z", + "OutputDataConfig": { + "KmsKeyId": 
"", + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output" + }, + "ProfilingStatus": "Disabled", + "ResourceConfig": { + "InstanceCount": 1, + "InstanceType": "ml.m4.xlarge", + "VolumeSizeInGB": 5, + "KeepAlivePeriodInSeconds": 70 + }, + "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker", + "SecondaryStatus": "Completed", + "SecondaryStatusTransitions": [ + { + "EndTime": "2021-10-12T05:52:46.108Z", + "StartTime": "2021-10-12T05:49:40.493Z", + "Status": "Starting", + "StatusMessage": "Preparing the instances for training" + }, + { + "StartTime": "2021-10-12T05:52:46.108Z", + "Status": "Downloading", + "StatusMessage": "Downloading input data" + } + ], + "StoppingCondition": { + "MaxRuntimeInSeconds": 86400 + }, + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job", + "TrainingJobName": "xgboost-training-job", + "TrainingJobStatus": "Completed", + "TrainingStartTime": "2021-10-12T05:52:46.108Z", + "TrainingTimeInSeconds": 31 +} \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index da6b4dd3..69c72046 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -182,6 +182,17 @@ tests: invoke: ReadOne expect: latest_state: "v1alpha1/readone/observed/warmpool_inuse.yaml" + - name: "ReadOne=WarmPoolNoStatus" + description: "Test Warmpool when the training job is complete but has no status" + given: + desired_state: "v1alpha1/readone/desired/warmpool.yaml" + svc_api: + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/describe/warmpool_describe_no_status.json" + invoke: ReadOne + expect: + latest_state: "v1alpha1/readone/observed/warmpool_nostatus.yaml" + - name: "Training job update tests" description: "Testing the Update operation" scenarios: diff --git 
a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml new file mode 100644 index 00000000..ce6c228f --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml @@ -0,0 +1,109 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + creationTimestamp: null + name: unit-testing-training-job +spec: + algorithmSpecification: + enableSageMakerMetricsTimeSeries: false + metricDefinitions: + - name: train:mae + regex: .*\[[0-9]+\].*#011train-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:auc + regex: .*\[[0-9]+\].*#011validation-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:merror + regex: .*\[[0-9]+\].*#011train-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:auc + regex: .*\[[0-9]+\].*#011train-auc:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mae + regex: .*\[[0-9]+\].*#011validation-mae:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:error + regex: .*\[[0-9]+\].*#011validation-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:merror + regex: .*\[[0-9]+\].*#011validation-merror:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:logloss + regex: .*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:rmse + regex: .*\[[0-9]+\].*#011train-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:logloss + regex: .*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:mlogloss + regex: .*\[[0-9]+\].*#011train-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:rmse + regex: .*\[[0-9]+\].*#011validation-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:ndcg + regex: 
.*\[[0-9]+\].*#011validation-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:error + regex: .*\[[0-9]+\].*#011train-error:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:mlogloss + regex: .*\[[0-9]+\].*#011validation-mlogloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:ndcg + regex: .*\[[0-9]+\].*#011train-ndcg:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: train:map + regex: .*\[[0-9]+\].*#011train-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + - name: validation:map + regex: .*\[[0-9]+\].*#011validation-map:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).* + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1 + trainingInputMode: File + enableInterContainerTrafficEncryption: false + enableManagedSpotTraining: false + enableNetworkIsolation: false + hyperParameters: + eta: "0.2" + gamma: "4" + max_depth: "5" + min_child_weight: "6" + num_class: "10" + num_round: "10" + objective: multi:softmax + silent: "0" + inputDataConfig: + - channelName: train + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train + recordWrapperType: None + - channelName: validation + compressionType: None + contentType: text/csv + dataSource: + s3DataSource: + s3DataDistributionType: FullyReplicated + s3DataType: S3Prefix + s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation + recordWrapperType: None + outputDataConfig: + kmsKeyID: "" + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAlivePeriodInSeconds: 70 + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + stoppingCondition: + maxRuntimeInSeconds: 86400 + tags: + - key: algorithm + value: xgboost + - key: 
environment + value: testing + - key: customer + value: test-user + trainingJobName: xgboost-training-job +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: Warm Pool Infrastructure is in Available status. + status: "False" + type: ACK.ResourceSynced + secondaryStatus: Completed + trainingJobStatus: Completed \ No newline at end of file From 5cfd8f86858a16d7981e296801e2f7774682745c Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 18 Nov 2022 17:35:48 +0000 Subject: [PATCH 27/71] Feature: Update TrainingJob --- apis/v1alpha1/ack-generate-metadata.yaml | 6 +- apis/v1alpha1/generator.yaml | 16 +- apis/v1alpha1/training_job.go | 3 + apis/v1alpha1/zz_generated.deepcopy.go | 5 + ...gemaker.services.k8s.aws_trainingjobs.yaml | 3 + generator.yaml | 16 +- ...gemaker.services.k8s.aws_trainingjobs.yaml | 3 + pkg/resource/training_job/custom_delta.go | 53 ++++++ .../training_job/custom_set_update_input.go | 125 +++++++++++++++ .../training_job/custom_update_conditions.go | 53 ++++++ pkg/resource/training_job/delta.go | 1 + pkg/resource/training_job/hooks.go | 93 +++++++++++ pkg/resource/training_job/sdk.go | 151 +++++++++++++++++- .../training_job/testdata/test_suite.yaml | 12 -- .../v1alpha1/readone/observed/completed.yaml | 1 + .../completed_debugger_variation.yaml | 3 +- .../observed/conditions_clear_on_success.yaml | 1 + .../v1alpha1/readone/observed/created.yaml | 1 + .../observed/created_debugger_variation.yaml | 1 + .../readone/observed/late_initialize.yaml | 1 + .../v1alpha1/readone/observed/stopping.yaml | 1 + .../observed/stopping_debugger_variation.yaml | 1 + .../readone/observed/warmpool_available.yaml | 3 +- .../readone/observed/warmpool_inuse.yaml | 3 +- .../readone/observed/warmpool_reused.yaml | 3 +- .../readone/observed/warmpool_terminated.yaml | 3 +- .../sdk_update_post_build_request.go.tpl 
| 31 ++++ 27 files changed, 565 insertions(+), 28 deletions(-) create mode 100644 pkg/resource/training_job/custom_set_update_input.go create mode 100644 pkg/resource/training_job/custom_update_conditions.go create mode 100644 templates/training_job/sdk_update_post_build_request.go.tpl diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 25193da2..1bb00153 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,13 +1,13 @@ ack_generate_info: - build_date: "2022-11-23T21:48:31Z" + build_date: "2022-11-18T16:40:15Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc -api_directory_checksum: afd89795e3cceb09d028722ecbd9512b6197eb6a +api_directory_checksum: a11209e81e188afecb6812ec3080cead7ce995b1 api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: 7fb39ef5f630ff8286ff19472f423304aaa1c69c + file_checksum: 9678da88ff1b4061cfae104e44116cf0d87ebe09 original_file_name: generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index d31c683d..eacb3dce 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -123,11 +123,18 @@ resources: - InvalidParameterCombination - InvalidParameterValue - MissingParameter + update_conditions_custom_method_name: CustomUpdateConditions hooks: delta_pre_compare: code: customSetDefaults(a, b) + delta_post_compare: + code: customPostCompare(b, a, delta) sdk_read_one_post_set_output: code: rm.customSetOutput(&resource{ko}) + sdk_update_post_build_request: + template_path: training_job/sdk_update_post_build_request.go.tpl + sdk_update_post_set_output: + code: rm.customSetOutput(&resource{ko}) sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: @@ -175,6 +182,11 @@ resources: from: operation: 
DescribeTrainingJob path: WarmPoolStatus + ProfilingStatus: + is_read_only: true + from: + operation: DescribeTrainingJob + path: ProfilingStatus AlgorithmSpecification.MetricDefinitions: compare: is_ignored: true @@ -879,7 +891,6 @@ ignore: - Workforce - Workteam operations: - - UpdateTrainingJob - UpdateFeatureGroup shape_names: # RSessionAppSettings is an empty struct that causes generation errors @@ -894,4 +905,5 @@ ignore: - InstanceGroupNames - CanvasAppSettings - ExplainerConfig - - HyperParameterTuningJobStrategyConfig \ No newline at end of file + - HyperParameterTuningJobStrategyConfig + - DisableProfiler \ No newline at end of file diff --git a/apis/v1alpha1/training_job.go b/apis/v1alpha1/training_job.go index cf349c11..31b3a351 100644 --- a/apis/v1alpha1/training_job.go +++ b/apis/v1alpha1/training_job.go @@ -184,6 +184,9 @@ type TrainingJobStatus struct { // Evaluation status of Debugger rules for profiling on a training job. // +kubebuilder:validation:Optional ProfilerRuleEvaluationStatuses []*ProfilerRuleEvaluationStatus `json:"profilerRuleEvaluationStatuses,omitempty"` + // Profiling status of a training job. + // +kubebuilder:validation:Optional + ProfilingStatus *string `json:"profilingStatus,omitempty"` // Provides detailed information about the state of the training job. For detailed // information on the secondary status of the training job, see StatusMessage // under SecondaryStatusTransition. 
diff --git a/apis/v1alpha1/zz_generated.deepcopy.go b/apis/v1alpha1/zz_generated.deepcopy.go index 8b272a64..df2c8444 100644 --- a/apis/v1alpha1/zz_generated.deepcopy.go +++ b/apis/v1alpha1/zz_generated.deepcopy.go @@ -12748,6 +12748,11 @@ func (in *TrainingJobStatus) DeepCopyInto(out *TrainingJobStatus) { } } } + if in.ProfilingStatus != nil { + in, out := &in.ProfilingStatus, &out.ProfilingStatus + *out = new(string) + **out = **in + } if in.SecondaryStatus != nil { in, out := &in.SecondaryStatus, &out.SecondaryStatus *out = new(string) diff --git a/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml b/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml index 6f60c60d..d0f4ca28 100644 --- a/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -618,6 +618,9 @@ spec: type: string type: object type: array + profilingStatus: + description: Profiling status of a training job. + type: string secondaryStatus: description: "Provides detailed information about the state of the training job. 
For detailed information on the secondary status of diff --git a/generator.yaml b/generator.yaml index d31c683d..eacb3dce 100644 --- a/generator.yaml +++ b/generator.yaml @@ -123,11 +123,18 @@ resources: - InvalidParameterCombination - InvalidParameterValue - MissingParameter + update_conditions_custom_method_name: CustomUpdateConditions hooks: delta_pre_compare: code: customSetDefaults(a, b) + delta_post_compare: + code: customPostCompare(b, a, delta) sdk_read_one_post_set_output: code: rm.customSetOutput(&resource{ko}) + sdk_update_post_build_request: + template_path: training_job/sdk_update_post_build_request.go.tpl + sdk_update_post_set_output: + code: rm.customSetOutput(&resource{ko}) sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: @@ -175,6 +182,11 @@ resources: from: operation: DescribeTrainingJob path: WarmPoolStatus + ProfilingStatus: + is_read_only: true + from: + operation: DescribeTrainingJob + path: ProfilingStatus AlgorithmSpecification.MetricDefinitions: compare: is_ignored: true @@ -879,7 +891,6 @@ ignore: - Workforce - Workteam operations: - - UpdateTrainingJob - UpdateFeatureGroup shape_names: # RSessionAppSettings is an empty struct that causes generation errors @@ -894,4 +905,5 @@ ignore: - InstanceGroupNames - CanvasAppSettings - ExplainerConfig - - HyperParameterTuningJobStrategyConfig \ No newline at end of file + - HyperParameterTuningJobStrategyConfig + - DisableProfiler \ No newline at end of file diff --git a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml index 6f60c60d..d0f4ca28 100644 --- a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -618,6 +618,9 @@ spec: type: string type: object type: array + profilingStatus: + description: Profiling status of a training job. 
+ type: string secondaryStatus: description: "Provides detailed information about the state of the training job. For detailed information on the secondary status of diff --git a/pkg/resource/training_job/custom_delta.go b/pkg/resource/training_job/custom_delta.go index 7653044c..a67c3490 100644 --- a/pkg/resource/training_job/custom_delta.go +++ b/pkg/resource/training_job/custom_delta.go @@ -29,6 +29,10 @@ func customSetDefaults( if ackcompare.IsNotNil(a.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(b.ko.Spec.ProfilerRuleConfigurations) { for index := range a.ko.Spec.ProfilerRuleConfigurations { + // Prevent out of bounds panics. + if index == len(a.ko.Spec.ProfilerRuleConfigurations) || index == len(b.ko.Spec.ProfilerRuleConfigurations) { + break + } if ackcompare.IsNil(a.ko.Spec.ProfilerRuleConfigurations[index].VolumeSizeInGB) && ackcompare.IsNotNil(b.ko.Spec.ProfilerRuleConfigurations[index].VolumeSizeInGB) { a.ko.Spec.ProfilerRuleConfigurations[index].VolumeSizeInGB = defaultVolumeSizeInGB } @@ -60,3 +64,52 @@ func customSetDefaults( } } } + +// SM returns profiler related objects even if the user disables the profiler +// this function detects if there is a diff +func customPostCompare(latest *resource, desired *resource, delta *ackcompare.Delta) { + profilerConfigDiff := delta.DifferentAt("Spec.ProfilerConfig") + profilerRuleDiff := delta.DifferentAt("Spec.ProfilerRuleConfigurations") + if !profilerConfigDiff && !profilerRuleDiff { + return + } + profilerStatus := latest.ko.Status.ProfilingStatus + profilerDisabled := false + + if ackcompare.IsNotNil(profilerStatus) { + //Do not remove profiler if user wants to enable it + if *profilerStatus == "Disabled" && !userInitiatesProfilerCheck(desired) { + profilerDisabled = true + } else { + return + } + } else { + return + } + // TODO: Replace remove delta with an ack version when its natively supported + if profilerConfigDiff && profilerDisabled { + removeDelta(delta, "Spec.ProfilerConfig") + } + if 
profilerRuleDiff { + removeDelta(delta, "Spec.ProfilerRuleConfigurations") + } +} + +func userInitiatesProfilerCheck(desired *resource) bool { + profilerConfigPresent := ackcompare.IsNotNil(desired.ko.Spec.ProfilerConfig) + profilerRuleConfigPresent := ackcompare.IsNotNil(desired.ko.Spec.ProfilerRuleConfigurations) + return profilerConfigPresent && profilerRuleConfigPresent +} + +// Removes fieldName from the delta slice. +// TODO: Replace when ack runtime can do this. +func removeDelta(delta *ackcompare.Delta, fieldName string) { + differences := delta.Differences + for index, diff := range differences { + if diff.Path.Contains(fieldName) { + differences = append(differences[:index], differences[index+1:]...) + delta.Differences = differences + return + } + } +} diff --git a/pkg/resource/training_job/custom_set_update_input.go b/pkg/resource/training_job/custom_set_update_input.go new file mode 100644 index 00000000..cede7eed --- /dev/null +++ b/pkg/resource/training_job/custom_set_update_input.go @@ -0,0 +1,125 @@ +// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. 
+ +// This file holds the custom logic that adjusts the UpdateTrainingJob input +// when the profiler config or the profiler rule configurations change. + +package training_job + +import ( + "errors" + + ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" + smv1alpha "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" + svcsdk "github.com/aws/aws-sdk-go/service/sagemaker" +) + +// customSetUpdateInput adjusts input according to which profiler fields differ: +// 1. Both ProfilerConfig and ProfilerRuleConfigurations: recreate the profiler rule input. +// 2. Only ProfilerConfig: set the profiler rule configurations to nil to avoid a validation error. +// 3. Only ProfilerRuleConfigurations: recreate the profiler rule input and set ProfilerConfig to nil; +// this is safer because the "only add" behavior might reappear. +// It returns the error, if any, from recreating the profiler rule input. +func customSetUpdateInput(desired *resource, latest *resource, delta *ackcompare.Delta, input *svcsdk.UpdateTrainingJobInput) error { + if delta.DifferentAt("Spec.ProfilerConfig") && delta.DifferentAt("Spec.ProfilerRuleConfigurations") { + err := handleProfilerRuleConfig(desired, latest, input) + return err + } + if delta.DifferentAt("Spec.ProfilerConfig") && !delta.DifferentAt("Spec.ProfilerRuleConfigurations") { + input.SetProfilerRuleConfigurations(nil) + return nil + } + if delta.DifferentAt("Spec.ProfilerRuleConfigurations") && !delta.DifferentAt("Spec.ProfilerConfig") { + err := handleProfilerRuleConfig(desired, latest, input) + input.SetProfilerConfig(nil) // SM still assumes the profiler config is the same. + return err + } + return nil +} + +// handleProfilerRuleConfig limits the update input to NEW profiler rules, because +// UpdateTrainingJob acts as a POST for profiler parameters (only new rules can be +// specified). Removing rules is rejected with an "[ACK_SM]" error.
+func handleProfilerRuleConfig(desired *resource, latest *resource, input *svcsdk.UpdateTrainingJobInput) error { + profilerRuleDesired := desired.ko.Spec.ProfilerRuleConfigurations + profilerRuleLatest := latest.ko.Spec.ProfilerRuleConfigurations + + if ackcompare.IsNil(profilerRuleDesired) { + return errors.New("[ACK_SM] Cannot remove a profiler rule.") + } + if ackcompare.IsNil(profilerRuleLatest) { + return nil + } + if len(profilerRuleDesired) < len(profilerRuleLatest) { + return errors.New("[ACK_SM] Cannot remove a profiler rule.") + } + + ruleMap := map[string]int{} + profilerRuleInput := []*svcsdk.ProfilerRuleConfiguration{} + for _, rule := range profilerRuleLatest { + if ackcompare.IsNotNil(rule) && ackcompare.IsNotNil(rule.RuleConfigurationName) { + ruleMap[*rule.RuleConfigurationName] = 1 + } + } + for _, rule := range profilerRuleDesired { + if ackcompare.IsNotNil(rule) && ackcompare.IsNotNil(rule.RuleConfigurationName) { + _, present := ruleMap[*rule.RuleConfigurationName] + if !present { + profilerRuleInput = append(profilerRuleInput, convertProfileRuleType(rule)) + } + } + } + input.SetProfilerRuleConfigurations(profilerRuleInput) + return nil +} + +// Recreates input and sets disable profiler to true +func handleProfilerRemoval(input *svcsdk.UpdateTrainingJobInput) { + input.SetProfilerRuleConfigurations(nil) + profilerConfig := svcsdk.ProfilerConfigForUpdate{} + profilerConfig.SetDisableProfiler(true) + input.SetProfilerConfig(&profilerConfig) +} + +// Sagemaker and kubernetes types are not the same so the input has to be reconstructed. 
+func convertProfileRuleType(rule *smv1alpha.ProfilerRuleConfiguration) *svcsdk.ProfilerRuleConfiguration { + smRule := &svcsdk.ProfilerRuleConfiguration{} + if rule.InstanceType != nil { + smRule.SetInstanceType(*rule.InstanceType) + } + if rule.LocalPath != nil { + smRule.SetLocalPath(*rule.LocalPath) + } + if rule.RuleConfigurationName != nil { + smRule.SetRuleConfigurationName(*rule.RuleConfigurationName) + } + if rule.RuleEvaluatorImage != nil { + smRule.SetRuleEvaluatorImage(*rule.RuleEvaluatorImage) + } + if rule.RuleParameters != nil { + f1elemf4 := map[string]*string{} + for f1elemf4key, f1elemf4valiter := range rule.RuleParameters { + var f1elemf4val string + f1elemf4val = *f1elemf4valiter + f1elemf4[f1elemf4key] = &f1elemf4val + } + smRule.SetRuleParameters(f1elemf4) + } + if rule.S3OutputPath != nil { + smRule.SetS3OutputPath(*rule.S3OutputPath) + } + if rule.VolumeSizeInGB != nil { + smRule.SetVolumeSizeInGB(*rule.VolumeSizeInGB) + } + return smRule +} diff --git a/pkg/resource/training_job/custom_update_conditions.go b/pkg/resource/training_job/custom_update_conditions.go new file mode 100644 index 00000000..c94cfcf0 --- /dev/null +++ b/pkg/resource/training_job/custom_update_conditions.go @@ -0,0 +1,53 @@ +// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. 
+ +// Use this file if conditions need to be updated based on the latest status +// of training job which is not evident from API response + +package training_job + +import ( + "strings" + + ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" + ackcondition "github.com/aws-controllers-k8s/runtime/pkg/condition" + svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" + corev1 "k8s.io/api/core/v1" +) + +var ( + terminalCode string = "[ACK_SM]" +) + +// If the controller runs into an error that contains "[ACK_SM]" +// it will set the resource to a terminal state because it is an unrecoverable error. +func (rm *resourceManager) CustomUpdateConditions( + ko *svcapitypes.TrainingJob, + r *resource, + err error, +) bool { + + if ackcompare.IsNil(err) { + return false + } + + if strings.Contains(err.Error(), terminalCode) { + conditionManager := &resource{ko} + exception := err.Error() + ackcondition.SetTerminal(conditionManager, corev1.ConditionTrue, &exception, nil) + return true + } + + return false + +} diff --git a/pkg/resource/training_job/delta.go b/pkg/resource/training_job/delta.go index 251c2246..bbe75e10 100644 --- a/pkg/resource/training_job/delta.go +++ b/pkg/resource/training_job/delta.go @@ -332,5 +332,6 @@ func newResourceDelta( } } + customPostCompare(b, a, delta) return delta } diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 54d0411f..c805e9cb 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -36,12 +36,27 @@ var ( svcsdk.WarmPoolResourceStatusAvailable, svcsdk.WarmPoolResourceStatusInUse, } + TrainingJobTerminalProfiler = []string{ + svcsdk.TrainingJobStatusCompleted, + svcsdk.TrainingJobStatusFailed, + svcsdk.TrainingJobStatusStopping, + svcsdk.TrainingJobStatusStopped, + } resourceName = GroupKind.Kind requeueWaitWhileDeleting = ackrequeue.NeededAfter( errors.New(resourceName+" is Stopping."), 
ackrequeue.DefaultRequeueAfterDuration, ) + + requeueBeforeUpdate = ackrequeue.NeededAfter( + errors.New("Warm pool cannot be updated in InProgress state requeuing until TrainingJob reaches completed state."), + ackrequeue.DefaultRequeueAfterDuration, + ) + requeueBeforeUpdateStarting = ackrequeue.NeededAfter( + errors.New("Controller cannot update while secondary status is in Starting state."), + ackrequeue.DefaultRequeueAfterDuration, + ) ) // customSetOutput sets the resource ResourceSynced condition to False if @@ -62,6 +77,12 @@ func (rm *resourceManager) customSetOutput(r *resource) { } for _, rule := range r.ko.Status.ProfilerRuleEvaluationStatuses { + if ackcompare.IsNotNil(r.ko.Status.ProfilingStatus) { + // Sometimes rule evaluation status will stay in InProgress state. + if *r.ko.Status.ProfilingStatus == "Disabled" { + break + } + } if rule.RuleEvaluationStatus != nil && svccommon.IsModifyingStatus(rule.RuleEvaluationStatus, &ruleModifyingStatuses) { svccommon.SetSyncedCondition(r, rule.RuleEvaluationStatus, aws.String("ProfilerRule"), &ruleModifyingStatuses) return @@ -89,3 +110,75 @@ func (rm *resourceManager) customSetOutput(r *resource) { } } + +// This function makes the controller requeue if there is an update and +// the training job is still in InProgress +func customSetOutputUpdateWarmpool(r *resource) error { + trainingJobStatus := r.ko.Status.TrainingJobStatus + if ackcompare.IsNotNil(trainingJobStatus) && *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { + return requeueBeforeUpdate + } + return nil +} + +// Check if warm pool has reached a state where it is not updateable +func warmPoolTerminalCheck(latest *resource) bool { + trainingJobStatus := latest.ko.Status.TrainingJobStatus + if ackcompare.IsNotNil(latest.ko.Spec.ResourceConfig) { + if ackcompare.IsNil(latest.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) { + return true // Warm pool can only be updated iff there is a provisioned cluster. 
+ } + } else { + return false + } + + if ackcompare.IsNotNil(trainingJobStatus) { + if *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { + return false + } + if *trainingJobStatus == svcsdk.TrainingJobStatusCompleted { + if ackcompare.IsNotNil(latest.ko.Status.WarmPoolStatus) { + wp_modifying := svccommon.IsModifyingStatus(latest.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) + return !wp_modifying + } else { + return false // Sometimes the API (briefly) does not return the WP status even if it completes. + } + } else { + // Training Job is in 'Failed'|'Stopping'|'Stopped' (Terminal) + return true + } + } + + // ACK OIDC is misconfigured (Terminal) + return true +} + +// Profiler cannot be updated at certain statuses. +func customSetOutputUpdateProfiler(r *resource) error { + trainingSecondaryStatus := r.ko.Status.SecondaryStatus + trainingJobStatus := r.ko.Status.TrainingJobStatus + if ackcompare.IsNotNil(trainingSecondaryStatus) && *trainingSecondaryStatus == svcsdk.SecondaryStatusStarting { + return requeueBeforeUpdateStarting + } + if ackcompare.IsNotNil(trainingJobStatus) { + for _, terminalStatus := range TrainingJobTerminalProfiler { + if terminalStatus == *trainingJobStatus { + return errors.New("[ACK_SM] Profiler can only be updated when Training Job is in InProgress state") + } + } + } + return nil +} + +// Checks if the profiler was removed. 
+func profilerRemovalCheck(desired *resource, latest *resource) bool { + if ackcompare.IsNotNil(desired.ko.Spec) && ackcompare.IsNotNil(latest.ko.Spec) { + if ackcompare.IsNil(desired.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerRuleConfigurations) { + return true + } + if ackcompare.IsNil(desired.ko.Spec.ProfilerConfig) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerConfig) { + return true + } + } + return false +} diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index b00ecbfc..023f3e63 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -469,6 +469,11 @@ func (rm *resourceManager) sdkFind( } else { ko.Status.ProfilerRuleEvaluationStatuses = nil } + if resp.ProfilingStatus != nil { + ko.Status.ProfilingStatus = resp.ProfilingStatus + } else { + ko.Status.ProfilingStatus = nil + } if resp.ResourceConfig != nil { f25 := &svcapitypes.ResourceConfig{} if resp.ResourceConfig.InstanceCount != nil { @@ -1040,9 +1045,145 @@ func (rm *resourceManager) sdkUpdate( desired *resource, latest *resource, delta *ackcompare.Delta, -) (*resource, error) { - // TODO(jaypipes): Figure this out... 
- return nil, ackerr.NotImplemented +) (updated *resource, err error) { + rlog := ackrtlog.FromContext(ctx) + exit := rlog.Trace("rm.sdkUpdate") + defer func() { + exit(err) + }() + input, err := rm.newUpdateRequestPayload(ctx, desired) + if err != nil { + return nil, err + } + warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") + profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") + if warmpool_diff && profiler_diff { + return latest, errors.New("[ACK_SM] Cannot update Warm pool and Profiler at the same time.") + } + if warmpool_diff { + input.SetProfilerConfig(nil) + input.SetProfilerRuleConfigurations(nil) + warmpool_terminal := warmPoolTerminalCheck(latest) + if warmpool_terminal { + return latest, errors.New("[ACK_SM] Warm pool either does not exist or has reached a non updatable state.") + } + if err := customSetOutputUpdateWarmpool(latest); err != nil { + return nil, err + } + } + if profiler_diff { + if up_err := customSetOutputUpdateProfiler(latest); up_err != nil { + return nil, up_err + } + input.SetResourceConfig(nil) + if profilerRemovalCheck(desired, latest) { + handleProfilerRemoval(input) + } else { + inp_err := customSetUpdateInput(desired, latest, delta, input) + if inp_err != nil { + return nil, inp_err // return the real failure; the outer err is nil here and would yield (nil, nil) + } + } + } + + var resp *svcsdk.UpdateTrainingJobOutput + _ = resp + resp, err = rm.sdkapi.UpdateTrainingJobWithContext(ctx, input) + rm.metrics.RecordAPICall("UPDATE", "UpdateTrainingJob", err) + if err != nil { + return nil, err + } + // Merge in the information we read from the API call above to the copy of + // the original Kubernetes object we passed to the function + ko := desired.ko.DeepCopy() + + if ko.Status.ACKResourceMetadata == nil { + ko.Status.ACKResourceMetadata = &ackv1alpha1.ResourceMetadata{} + } + if resp.TrainingJobArn != nil { + arn := ackv1alpha1.AWSResourceName(*resp.TrainingJobArn) + ko.Status.ACKResourceMetadata.ARN = &arn + } + +
rm.setStatusDefaults(ko) + rm.customSetOutput(&resource{ko}) + return &resource{ko}, nil +} + +// newUpdateRequestPayload returns an SDK-specific struct for the HTTP request +// payload of the Update API call for the resource +func (rm *resourceManager) newUpdateRequestPayload( + ctx context.Context, + r *resource, +) (*svcsdk.UpdateTrainingJobInput, error) { + res := &svcsdk.UpdateTrainingJobInput{} + + if r.ko.Spec.ProfilerConfig != nil { + f0 := &svcsdk.ProfilerConfigForUpdate{} + if r.ko.Spec.ProfilerConfig.ProfilingIntervalInMilliseconds != nil { + f0.SetProfilingIntervalInMilliseconds(*r.ko.Spec.ProfilerConfig.ProfilingIntervalInMilliseconds) + } + if r.ko.Spec.ProfilerConfig.ProfilingParameters != nil { + f0f1 := map[string]*string{} + for f0f1key, f0f1valiter := range r.ko.Spec.ProfilerConfig.ProfilingParameters { + var f0f1val string + f0f1val = *f0f1valiter + f0f1[f0f1key] = &f0f1val + } + f0.SetProfilingParameters(f0f1) + } + if r.ko.Spec.ProfilerConfig.S3OutputPath != nil { + f0.SetS3OutputPath(*r.ko.Spec.ProfilerConfig.S3OutputPath) + } + res.SetProfilerConfig(f0) + } + if r.ko.Spec.ProfilerRuleConfigurations != nil { + f1 := []*svcsdk.ProfilerRuleConfiguration{} + for _, f1iter := range r.ko.Spec.ProfilerRuleConfigurations { + f1elem := &svcsdk.ProfilerRuleConfiguration{} + if f1iter.InstanceType != nil { + f1elem.SetInstanceType(*f1iter.InstanceType) + } + if f1iter.LocalPath != nil { + f1elem.SetLocalPath(*f1iter.LocalPath) + } + if f1iter.RuleConfigurationName != nil { + f1elem.SetRuleConfigurationName(*f1iter.RuleConfigurationName) + } + if f1iter.RuleEvaluatorImage != nil { + f1elem.SetRuleEvaluatorImage(*f1iter.RuleEvaluatorImage) + } + if f1iter.RuleParameters != nil { + f1elemf4 := map[string]*string{} + for f1elemf4key, f1elemf4valiter := range f1iter.RuleParameters { + var f1elemf4val string + f1elemf4val = *f1elemf4valiter + f1elemf4[f1elemf4key] = &f1elemf4val + } + f1elem.SetRuleParameters(f1elemf4) + } + if f1iter.S3OutputPath != nil { + 
f1elem.SetS3OutputPath(*f1iter.S3OutputPath) + } + if f1iter.VolumeSizeInGB != nil { + f1elem.SetVolumeSizeInGB(*f1iter.VolumeSizeInGB) + } + f1 = append(f1, f1elem) + } + res.SetProfilerRuleConfigurations(f1) + } + if r.ko.Spec.ResourceConfig != nil { + f2 := &svcsdk.ResourceConfigForUpdate{} + if r.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds != nil { + f2.SetKeepAlivePeriodInSeconds(*r.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) + } + res.SetResourceConfig(f2) + } + if r.ko.Spec.TrainingJobName != nil { + res.SetTrainingJobName(*r.ko.Spec.TrainingJobName) + } + + return res, nil } // sdkDelete deletes the supplied resource in the backend AWS service API @@ -1192,7 +1333,9 @@ func (rm *resourceManager) updateConditions( } // Required to avoid the "declared but not used" error in the default case _ = syncCondition - if terminalCondition != nil || recoverableCondition != nil || syncCondition != nil { + // custom update conditions + customUpdate := rm.CustomUpdateConditions(ko, r, err) + if terminalCondition != nil || recoverableCondition != nil || syncCondition != nil || customUpdate { return &resource{ko}, true // updated } return nil, false // not updated diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index 69c72046..7b172a94 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -193,18 +193,6 @@ tests: expect: latest_state: "v1alpha1/readone/observed/warmpool_nostatus.yaml" - - name: "Training job update tests" - description: "Testing the Update operation" - scenarios: - - name: "Update=NotSupported" - description: "This test checks if the controller throws error for update" - given: - desired_state: "v1alpha1/update/desired/updated_base.yaml" - latest_state: "v1alpha1/create/observed/success_after_create.yaml" - invoke: Update - expect: - latest_state: "v1alpha1/update/observed/error_on_update.yaml" - error: "not 
implemented" - name: "Training job delete tests" description: "Testing the delete operation" scenarios: diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed.yaml index 9b3e4bca..c086d9a3 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed.yaml @@ -108,3 +108,4 @@ status: s3ModelArtifacts: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output/model.tar.gz secondaryStatus: Completed trainingJobStatus: Completed + profilingStatus: Disabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed_debugger_variation.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed_debugger_variation.yaml index 827f165b..6b78b3d2 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed_debugger_variation.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed_debugger_variation.yaml @@ -149,4 +149,5 @@ status: ruleConfigurationName: ProfilerReport ruleEvaluationStatus: Completed secondaryStatus: Completed - trainingJobStatus: Completed \ No newline at end of file + trainingJobStatus: Completed + profilingStatus: Enabled \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/conditions_clear_on_success.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/conditions_clear_on_success.yaml index 21abd8ed..b4431657 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/conditions_clear_on_success.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/conditions_clear_on_success.yaml @@ -108,3 +108,4 @@ status: type: ACK.ResourceSynced secondaryStatus: Downloading trainingJobStatus: InProgress + profilingStatus: Disabled diff --git 
a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created.yaml index 6bd29e20..ace5bc9c 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created.yaml @@ -106,3 +106,4 @@ status: type: ACK.ResourceSynced secondaryStatus: Downloading trainingJobStatus: InProgress + profilingStatus: Disabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_debugger_variation.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_debugger_variation.yaml index 3432a708..006d0382 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_debugger_variation.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_debugger_variation.yaml @@ -150,3 +150,4 @@ status: ruleEvaluationStatus: InProgress secondaryStatus: Starting trainingJobStatus: InProgress + profilingStatus: Enabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/late_initialize.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/late_initialize.yaml index a08bbe9f..7d41233d 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/late_initialize.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/late_initialize.yaml @@ -110,3 +110,4 @@ status: type: ACK.LateInitialized secondaryStatus: Downloading trainingJobStatus: InProgress + profilingStatus: Disabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping.yaml index c90f2114..bbd786d2 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping.yaml @@ -106,3 +106,4 @@ status: type: ACK.ResourceSynced 
secondaryStatus: Starting trainingJobStatus: Stopping + profilingStatus: Disabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping_debugger_variation.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping_debugger_variation.yaml index a06b365c..0558f186 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping_debugger_variation.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping_debugger_variation.yaml @@ -152,3 +152,4 @@ status: ruleEvaluationStatus: InProgress secondaryStatus: Starting trainingJobStatus: Stopping + profilingStatus: Enabled diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml index 9658b7ab..9819560f 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml @@ -108,4 +108,5 @@ status: warmPoolStatus: status: Available secondaryStatus: Completed - trainingJobStatus: Completed \ No newline at end of file + trainingJobStatus: Completed + profilingStatus: Disabled \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml index d5bfe159..f7acc768 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml @@ -108,4 +108,5 @@ status: warmPoolStatus: status: InUse secondaryStatus: Starting - trainingJobStatus: InProgress \ No newline at end of file + trainingJobStatus: InProgress + profilingStatus: Disabled \ No newline at end of file diff --git 
a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml index 115764af..2cd053eb 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml @@ -110,4 +110,5 @@ status: reusedByJob: Trainingjob-ccsjjbdsjhhcsvdj resourceRetainedBillableTimeInSeconds: 69 secondaryStatus: Completed - trainingJobStatus: Completed \ No newline at end of file + trainingJobStatus: Completed + profilingStatus: Disabled \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml index 99829bad..b0d56c11 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml @@ -110,4 +110,5 @@ status: resourceRetainedBillableTimeInSeconds: 69 secondaryStatus: Completed trainingJobStatus: Completed - + profilingStatus: Disabled + diff --git a/templates/training_job/sdk_update_post_build_request.go.tpl b/templates/training_job/sdk_update_post_build_request.go.tpl new file mode 100644 index 00000000..68984daf --- /dev/null +++ b/templates/training_job/sdk_update_post_build_request.go.tpl @@ -0,0 +1,31 @@ +warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") +profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") +if warmpool_diff && profiler_diff{ + return latest, errors.New("[ACK_SM] Cannot update Warm pool and Profiler at the same time.") +} +if warmpool_diff { + input.SetProfilerConfig(nil) + input.SetProfilerRuleConfigurations(nil) + warmpool_terminal := warmPoolTerminalCheck(latest) + if 
warmpool_terminal { + return latest, errors.New("[ACK_SM] Warm pool either does not exist or has reached a non updatable state.") + } + //Requeue if TrainingJob is in InProgress state + if err := customSetOutputUpdateWarmpool(latest); err != nil { + return nil,err + } +} +if profiler_diff { + if up_err := customSetOutputUpdateProfiler(latest); up_err != nil { + return nil, up_err + } + input.SetResourceConfig(nil) + if profilerRemovalCheck(desired, latest) { + handleProfilerRemoval(input) + } else{ + inp_err := customSetUpdateInput(desired, latest, delta, input) + if inp_err != nil { + return nil, err + } + } +} From f4b6dc32c0e50112f92d290043d1272ac95b065e Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 18 Nov 2022 20:26:11 +0000 Subject: [PATCH 28/71] test: added integration test --- test/e2e/tests/test_trainingjob_debugger.py | 91 ++++++++++++++++++--- 1 file changed, 80 insertions(+), 11 deletions(-) diff --git a/test/e2e/tests/test_trainingjob_debugger.py b/test/e2e/tests/test_trainingjob_debugger.py index 5bccd3bc..efd1a3bc 100644 --- a/test/e2e/tests/test_trainingjob_debugger.py +++ b/test/e2e/tests/test_trainingjob_debugger.py @@ -30,6 +30,7 @@ from e2e.common import config as cfg RESOURCE_PLURAL = "trainingjobs" +NEW_PROFILER_INTERVAL = 200 @pytest.fixture(scope="function") @@ -37,7 +38,7 @@ def xgboost_training_job_debugger(): resource_name = random_suffix_name("xgboost-trainingjob-debugger", 50) replacements = REPLACEMENT_VALUES.copy() replacements["TRAINING_JOB_NAME"] = resource_name - reference, _, resource = create_sagemaker_resource( + reference, spec, resource = create_sagemaker_resource( resource_plural=RESOURCE_PLURAL, resource_name=resource_name, spec_file="xgboost_trainingjob_debugger", @@ -45,7 +46,7 @@ def xgboost_training_job_debugger(): ) assert resource is not None - yield (reference, resource) + yield (reference, resource, spec) if k8s.get_resource_exists(reference): _, deleted = k8s.delete_custom_resource(reference, 3, 10) @@ -54,17 
+55,20 @@ def xgboost_training_job_debugger(): def get_training_rule_eval_sagemaker_status(training_job_name: str, rule_type: str): training_sm_desc = get_sagemaker_training_job(training_job_name) - return training_sm_desc[rule_type+"EvaluationStatuses"][0]["RuleEvaluationStatus"] + return training_sm_desc[rule_type + "EvaluationStatuses"][0]["RuleEvaluationStatus"] -def get_training_rule_eval_resource_status(reference: k8s.CustomResourceReference, rule_type: str): +def get_training_rule_eval_resource_status( + reference: k8s.CustomResourceReference, rule_type: str +): resource = k8s.get_resource(reference) - resource_status = resource["status"][rule_type+"EvaluationStatuses"][0][ + resource_status = resource["status"][rule_type + "EvaluationStatuses"][0][ "ruleEvaluationStatus" ] assert resource_status is not None return resource_status + @service_marker class TestTrainingDebuggerJob: def _wait_sagemaker_training_rule_eval_status( @@ -107,14 +111,18 @@ def _assert_training_rule_eval_status_in_sync( resource_rule_type = sagemaker_rule_type[0].lower() + sagemaker_rule_type[1:] assert ( self._wait_sagemaker_training_rule_eval_status( - training_job_name, sagemaker_rule_type, expected_status, + training_job_name, + sagemaker_rule_type, + expected_status, + ) + == self._wait_resource_training_rule_eval_status( + reference, resource_rule_type, expected_status ) - == self._wait_resource_training_rule_eval_status(reference, resource_rule_type, expected_status) == expected_status ) def test_completed(self, xgboost_training_job_debugger): - (reference, resource) = xgboost_training_job_debugger + (reference, resource, _) = xgboost_training_job_debugger assert k8s.get_resource_exists(reference) training_job_name = resource["spec"].get("trainingJobName", None) @@ -122,7 +130,7 @@ def test_completed(self, xgboost_training_job_debugger): training_job_desc = get_sagemaker_training_job(training_job_name) training_job_arn = training_job_desc["TrainingJobArn"] - + resource_arn = 
k8s.get_resource_arn(resource) if resource_arn is None: logging.error( @@ -142,7 +150,7 @@ def test_completed(self, xgboost_training_job_debugger): self._assert_training_rule_eval_status_in_sync( training_job_name, "DebugRule", reference, cfg.RULE_STATUS_COMPLETED ) - + # Assert profiler rule evaluation completed self._assert_training_rule_eval_status_in_sync( training_job_name, "ProfilerRule", reference, cfg.RULE_STATUS_COMPLETED @@ -153,5 +161,66 @@ def test_completed(self, xgboost_training_job_debugger): assert_tags_in_sync(training_job_arn, resource_tags) # Check that you can delete a completed resource from k8s - _, deleted = k8s.delete_custom_resource(reference, cfg.JOB_DELETE_WAIT_PERIODS, cfg.JOB_DELETE_WAIT_LENGTH) + _, deleted = k8s.delete_custom_resource( + reference, cfg.JOB_DELETE_WAIT_PERIODS, cfg.JOB_DELETE_WAIT_LENGTH + ) + assert deleted is True + + def test_update(self, xgboost_training_job_debugger): + (reference, resource, spec) = xgboost_training_job_debugger + assert k8s.get_resource_exists(reference) + + training_job_name = resource["spec"].get("trainingJobName", None) + assert training_job_name is not None + + training_job_desc = get_sagemaker_training_job(training_job_name) + training_job_arn = training_job_desc["TrainingJobArn"] + + resource_arn = k8s.get_resource_arn(resource) + if resource_arn is None: + logging.error( + f"ARN for this resource is None, resource status is: {resource['status']}" + ) + assert resource_arn == training_job_arn + + assert training_job_desc["TrainingJobStatus"] == cfg.JOB_STATUS_INPROGRESS + assert k8s.wait_on_condition(reference, "ACK.ResourceSynced", "False") + + spec["spec"]["profilerConfig"][ + "profilingIntervalInMilliseconds" + ] = NEW_PROFILER_INTERVAL + k8s.patch_custom_resource(reference, spec) + + assert_training_status_in_sync( + training_job_name, reference, cfg.JOB_STATUS_COMPLETED + ) + assert k8s.wait_on_condition(reference, "ACK.ResourceSynced", "False") + + # Assert debugger rule evaluation 
completed + self._assert_training_rule_eval_status_in_sync( + training_job_name, "DebugRule", reference, cfg.RULE_STATUS_COMPLETED + ) + + # Assert profiler rule evaluation completed + self._assert_training_rule_eval_status_in_sync( + training_job_name, "ProfilerRule", reference, cfg.RULE_STATUS_COMPLETED + ) + assert k8s.wait_on_condition(reference, "ACK.ResourceSynced", "True") + + # Check if the update worked. + training_sm_desc = get_sagemaker_training_job(training_job_name) + assert ( + training_sm_desc["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] + == NEW_PROFILER_INTERVAL + ) + resource = k8s.get_resource(reference) + assert ( + resource["spec"]["profilerConfig"]["profilingIntervalInMilliseconds"] + == NEW_PROFILER_INTERVAL + ) + + # Check that you can delete a completed resource from k8s + _, deleted = k8s.delete_custom_resource( + reference, cfg.JOB_DELETE_WAIT_PERIODS, cfg.JOB_DELETE_WAIT_LENGTH + ) assert deleted is True From 8cc3af38e2ce43abaee618b3b5d37c5835232262 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 21 Nov 2022 06:12:23 +0000 Subject: [PATCH 29/71] fix: corrected hook --- apis/v1alpha1/ack-generate-metadata.yaml | 4 ++-- apis/v1alpha1/generator.yaml | 2 +- generator.yaml | 2 +- pkg/resource/training_job/hooks.go | 14 ++++++++++++++ pkg/resource/training_job/sdk.go | 3 ++- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 1bb00153..ccc0e77e 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-11-18T16:40:15Z" + build_date: "2022-11-21T06:07:29Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc @@ -7,7 +7,7 @@ api_directory_checksum: a11209e81e188afecb6812ec3080cead7ce995b1 api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: 
9678da88ff1b4061cfae104e44116cf0d87ebe09 + file_checksum: ac5bdf0ea0d52467b65d65438608bbb1eb0ee571 original_file_name: generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index eacb3dce..485ffd82 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -134,7 +134,7 @@ resources: sdk_update_post_build_request: template_path: training_job/sdk_update_post_build_request.go.tpl sdk_update_post_set_output: - code: rm.customSetOutput(&resource{ko}) + code: customSetOutputPostUpdate(ko, delta) sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: diff --git a/generator.yaml b/generator.yaml index eacb3dce..485ffd82 100644 --- a/generator.yaml +++ b/generator.yaml @@ -134,7 +134,7 @@ resources: sdk_update_post_build_request: template_path: training_job/sdk_update_post_build_request.go.tpl sdk_update_post_set_output: - code: rm.customSetOutput(&resource{ko}) + code: customSetOutputPostUpdate(ko, delta) sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index c805e9cb..7c3056e0 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -18,6 +18,7 @@ import ( ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" ackrequeue "github.com/aws-controllers-k8s/runtime/pkg/requeue" + svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" svccommon "github.com/aws-controllers-k8s/sagemaker-controller/pkg/common" "github.com/aws/aws-sdk-go/aws" svcsdk "github.com/aws/aws-sdk-go/service/sagemaker" @@ -182,3 +183,16 @@ func profilerRemovalCheck(desired *resource, latest *resource) bool { } return false } + +// The statuses in ko object in the end of update are empty, using customSetOutput wont work. 
+func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.Delta) { + warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") + profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") + if profiler_diff { + svccommon.SetSyncedCondition(&resource{ko}, aws.string("InProgress"), &resourceName, &trainingJobModifyingStatuses) + } + if warmpool_diff { + svccommon.SetSyncedCondition(&resource{ko}, aws.string("Availible"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) + } + +} diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 023f3e63..ec5a0146 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1067,6 +1067,7 @@ func (rm *resourceManager) sdkUpdate( if warmpool_terminal { return latest, errors.New("[ACK_SM] Warm pool either does not exist or has reached a non updatable state.") } + //Requeue if TrainingJob is in InProgress state if err := customSetOutputUpdateWarmpool(latest); err != nil { return nil, err } @@ -1106,7 +1107,7 @@ func (rm *resourceManager) sdkUpdate( } rm.setStatusDefaults(ko) - rm.customSetOutput(&resource{ko}) + customSetOutputPostUpdate(ko, delta) return &resource{ko}, nil } From 80bdeb1a0e4fc26a7bf0e2d51ffcc559a0879bcb Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 21 Nov 2022 06:17:43 +0000 Subject: [PATCH 30/71] fix: small typo --- pkg/resource/training_job/hooks.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 7c3056e0..1a7c03cf 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -189,10 +189,10 @@ func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.De warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") profiler_diff := 
delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") if profiler_diff { - svccommon.SetSyncedCondition(&resource{ko}, aws.string("InProgress"), &resourceName, &trainingJobModifyingStatuses) + svccommon.SetSyncedCondition(&resource{ko}, aws.String("InProgress"), &resourceName, &trainingJobModifyingStatuses) } if warmpool_diff { - svccommon.SetSyncedCondition(&resource{ko}, aws.string("Availible"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) + svccommon.SetSyncedCondition(&resource{ko}, aws.String("Availible"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) } } From 2937df2990aa28c78e68deaaf7f67b5b12a2e42b Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 21 Nov 2022 07:14:57 +0000 Subject: [PATCH 31/71] fix: corrected spelling/grammar erorrs --- pkg/resource/training_job/hooks.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 1a7c03cf..2bcb3854 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -51,7 +51,7 @@ var ( ) requeueBeforeUpdate = ackrequeue.NeededAfter( - errors.New("Warm pool cannot be updated in InProgress state requeuing until TrainingJob reaches completed state."), + errors.New("Warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state."), ackrequeue.DefaultRequeueAfterDuration, ) requeueBeforeUpdateStarting = ackrequeue.NeededAfter( @@ -192,7 +192,7 @@ func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.De svccommon.SetSyncedCondition(&resource{ko}, aws.String("InProgress"), &resourceName, &trainingJobModifyingStatuses) } if warmpool_diff { - svccommon.SetSyncedCondition(&resource{ko}, aws.String("Availible"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) + svccommon.SetSyncedCondition(&resource{ko}, aws.String("Available"), 
aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) } } From 284d0112b87848e4c175fa87ed6af0e028713d2c Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 22 Nov 2022 18:02:52 +0000 Subject: [PATCH 32/71] fix: handle invalid update --- apis/v1alpha1/ack-generate-metadata.yaml | 2 +- pkg/resource/training_job/sdk.go | 3 +++ templates/training_job/sdk_update_post_build_request.go.tpl | 5 ++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index ccc0e77e..791bf8d3 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-11-21T06:07:29Z" + build_date: "2022-11-22T17:54:48Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index ec5a0146..5358b78b 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1060,6 +1060,9 @@ func (rm *resourceManager) sdkUpdate( if warmpool_diff && profiler_diff { return latest, errors.New("[ACK_SM] Cannot update Warm pool and Profiler at the same time.") } + if !warmpool_diff && !profiler_diff { + return latest, errors.New("[ACK_SM] Only Warm Pool or Profiler can be updated") + } if warmpool_diff { input.SetProfilerConfig(nil) input.SetProfilerRuleConfigurations(nil) diff --git a/templates/training_job/sdk_update_post_build_request.go.tpl b/templates/training_job/sdk_update_post_build_request.go.tpl index 68984daf..38d60b0c 100644 --- a/templates/training_job/sdk_update_post_build_request.go.tpl +++ b/templates/training_job/sdk_update_post_build_request.go.tpl @@ -1,8 +1,11 @@ warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || 
delta.DifferentAt("Spec.ProfilerRuleConfigurations") -if warmpool_diff && profiler_diff{ +if warmpool_diff && profiler_diff { return latest, errors.New("[ACK_SM] Cannot update Warm pool and Profiler at the same time.") } +if !warmpool_diff && !profiler_diff { + return latest, errors.New("[ACK_SM] Only Warm Pool or Profiler can be updated") +} if warmpool_diff { input.SetProfilerConfig(nil) input.SetProfilerRuleConfigurations(nil) From e939d5884ee27a77e66eec52798e78641cf8cac8 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 22 Nov 2022 19:00:13 +0000 Subject: [PATCH 33/71] refactor: changed comments/terminal condition --- apis/v1alpha1/ack-generate-metadata.yaml | 4 +- apis/v1alpha1/generator.yaml | 1 - generator.yaml | 1 - .../training_job/custom_set_update_input.go | 13 +++-- .../training_job/custom_update_conditions.go | 53 ------------------- pkg/resource/training_job/hooks.go | 18 ++++--- pkg/resource/training_job/sdk.go | 10 ++-- .../sdk_update_post_build_request.go.tpl | 6 +-- 8 files changed, 28 insertions(+), 78 deletions(-) delete mode 100644 pkg/resource/training_job/custom_update_conditions.go diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 791bf8d3..b95cdb0a 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-11-22T17:54:48Z" + build_date: "2022-11-22T18:47:47Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc @@ -7,7 +7,7 @@ api_directory_checksum: a11209e81e188afecb6812ec3080cead7ce995b1 api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: ac5bdf0ea0d52467b65d65438608bbb1eb0ee571 + file_checksum: ecbd3d6faa6352c2e9af3cbbe365a6d75c19c3ce original_file_name: generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index 
485ffd82..2b099c73 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -123,7 +123,6 @@ resources: - InvalidParameterCombination - InvalidParameterValue - MissingParameter - update_conditions_custom_method_name: CustomUpdateConditions hooks: delta_pre_compare: code: customSetDefaults(a, b) diff --git a/generator.yaml b/generator.yaml index 485ffd82..2b099c73 100644 --- a/generator.yaml +++ b/generator.yaml @@ -123,7 +123,6 @@ resources: - InvalidParameterCombination - InvalidParameterValue - MissingParameter - update_conditions_custom_method_name: CustomUpdateConditions hooks: delta_pre_compare: code: customSetDefaults(a, b) diff --git a/pkg/resource/training_job/custom_set_update_input.go b/pkg/resource/training_job/custom_set_update_input.go index cede7eed..2ea43247 100644 --- a/pkg/resource/training_job/custom_set_update_input.go +++ b/pkg/resource/training_job/custom_set_update_input.go @@ -20,16 +20,17 @@ import ( "errors" ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" + ackerr "github.com/aws-controllers-k8s/runtime/pkg/errors" smv1alpha "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" svcsdk "github.com/aws/aws-sdk-go/service/sagemaker" ) +// customSetUpdateInput modifies the input of UpdateTrainingJob. // Three conditions: // 1. Customer updates both profiler parameters: Recreate the input for profiler Rule. // 2. Customer only updates Profiler Config: Set the profiler rule configuration to nil to avoid validation error. // 3. Customer only updates Rule Configurations: Recreate the input for profiler Rule and set Profiler config to nil. // safer to do this because the "only add" behavior might reappear. 
- func customSetUpdateInput(desired *resource, latest *resource, delta *ackcompare.Delta, input *svcsdk.UpdateTrainingJobInput) error { if delta.DifferentAt("Spec.ProfilerConfig") && delta.DifferentAt("Spec.ProfilerRuleConfigurations") { err := handleProfilerRuleConfig(desired, latest, input) @@ -47,6 +48,8 @@ func customSetUpdateInput(desired *resource, latest *resource, delta *ackcompare return nil } +// handleProfilerRuleConfig sets the input of the ProfilerRuleConfiguration so that +// it is compatible with the sagemaker API. // Update training job is post operation wrt to the profiler parameters. // Because of this only NEW rules can be specified. // In this function we check to see if any new profiler configurstions have been added. @@ -55,13 +58,13 @@ func handleProfilerRuleConfig(desired *resource, latest *resource, input *svcsdk profilerRuleLatest := latest.ko.Spec.ProfilerRuleConfigurations if ackcompare.IsNil(profilerRuleDesired) { - return errors.New("[ACK_SM] Cannot remove a profiler rule.") + return ackerr.NewTerminalError(errors.New("cannot remove a profiler rule.")) } if ackcompare.IsNil(profilerRuleLatest) { return nil } if len(profilerRuleDesired) < len(profilerRuleLatest) { - return errors.New("[ACK_SM] Cannot remove a profiler rule.") + return ackerr.NewTerminalError(errors.New("cannot remove a profiler rule.")) } ruleMap := map[string]int{} @@ -83,7 +86,7 @@ func handleProfilerRuleConfig(desired *resource, latest *resource, input *svcsdk return nil } -// Recreates input and sets disable profiler to true +// handleProfilerRemoval sets the input parameters to disable the profiler. 
func handleProfilerRemoval(input *svcsdk.UpdateTrainingJobInput) { input.SetProfilerRuleConfigurations(nil) profilerConfig := svcsdk.ProfilerConfigForUpdate{} @@ -91,6 +94,8 @@ func handleProfilerRemoval(input *svcsdk.UpdateTrainingJobInput) { input.SetProfilerConfig(&profilerConfig) } +// convertProfileRuleType converts the kubernetes object ProfilerRuleConfiguration into +// a type that is compatible with the AWS API. // Sagemaker and kubernetes types are not the same so the input has to be reconstructed. func convertProfileRuleType(rule *smv1alpha.ProfilerRuleConfiguration) *svcsdk.ProfilerRuleConfiguration { smRule := &svcsdk.ProfilerRuleConfiguration{} diff --git a/pkg/resource/training_job/custom_update_conditions.go b/pkg/resource/training_job/custom_update_conditions.go deleted file mode 100644 index c94cfcf0..00000000 --- a/pkg/resource/training_job/custom_update_conditions.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"). You may -// not use this file except in compliance with the License. A copy of the -// License is located at -// -// http://aws.amazon.com/apache2.0/ -// -// or in the "license" file accompanying this file. This file is distributed -// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -// express or implied. See the License for the specific language governing -// permissions and limitations under the License. 
- -// Use this file if conditions need to be updated based on the latest status -// of training job which is not evident from API response - -package training_job - -import ( - "strings" - - ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" - ackcondition "github.com/aws-controllers-k8s/runtime/pkg/condition" - svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" - corev1 "k8s.io/api/core/v1" -) - -var ( - terminalCode string = "[ACK_SM]" -) - -// If the controller runs into an error that contains "[ACK_SM]" -// it will set the resource to a terminal state because it is an unrecoverable error. -func (rm *resourceManager) CustomUpdateConditions( - ko *svcapitypes.TrainingJob, - r *resource, - err error, -) bool { - - if ackcompare.IsNil(err) { - return false - } - - if strings.Contains(err.Error(), terminalCode) { - conditionManager := &resource{ko} - exception := err.Error() - ackcondition.SetTerminal(conditionManager, corev1.ConditionTrue, &exception, nil) - return true - } - - return false - -} diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 2bcb3854..685ba9d9 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -17,6 +17,7 @@ import ( "errors" ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" + ackerr "github.com/aws-controllers-k8s/runtime/pkg/errors" ackrequeue "github.com/aws-controllers-k8s/runtime/pkg/requeue" svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" svccommon "github.com/aws-controllers-k8s/sagemaker-controller/pkg/common" @@ -51,11 +52,11 @@ var ( ) requeueBeforeUpdate = ackrequeue.NeededAfter( - errors.New("Warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state."), + errors.New("warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state."), ackrequeue.DefaultRequeueAfterDuration, ) 
requeueBeforeUpdateStarting = ackrequeue.NeededAfter( - errors.New("Controller cannot update while secondary status is in Starting state."), + errors.New("controller cannot update while secondary status is in Starting state."), ackrequeue.DefaultRequeueAfterDuration, ) ) @@ -112,7 +113,7 @@ func (rm *resourceManager) customSetOutput(r *resource) { } -// This function makes the controller requeue if there is an update and +// customSetOutputUpdateWarmpool makes the controller requeue if there is an update and // the training job is still in InProgress func customSetOutputUpdateWarmpool(r *resource) error { trainingJobStatus := r.ko.Status.TrainingJobStatus @@ -122,7 +123,7 @@ func customSetOutputUpdateWarmpool(r *resource) error { return nil } -// Check if warm pool has reached a state where it is not updateable +// warmPoolTerminalCheck checks if warm pool has reached a state where it is not updateable func warmPoolTerminalCheck(latest *resource) bool { trainingJobStatus := latest.ko.Status.TrainingJobStatus if ackcompare.IsNotNil(latest.ko.Spec.ResourceConfig) { @@ -154,7 +155,8 @@ func warmPoolTerminalCheck(latest *resource) bool { return true } -// Profiler cannot be updated at certain statuses. +// customSetOutputUpdateProfiler decides whether the training job is ready/eligible for update +// depending on the status. 
func customSetOutputUpdateProfiler(r *resource) error { trainingSecondaryStatus := r.ko.Status.SecondaryStatus trainingJobStatus := r.ko.Status.TrainingJobStatus @@ -164,14 +166,14 @@ func customSetOutputUpdateProfiler(r *resource) error { if ackcompare.IsNotNil(trainingJobStatus) { for _, terminalStatus := range TrainingJobTerminalProfiler { if terminalStatus == *trainingJobStatus { - return errors.New("[ACK_SM] Profiler can only be updated when Training Job is in InProgress state") + return ackerr.NewTerminalError(errors.New("profiler can only be updated when Training Job is in InProgress state")) } } } return nil } -// Checks if the profiler was removed. +// profilerRemovalCheck checks if the profiler was removed. func profilerRemovalCheck(desired *resource, latest *resource) bool { if ackcompare.IsNotNil(desired.ko.Spec) && ackcompare.IsNotNil(latest.ko.Spec) { if ackcompare.IsNil(desired.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerRuleConfigurations) { @@ -184,7 +186,7 @@ func profilerRemovalCheck(desired *resource, latest *resource) bool { return false } -// The statuses in ko object in the end of update are empty, using customSetOutput wont work. +// customSetOutputPostUpdate sets the synced condition at the end of the update. 
func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.Delta) { warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 5358b78b..78eb3196 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1058,17 +1058,17 @@ func (rm *resourceManager) sdkUpdate( warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") if warmpool_diff && profiler_diff { - return latest, errors.New("[ACK_SM] Cannot update Warm pool and Profiler at the same time.") + return latest, ackerr.NewTerminalError(errors.New("cannot update Warm pool and Profiler at the same time")) } if !warmpool_diff && !profiler_diff { - return latest, errors.New("[ACK_SM] Only Warm Pool or Profiler can be updated") + return latest, ackerr.NewTerminalError(errors.New("only Warm Pool or Profiler can be updated")) } if warmpool_diff { input.SetProfilerConfig(nil) input.SetProfilerRuleConfigurations(nil) warmpool_terminal := warmPoolTerminalCheck(latest) if warmpool_terminal { - return latest, errors.New("[ACK_SM] Warm pool either does not exist or has reached a non updatable state.") + return latest, ackerr.NewTerminalError(errors.New("warm pool either does not exist or has reached a non updatable state")) } //Requeue if TrainingJob is in InProgress state if err := customSetOutputUpdateWarmpool(latest); err != nil { @@ -1337,9 +1337,7 @@ func (rm *resourceManager) updateConditions( } // Required to avoid the "declared but not used" error in the default case _ = syncCondition - // custom update conditions - customUpdate := rm.CustomUpdateConditions(ko, r, err) - if terminalCondition != nil || 
recoverableCondition != nil || syncCondition != nil || customUpdate { + if terminalCondition != nil || recoverableCondition != nil || syncCondition != nil { return &resource{ko}, true // updated } return nil, false // not updated diff --git a/templates/training_job/sdk_update_post_build_request.go.tpl b/templates/training_job/sdk_update_post_build_request.go.tpl index 38d60b0c..bbc63400 100644 --- a/templates/training_job/sdk_update_post_build_request.go.tpl +++ b/templates/training_job/sdk_update_post_build_request.go.tpl @@ -1,17 +1,17 @@ warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") if warmpool_diff && profiler_diff { - return latest, errors.New("[ACK_SM] Cannot update Warm pool and Profiler at the same time.") + return latest, ackerr.NewTerminalError(errors.New("cannot update Warm pool and Profiler at the same time")) } if !warmpool_diff && !profiler_diff { - return latest, errors.New("[ACK_SM] Only Warm Pool or Profiler can be updated") + return latest, ackerr.NewTerminalError(errors.New("only Warm Pool or Profiler can be updated")) } if warmpool_diff { input.SetProfilerConfig(nil) input.SetProfilerRuleConfigurations(nil) warmpool_terminal := warmPoolTerminalCheck(latest) if warmpool_terminal { - return latest, errors.New("[ACK_SM] Warm pool either does not exist or has reached a non updatable state.") + return latest, ackerr.NewTerminalError(errors.New("warm pool either does not exist or has reached a non updatable state")) } //Requeue if TrainingJob is in InProgress state if err := customSetOutputUpdateWarmpool(latest); err != nil { From 6f159028f6b1479996d9e88ce74abf97d7aa7833 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 28 Nov 2022 23:36:56 +0000 Subject: [PATCH 34/71] updated test --- .../testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) 
diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml index ce6c228f..028300ac 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml @@ -106,4 +106,5 @@ status: status: "False" type: ACK.ResourceSynced secondaryStatus: Completed - trainingJobStatus: Completed \ No newline at end of file + trainingJobStatus: Completed + profilingStatus: Disabled \ No newline at end of file From 1255e9cf97b3a4f9272a5005377b98a98b50f573 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 29 Nov 2022 00:13:59 +0000 Subject: [PATCH 35/71] test: added more cases to improve coverage --- .../training_job/testdata/test_suite.yaml | 21 +++++++ .../update/desired/invalidupdate.yaml | 42 ++++++++++++++ .../update/desired/updateProfileConfig.yaml | 9 +-- .../v1alpha1/update/desired/updated_base.yaml | 58 ------------------- .../update/observed/updateProfilerConfig.yaml | 3 +- 5 files changed, 70 insertions(+), 63 deletions(-) create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/invalidupdate.yaml delete mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/updated_base.yaml diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index 85e3fb01..95fa6e93 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -374,6 +374,27 @@ tests: invoke: Update expect: error: "resource is in terminal condition" + - name: "Update=Invalid update" + description: "Attempt to update something else" + given: + desired_state: "v1alpha1/update/desired/invalidupdate.yaml" + latest_state: "v1alpha1/update/latest/WPHappy.yaml" + invoke: Update + expect: + error: "resource is in terminal 
condition" + - name: "Update=Update Profiler Rule" + description: "Attempt to update something else" + given: + desired_state: "v1alpha1/update/desired/updateProfilerRule.yaml" + latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" + svc_api: + - operation: UpdateTrainingJobWithContext + output_fixture: "sdkapi/update/update_sucess.json" + invoke: Update + expect: + latest_state: "v1alpha1/update/observed/updateProfilerRule.yaml" + error: nil + diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/invalidupdate.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/invalidupdate.yaml new file mode 100644 index 00000000..0b4734c4 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/invalidupdate.yaml @@ -0,0 +1,42 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 2 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + keepAlivePeriodInSeconds: 100 + stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: [] diff 
--git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfileConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfileConfig.yaml index 7fdbafed..0bcefcb4 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfileConfig.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updateProfileConfig.yaml @@ -37,10 +37,11 @@ spec: s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ profilingIntervalInMilliseconds: 200 profilerRuleConfigurations: - - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest - ruleParameters: - rule_to_invoke: ProfilerReport + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 + ruleParameters: + rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 status: ackResourceMetadata: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/updated_base.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/updated_base.yaml deleted file mode 100644 index ade7c30d..00000000 --- a/pkg/resource/training_job/testdata/v1alpha1/update/desired/updated_base.yaml +++ /dev/null @@ -1,58 +0,0 @@ -apiVersion: sagemaker.services.k8s.aws/v1alpha1 -kind: TrainingJob -metadata: - creationTimestamp: null - name: unit-testing-training-job -spec: - algorithmSpecification: - trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com - trainingInputMode: File - hyperParameters: - eta: "0.2" - gamma: "4" - max_depth: "5" - min_child_weight: "6" - num_class: "10" - num_round: "10" - objective: multi:softmax - silent: "0" - inputDataConfig: - - channelName: train - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - 
s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train - - channelName: validation - compressionType: None - contentType: text/csv - dataSource: - s3DataSource: - s3DataDistributionType: FullyReplicated - s3DataType: S3Prefix - s3URI: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation - outputDataConfig: - s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output - resourceConfig: - instanceCount: 1 - instanceType: ml.m4.xlarge - volumeSizeInGB: 10 - roleARN: arn:aws:iam::123456789012:role/ack-sagemaker-execution-role - stoppingCondition: - maxRuntimeInSeconds: 86400 - tags: - - key: algorithm - value: xgboost - - key: environment - value: testing - - key: customer - value: test-user - trainingJobName: xgboost-training-job -status: - ackResourceMetadata: - arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job - ownerAccountID: "" - region: "" - conditions: [] \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml index 3f400c0a..29c32faa 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml @@ -38,9 +38,10 @@ spec: s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ profilerRuleConfigurations: - ruleConfigurationName: ProfilerReport - ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:1 ruleParameters: rule_to_invoke: ProfilerReport + volumeSizeInGB: 0 status: ackResourceMetadata: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test From 
a4ee4ebb6b4ca7e6f658ac33a284e6651f07e1d7 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 29 Nov 2022 01:48:43 +0000 Subject: [PATCH 36/71] updates comments and removed unnecessary nil check --- pkg/resource/training_job/custom_delta.go | 5 +++-- pkg/resource/training_job/hooks.go | 12 +++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pkg/resource/training_job/custom_delta.go b/pkg/resource/training_job/custom_delta.go index a67c3490..8caad645 100644 --- a/pkg/resource/training_job/custom_delta.go +++ b/pkg/resource/training_job/custom_delta.go @@ -66,7 +66,7 @@ func customSetDefaults( } // SM returns profiler related objects even if the user disables the profiler -// this function detects if there is a diff +// customPostCompare detects if there is a diff func customPostCompare(latest *resource, desired *resource, delta *ackcompare.Delta) { profilerConfigDiff := delta.DifferentAt("Spec.ProfilerConfig") profilerRuleDiff := delta.DifferentAt("Spec.ProfilerRuleConfigurations") @@ -95,13 +95,14 @@ func customPostCompare(latest *resource, desired *resource, delta *ackcompare.De } } +// userInitiatesProfilerCheck checks if the user enabled/re enabled the profiler. func userInitiatesProfilerCheck(desired *resource) bool { profilerConfigPresent := ackcompare.IsNotNil(desired.ko.Spec.ProfilerConfig) profilerRuleConfigPresent := ackcompare.IsNotNil(desired.ko.Spec.ProfilerRuleConfigurations) return profilerConfigPresent && profilerRuleConfigPresent } -// Removes fieldName from the delta slice. +// removeDelta Removes fieldName from the delta slice. // TODO: Replace when ack runtime can do this. 
func removeDelta(delta *ackcompare.Delta, fieldName string) { differences := delta.Differences diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 685ba9d9..8a470798 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -175,13 +175,11 @@ func customSetOutputUpdateProfiler(r *resource) error { // profilerRemovalCheck checks if the profiler was removed. func profilerRemovalCheck(desired *resource, latest *resource) bool { - if ackcompare.IsNotNil(desired.ko.Spec) && ackcompare.IsNotNil(latest.ko.Spec) { - if ackcompare.IsNil(desired.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerRuleConfigurations) { - return true - } - if ackcompare.IsNil(desired.ko.Spec.ProfilerConfig) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerConfig) { - return true - } + if ackcompare.IsNil(desired.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerRuleConfigurations) { + return true + } + if ackcompare.IsNil(desired.ko.Spec.ProfilerConfig) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerConfig) { + return true } return false } From 3c67e9d1e4f443d9d6aefcd7a2ea022907505e2f Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 29 Nov 2022 19:14:31 +0000 Subject: [PATCH 37/71] fix: do not remove profiler --- pkg/resource/training_job/custom_delta.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/resource/training_job/custom_delta.go b/pkg/resource/training_job/custom_delta.go index 8caad645..c63ab1df 100644 --- a/pkg/resource/training_job/custom_delta.go +++ b/pkg/resource/training_job/custom_delta.go @@ -90,7 +90,7 @@ func customPostCompare(latest *resource, desired *resource, delta *ackcompare.De if profilerConfigDiff && profilerDisabled { removeDelta(delta, "Spec.ProfilerConfig") } - if profilerRuleDiff { + if profilerRuleDiff && profilerDisabled { removeDelta(delta, "Spec.ProfilerRuleConfigurations") } } From 
c77bd6ae8d55e62b37805d8db5186cb1b9d67da5 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 30 Nov 2022 18:34:28 +0000 Subject: [PATCH 38/71] regnerated controller --- apis/v1alpha1/ack-generate-metadata.yaml | 6 +++--- apis/v1alpha1/types.go | 4 ++-- pkg/resource/training_job/sdk.go | 10 +++++----- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 25193da2..d7a138e2 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,13 +1,13 @@ ack_generate_info: - build_date: "2022-11-23T21:48:31Z" + build_date: "2022-11-30T18:33:29Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc -api_directory_checksum: afd89795e3cceb09d028722ecbd9512b6197eb6a +api_directory_checksum: 5f19ccb828e1fbe136c1367237a5dd114c73e8af api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: 7fb39ef5f630ff8286ff19472f423304aaa1c69c + file_checksum: ea1f2f34499bfcd478dcad1dbbb9015e03ec10d5 original_file_name: generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/types.go b/apis/v1alpha1/types.go index 56b9d30f..b6c794ba 100644 --- a/apis/v1alpha1/types.go +++ b/apis/v1alpha1/types.go @@ -2686,9 +2686,9 @@ type RepositoryAuthConfig struct { // to use for model training. 
type ResourceConfig struct { InstanceCount *int64 `json:"instanceCount,omitempty"` - InstanceGroups []*InstanceGroup `json:"instanceGroups,omitempty"` + InstanceGroups []*InstanceGroup `json:"instanceGroups,omitempty"` InstanceType *string `json:"instanceType,omitempty"` - KeepAlivePeriodInSeconds *int64 `json:"keepAlivePeriodInSeconds,omitempty"` + KeepAlivePeriodInSeconds *int64 `json:"keepAlivePeriodInSeconds,omitempty"` VolumeKMSKeyID *string `json:"volumeKMSKeyID,omitempty"` VolumeSizeInGB *int64 `json:"volumeSizeInGB,omitempty"` } diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index de90839c..bfd2fe23 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -601,17 +601,17 @@ func (rm *resourceManager) sdkFind( ko.Spec.VPCConfig = nil } if resp.WarmPoolStatus != nil { - f39 := &svcapitypes.WarmPoolStatus{} + f40 := &svcapitypes.WarmPoolStatus{} if resp.WarmPoolStatus.ResourceRetainedBillableTimeInSeconds != nil { - f39.ResourceRetainedBillableTimeInSeconds = resp.WarmPoolStatus.ResourceRetainedBillableTimeInSeconds + f40.ResourceRetainedBillableTimeInSeconds = resp.WarmPoolStatus.ResourceRetainedBillableTimeInSeconds } if resp.WarmPoolStatus.ReusedByJob != nil { - f39.ReusedByJob = resp.WarmPoolStatus.ReusedByJob + f40.ReusedByJob = resp.WarmPoolStatus.ReusedByJob } if resp.WarmPoolStatus.Status != nil { - f39.Status = resp.WarmPoolStatus.Status + f40.Status = resp.WarmPoolStatus.Status } - ko.Status.WarmPoolStatus = f39 + ko.Status.WarmPoolStatus = f40 } else { ko.Status.WarmPoolStatus = nil } From 00368893b0c82a0b17d12e9def568162111de22e Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 30 Nov 2022 18:42:50 +0000 Subject: [PATCH 39/71] regenerating controller --- apis/v1alpha1/ack-generate-metadata.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index b95cdb0a..60c7a3ce 100755 --- 
a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,13 +1,13 @@ ack_generate_info: - build_date: "2022-11-22T18:47:47Z" + build_date: "2022-11-30T18:42:20Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc -api_directory_checksum: a11209e81e188afecb6812ec3080cead7ce995b1 +api_directory_checksum: 60a5467396bc517bbf66356cb8a47f57c8c07d22 api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: ecbd3d6faa6352c2e9af3cbbe365a6d75c19c3ce + file_checksum: d500d2833dc3000ce75dd115996a2ac26c157774 original_file_name: generator.yaml last_modification: reason: API generation From 83afbb72d830fb2e4d517e9a6c0eb4db093f1e96 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 30 Nov 2022 18:46:43 +0000 Subject: [PATCH 40/71] test: unit test change --- .../v1alpha1/readone/observed/created_instance_groups.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_instance_groups.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_instance_groups.yaml index 6911a1df..4e6d9d16 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_instance_groups.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_instance_groups.yaml @@ -114,3 +114,4 @@ status: type: ACK.ResourceSynced secondaryStatus: Downloading trainingJobStatus: InProgress + profilingStatus: Disabled \ No newline at end of file From b3098cddac5741709d73b0d2fbc778c81262175f Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 2 Dec 2022 00:21:15 +0000 Subject: [PATCH 41/71] pr changes --- apis/v1alpha1/ack-generate-metadata.yaml | 6 +- apis/v1alpha1/generator.yaml | 7 +- apis/v1alpha1/training_job.go | 3 + apis/v1alpha1/zz_generated.deepcopy.go | 4 + ...gemaker.services.k8s.aws_trainingjobs.yaml | 5 ++ generator.yaml | 7 +- 
...gemaker.services.k8s.aws_trainingjobs.yaml | 5 ++ .../training_job/custom_set_update_input.go | 54 ++++--------- pkg/resource/training_job/hooks.go | 79 +++++++++---------- pkg/resource/training_job/sdk.go | 28 ++++--- .../sdk_update_post_build_request.go.tpl | 17 ++-- .../sdk_update_post_set_output.go.tpl | 6 ++ 12 files changed, 111 insertions(+), 110 deletions(-) create mode 100644 templates/training_job/sdk_update_post_set_output.go.tpl diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 60c7a3ce..62b95d8e 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,13 +1,13 @@ ack_generate_info: - build_date: "2022-11-30T18:42:20Z" + build_date: "2022-12-02T00:18:37Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc -api_directory_checksum: 60a5467396bc517bbf66356cb8a47f57c8c07d22 +api_directory_checksum: c512f6809b1f599b720320cf3980bfaab609eef7 api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: d500d2833dc3000ce75dd115996a2ac26c157774 + file_checksum: 8a919b614dd3e79aae4bdc5afb9f8c4b1fc8e8db original_file_name: generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index 987b84cd..00872904 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -133,7 +133,7 @@ resources: sdk_update_post_build_request: template_path: training_job/sdk_update_post_build_request.go.tpl sdk_update_post_set_output: - code: customSetOutputPostUpdate(ko, delta) + template_path: training_job/sdk_update_post_set_output.go.tpl sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: @@ -186,6 +186,11 @@ resources: from: operation: DescribeTrainingJob path: ProfilingStatus + LastModifiedTime: + is_read_only: true + from: + operation: 
DescribeTrainingJob + path: LastModifiedTime AlgorithmSpecification.MetricDefinitions: compare: is_ignored: true diff --git a/apis/v1alpha1/training_job.go b/apis/v1alpha1/training_job.go index 8f6b19f9..50fcc994 100644 --- a/apis/v1alpha1/training_job.go +++ b/apis/v1alpha1/training_job.go @@ -179,6 +179,9 @@ type TrainingJobStatus struct { // If the training job failed, the reason it failed. // +kubebuilder:validation:Optional FailureReason *string `json:"failureReason,omitempty"` + // A timestamp that indicates when the status of the training job was last modified. + // +kubebuilder:validation:Optional + LastModifiedTime *metav1.Time `json:"lastModifiedTime,omitempty"` // Information about the Amazon S3 location that is configured for storing model // artifacts. // +kubebuilder:validation:Optional diff --git a/apis/v1alpha1/zz_generated.deepcopy.go b/apis/v1alpha1/zz_generated.deepcopy.go index d2916563..f3292c33 100644 --- a/apis/v1alpha1/zz_generated.deepcopy.go +++ b/apis/v1alpha1/zz_generated.deepcopy.go @@ -12789,6 +12789,10 @@ func (in *TrainingJobStatus) DeepCopyInto(out *TrainingJobStatus) { *out = new(string) **out = **in } + if in.LastModifiedTime != nil { + in, out := &in.LastModifiedTime, &out.LastModifiedTime + *out = (*in).DeepCopy() + } if in.ModelArtifacts != nil { in, out := &in.ModelArtifacts, &out.ModelArtifacts *out = new(ModelArtifacts) diff --git a/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml b/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml index 025df796..e4913b44 100644 --- a/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -620,6 +620,11 @@ spec: failureReason: description: If the training job failed, the reason it failed. type: string + lastModifiedTime: + description: A timestamp that indicates when the status of the training + job was last modified. 
+ format: date-time + type: string modelArtifacts: description: Information about the Amazon S3 location that is configured for storing model artifacts. diff --git a/generator.yaml b/generator.yaml index 987b84cd..00872904 100644 --- a/generator.yaml +++ b/generator.yaml @@ -133,7 +133,7 @@ resources: sdk_update_post_build_request: template_path: training_job/sdk_update_post_build_request.go.tpl sdk_update_post_set_output: - code: customSetOutputPostUpdate(ko, delta) + template_path: training_job/sdk_update_post_set_output.go.tpl sdk_delete_pre_build_request: template_path: training_job/sdk_delete_pre_build_request.go.tpl sdk_delete_post_request: @@ -186,6 +186,11 @@ resources: from: operation: DescribeTrainingJob path: ProfilingStatus + LastModifiedTime: + is_read_only: true + from: + operation: DescribeTrainingJob + path: LastModifiedTime AlgorithmSpecification.MetricDefinitions: compare: is_ignored: true diff --git a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml index 025df796..e4913b44 100644 --- a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -620,6 +620,11 @@ spec: failureReason: description: If the training job failed, the reason it failed. type: string + lastModifiedTime: + description: A timestamp that indicates when the status of the training + job was last modified. + format: date-time + type: string modelArtifacts: description: Information about the Amazon S3 location that is configured for storing model artifacts. 
diff --git a/pkg/resource/training_job/custom_set_update_input.go b/pkg/resource/training_job/custom_set_update_input.go index 2ea43247..963fe09e 100644 --- a/pkg/resource/training_job/custom_set_update_input.go +++ b/pkg/resource/training_job/custom_set_update_input.go @@ -21,45 +21,19 @@ import ( ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" ackerr "github.com/aws-controllers-k8s/runtime/pkg/errors" - smv1alpha "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" + svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" svcsdk "github.com/aws/aws-sdk-go/service/sagemaker" ) -// customSetUpdateInput modifies the input of UpdateTrainingJob. -// Three conditions: -// 1. Customer updates both profiler parameters: Recreate the input for profiler Rule. -// 2. Customer only updates Profiler Config: Set the profiler rule configuration to nil to avoid validation error. -// 3. Customer only updates Rule Configurations: Recreate the input for profiler Rule and set Profiler config to nil. -// safer to do this because the "only add" behavior might reappear. -func customSetUpdateInput(desired *resource, latest *resource, delta *ackcompare.Delta, input *svcsdk.UpdateTrainingJobInput) error { - if delta.DifferentAt("Spec.ProfilerConfig") && delta.DifferentAt("Spec.ProfilerRuleConfigurations") { - err := handleProfilerRuleConfig(desired, latest, input) - return err - } - if delta.DifferentAt("Spec.ProfilerConfig") && !delta.DifferentAt("Spec.ProfilerRuleConfigurations") { - input.SetProfilerRuleConfigurations(nil) - return nil - } - if delta.DifferentAt("Spec.ProfilerRuleConfigurations") && !delta.DifferentAt("Spec.ProfilerConfig") { - err := handleProfilerRuleConfig(desired, latest, input) - input.SetProfilerConfig(nil) // SM still assumes the profiler config is the same. 
- return err - } - return nil -} - -// handleProfilerRuleConfig sets the input of the ProfilerRuleConfiguration so that +// buildProfilerRuleConfigUpdateInput sets the input of the ProfilerRuleConfiguration so that // it is compatible with the sagemaker API. // Update training job is post operation wrt to the profiler parameters. // Because of this only NEW rules can be specified. // In this function we check to see if any new profiler configurstions have been added. -func handleProfilerRuleConfig(desired *resource, latest *resource, input *svcsdk.UpdateTrainingJobInput) error { +func buildProfilerRuleConfigUpdateInput(desired *resource, latest *resource, input *svcsdk.UpdateTrainingJobInput) error { profilerRuleDesired := desired.ko.Spec.ProfilerRuleConfigurations profilerRuleLatest := latest.ko.Spec.ProfilerRuleConfigurations - if ackcompare.IsNil(profilerRuleDesired) { - return ackerr.NewTerminalError(errors.New("cannot remove a profiler rule.")) - } if ackcompare.IsNil(profilerRuleLatest) { return nil } @@ -70,7 +44,7 @@ func handleProfilerRuleConfig(desired *resource, latest *resource, input *svcsdk ruleMap := map[string]int{} profilerRuleInput := []*svcsdk.ProfilerRuleConfiguration{} for _, rule := range profilerRuleLatest { - if ackcompare.IsNotNil(rule) && ackcompare.IsNotNil(rule.RuleConfigurationName) { + if ackcompare.IsNotNil(rule.RuleConfigurationName) { ruleMap[*rule.RuleConfigurationName] = 1 } } @@ -97,19 +71,19 @@ func handleProfilerRemoval(input *svcsdk.UpdateTrainingJobInput) { // convertProfileRuleType converts the kubernetes object ProfilerRuleConfiguration into // a type that is compatible with the AWS API. // Sagemaker and kubernetes types are not the same so the input has to be reconstructed. 
-func convertProfileRuleType(rule *smv1alpha.ProfilerRuleConfiguration) *svcsdk.ProfilerRuleConfiguration { - smRule := &svcsdk.ProfilerRuleConfiguration{} +func convertProfileRuleType(rule *svcapitypes.ProfilerRuleConfiguration) *svcsdk.ProfilerRuleConfiguration { + rule := &svcsdk.ProfilerRuleConfiguration{} if rule.InstanceType != nil { - smRule.SetInstanceType(*rule.InstanceType) + rule.SetInstanceType(*rule.InstanceType) } if rule.LocalPath != nil { - smRule.SetLocalPath(*rule.LocalPath) + rule.SetLocalPath(*rule.LocalPath) } if rule.RuleConfigurationName != nil { - smRule.SetRuleConfigurationName(*rule.RuleConfigurationName) + rule.SetRuleConfigurationName(*rule.RuleConfigurationName) } if rule.RuleEvaluatorImage != nil { - smRule.SetRuleEvaluatorImage(*rule.RuleEvaluatorImage) + rule.SetRuleEvaluatorImage(*rule.RuleEvaluatorImage) } if rule.RuleParameters != nil { f1elemf4 := map[string]*string{} @@ -118,13 +92,13 @@ func convertProfileRuleType(rule *smv1alpha.ProfilerRuleConfiguration) *svcsdk.P f1elemf4val = *f1elemf4valiter f1elemf4[f1elemf4key] = &f1elemf4val } - smRule.SetRuleParameters(f1elemf4) + rule.SetRuleParameters(f1elemf4) } if rule.S3OutputPath != nil { - smRule.SetS3OutputPath(*rule.S3OutputPath) + rule.SetS3OutputPath(*rule.S3OutputPath) } if rule.VolumeSizeInGB != nil { - smRule.SetVolumeSizeInGB(*rule.VolumeSizeInGB) + rule.SetVolumeSizeInGB(*rule.VolumeSizeInGB) } - return smRule + return rule } diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 8a470798..d23cf748 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -19,7 +19,6 @@ import ( ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" ackerr "github.com/aws-controllers-k8s/runtime/pkg/errors" ackrequeue "github.com/aws-controllers-k8s/runtime/pkg/requeue" - svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" svccommon 
"github.com/aws-controllers-k8s/sagemaker-controller/pkg/common" "github.com/aws/aws-sdk-go/aws" svcsdk "github.com/aws/aws-sdk-go/service/sagemaker" @@ -56,7 +55,7 @@ var ( ackrequeue.DefaultRequeueAfterDuration, ) requeueBeforeUpdateStarting = ackrequeue.NeededAfter( - errors.New("controller cannot update while secondary status is in Starting state."), + errors.New("training job cannot be updated while secondary status is in Starting state."), ackrequeue.DefaultRequeueAfterDuration, ) ) @@ -79,11 +78,9 @@ func (rm *resourceManager) customSetOutput(r *resource) { } for _, rule := range r.ko.Status.ProfilerRuleEvaluationStatuses { - if ackcompare.IsNotNil(r.ko.Status.ProfilingStatus) { + if ackcompare.IsNotNil(r.ko.Status.ProfilingStatus) && *r.ko.Status.ProfilingStatus == "Disabled" { // Sometimes rule evaluation status will stay in InProgress state. - if *r.ko.Status.ProfilingStatus == "Disabled" { - break - } + break } if rule.RuleEvaluationStatus != nil && svccommon.IsModifyingStatus(rule.RuleEvaluationStatus, &ruleModifyingStatuses) { svccommon.SetSyncedCondition(r, rule.RuleEvaluationStatus, aws.String("ProfilerRule"), &ruleModifyingStatuses) @@ -113,51 +110,40 @@ func (rm *resourceManager) customSetOutput(r *resource) { } -// customSetOutputUpdateWarmpool makes the controller requeue if there is an update and -// the training job is still in InProgress -func customSetOutputUpdateWarmpool(r *resource) error { - trainingJobStatus := r.ko.Status.TrainingJobStatus - if ackcompare.IsNotNil(trainingJobStatus) && *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { - return requeueBeforeUpdate - } - return nil -} - -// warmPoolTerminalCheck checks if warm pool has reached a state where it is not updateable -func warmPoolTerminalCheck(latest *resource) bool { +// isWarmPoolUpdateable returns a requeue or terminal error depending on the warmpool/training job state +func (rm *resourceManager) isWarmPoolUpdatable(latest *resource) error { trainingJobStatus := 
latest.ko.Status.TrainingJobStatus - if ackcompare.IsNotNil(latest.ko.Spec.ResourceConfig) { - if ackcompare.IsNil(latest.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) { - return true // Warm pool can only be updated iff there is a provisioned cluster. - } - } else { - return false + if ackcompare.IsNil(latest.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) { + return ackerr.TerminalError("warm pool does not exist") } - if ackcompare.IsNotNil(trainingJobStatus) { if *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { - return false + return requeueBeforeUpdate } if *trainingJobStatus == svcsdk.TrainingJobStatusCompleted { if ackcompare.IsNotNil(latest.ko.Status.WarmPoolStatus) { wp_modifying := svccommon.IsModifyingStatus(latest.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) - return !wp_modifying + if wp_modifying { + return nil + } else { + return ackerr.TerminalError("warm pool is in a non updateable state") + } } else { - return false // Sometimes the API (briefly) does not return the WP status even if it completes. + return nil // Sometimes the API (briefly) does not return the WP status even if it completes. } } else { // Training Job is in 'Failed'|'Stopping'|'Stopped' (Terminal) - return true + return ackerr.TerminalError("warm pool is in a non updateable state") } + } + return nil - // ACK OIDC is misconfigured (Terminal) - return true } // customSetOutputUpdateProfiler decides whether the training job is ready/eligible for update // depending on the status. 
-func customSetOutputUpdateProfiler(r *resource) error { +func (rm *resourceManager) customSetOutputUpdateProfiler(r *resource) error { trainingSecondaryStatus := r.ko.Status.SecondaryStatus trainingJobStatus := r.ko.Status.TrainingJobStatus if ackcompare.IsNotNil(trainingSecondaryStatus) && *trainingSecondaryStatus == svcsdk.SecondaryStatusStarting { @@ -173,8 +159,8 @@ func customSetOutputUpdateProfiler(r *resource) error { return nil } -// profilerRemovalCheck checks if the profiler was removed. -func profilerRemovalCheck(desired *resource, latest *resource) bool { +// isProfilerRemoved checks if the profiler was removed. +func (rm *resourceManager) isProfilerRemoved(desired *resource, latest *resource) bool { if ackcompare.IsNil(desired.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerRuleConfigurations) { return true } @@ -184,15 +170,22 @@ func profilerRemovalCheck(desired *resource, latest *resource) bool { return false } -// customSetOutputPostUpdate sets the synced condition at the end of the update. -func customSetOutputPostUpdate(ko *svcapitypes.TrainingJob, delta *ackcompare.Delta) { - warmpool_diff := delta.DifferentAt("Spec.ResourceConfig.KeepAlivePeriodInSeconds") - profiler_diff := delta.DifferentAt("Spec.ProfilerConfig") || delta.DifferentAt("Spec.ProfilerRuleConfigurations") - if profiler_diff { - svccommon.SetSyncedCondition(&resource{ko}, aws.String("InProgress"), &resourceName, &trainingJobModifyingStatuses) - } - if warmpool_diff { - svccommon.SetSyncedCondition(&resource{ko}, aws.String("Available"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) +// customSetUpdateInput modifies the input of UpdateTrainingJob. +// Three conditions: +// 1. Customer updates both profiler parameters: Recreate the input for profiler Rule. +// 2. Customer only updates Profiler Config: Set the profiler rule configuration to nil to avoid validation error. +// 3. 
Customer only updates Rule Configurations: Recreate the input for profiler Rule and set Profiler config to nil. +// safer to do this because the "only add" behavior might reappear. +func customSetUpdateInput(desired *resource, latest *resource, delta *ackcompare.Delta, input *svcsdk.UpdateTrainingJobInput) error { + if !delta.DifferentAt("Spec.ProfilerConfig") { + input.SetProfilerConfig(nil) + } + if !delta.DifferentAt("Spec.ProfilerRuleConfigurations") { + input.SetProfilerRuleConfigurations(nil) + } else { + err := buildProfilerRuleConfigUpdateInput(desired, latest, input) + return err } + return nil } diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index d9d8f754..7f7d65be 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -374,6 +374,11 @@ func (rm *resourceManager) sdkFind( } else { ko.Spec.InputDataConfig = nil } + if resp.LastModifiedTime != nil { + ko.Status.LastModifiedTime = &metav1.Time{*resp.LastModifiedTime} + } else { + ko.Status.LastModifiedTime = nil + } if resp.ModelArtifacts != nil { f19 := &svcapitypes.ModelArtifacts{} if resp.ModelArtifacts.S3ModelArtifacts != nil { @@ -1134,24 +1139,19 @@ func (rm *resourceManager) sdkUpdate( if warmpool_diff { input.SetProfilerConfig(nil) input.SetProfilerRuleConfigurations(nil) - warmpool_terminal := warmPoolTerminalCheck(latest) - if warmpool_terminal { - return latest, ackerr.NewTerminalError(errors.New("warm pool either does not exist or has reached a non updatable state")) - } - //Requeue if TrainingJob is in InProgress state - if err := customSetOutputUpdateWarmpool(latest); err != nil { + if err := rm.isWarmPoolUpdatable(latest); err != nil { return nil, err } } if profiler_diff { - if up_err := customSetOutputUpdateProfiler(latest); up_err != nil { + if up_err := rm.customSetOutputUpdateProfiler(latest); up_err != nil { return nil, up_err } input.SetResourceConfig(nil) - if profilerRemovalCheck(desired, latest) { - 
handleProfilerRemoval(input) + if rm.isProfilerRemoved(desired, latest) { + rm.handleProfilerRemoval(input) } else { - inp_err := customSetUpdateInput(desired, latest, delta, input) + inp_err := rm.customSetUpdateInput(desired, latest, delta, input) if inp_err != nil { return nil, err } @@ -1178,7 +1178,13 @@ func (rm *resourceManager) sdkUpdate( } rm.setStatusDefaults(ko) - customSetOutputPostUpdate(ko, delta) + observed, err := rm.sdkFind(ctx, latest) + if err != nil { + return observed, err + } + tmp_resource := &resource{ko} + tmp_resource.SetStatus(observed) + return &resource{ko}, nil } diff --git a/templates/training_job/sdk_update_post_build_request.go.tpl b/templates/training_job/sdk_update_post_build_request.go.tpl index bbc63400..70d3da9a 100644 --- a/templates/training_job/sdk_update_post_build_request.go.tpl +++ b/templates/training_job/sdk_update_post_build_request.go.tpl @@ -9,24 +9,19 @@ if !warmpool_diff && !profiler_diff { if warmpool_diff { input.SetProfilerConfig(nil) input.SetProfilerRuleConfigurations(nil) - warmpool_terminal := warmPoolTerminalCheck(latest) - if warmpool_terminal { - return latest, ackerr.NewTerminalError(errors.New("warm pool either does not exist or has reached a non updatable state")) - } - //Requeue if TrainingJob is in InProgress state - if err := customSetOutputUpdateWarmpool(latest); err != nil { - return nil,err + if err := rm.isWarmPoolUpdatable(latest); err != nil { + return nil, err } } if profiler_diff { - if up_err := customSetOutputUpdateProfiler(latest); up_err != nil { + if up_err := rm.customSetOutputUpdateProfiler(latest); up_err != nil { return nil, up_err } input.SetResourceConfig(nil) - if profilerRemovalCheck(desired, latest) { - handleProfilerRemoval(input) + if rm.isProfilerRemoved(desired, latest) { + rm.handleProfilerRemoval(input) } else{ - inp_err := customSetUpdateInput(desired, latest, delta, input) + inp_err := rm.customSetUpdateInput(desired, latest, delta, input) if inp_err != nil { return 
nil, err } diff --git a/templates/training_job/sdk_update_post_set_output.go.tpl b/templates/training_job/sdk_update_post_set_output.go.tpl new file mode 100644 index 00000000..f54805e3 --- /dev/null +++ b/templates/training_job/sdk_update_post_set_output.go.tpl @@ -0,0 +1,6 @@ +observed, err := rm.sdkFind(ctx, latest) +if err != nil { + return observed, err +} +tmp_resource := &resource{ko} +tmp_resource.SetStatus(observed) From 77f9918c4a792e653eb75481cb095aff92817964 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 2 Dec 2022 01:18:07 +0000 Subject: [PATCH 42/71] unit test and general code changes --- apis/v1alpha1/ack-generate-metadata.yaml | 6 +++--- apis/v1alpha1/generator.yaml | 5 +++++ apis/v1alpha1/training_job.go | 3 +++ apis/v1alpha1/zz_generated.deepcopy.go | 4 ++++ ...gemaker.services.k8s.aws_trainingjobs.yaml | 5 +++++ generator.yaml | 5 +++++ ...gemaker.services.k8s.aws_trainingjobs.yaml | 5 +++++ .../training_job/custom_set_update_input.go | 20 +++++++++---------- pkg/resource/training_job/hooks.go | 12 ++++++----- pkg/resource/training_job/sdk.go | 5 +++++ .../v1alpha1/readone/observed/completed.yaml | 2 ++ .../completed_debugger_variation.yaml | 4 +++- .../observed/conditions_clear_on_success.yaml | 2 ++ .../v1alpha1/readone/observed/created.yaml | 2 ++ .../observed/created_debugger_variation.yaml | 2 ++ .../observed/created_instance_groups.yaml | 4 +++- .../readone/observed/late_initialize.yaml | 2 ++ .../v1alpha1/readone/observed/stopping.yaml | 2 ++ .../observed/stopping_debugger_variation.yaml | 2 ++ .../readone/observed/warmpool_available.yaml | 4 +++- .../readone/observed/warmpool_inuse.yaml | 4 +++- .../readone/observed/warmpool_nostatus.yaml | 4 +++- .../readone/observed/warmpool_reused.yaml | 4 +++- .../readone/observed/warmpool_terminated.yaml | 2 ++ 24 files changed, 86 insertions(+), 24 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 62b95d8e..2d1d31ae 100755 --- 
a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,13 +1,13 @@ ack_generate_info: - build_date: "2022-12-02T00:18:37Z" + build_date: "2022-12-02T00:59:13Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc -api_directory_checksum: c512f6809b1f599b720320cf3980bfaab609eef7 +api_directory_checksum: 368ed3f24a6e88c5d29d44cc0a7ce60e35152d50 api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: 8a919b614dd3e79aae4bdc5afb9f8c4b1fc8e8db + file_checksum: d56d093045a7f6f17808fa8ef603f2dfb9945519 original_file_name: generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index 00872904..8318ee1c 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -191,6 +191,11 @@ resources: from: operation: DescribeTrainingJob path: LastModifiedTime + CreationTime: + is_read_only: true + from: + operation: DescribeTrainingJob + path: CreationTime AlgorithmSpecification.MetricDefinitions: compare: is_ignored: true diff --git a/apis/v1alpha1/training_job.go b/apis/v1alpha1/training_job.go index 50fcc994..d1302c79 100644 --- a/apis/v1alpha1/training_job.go +++ b/apis/v1alpha1/training_job.go @@ -173,6 +173,9 @@ type TrainingJobStatus struct { // resource // +kubebuilder:validation:Optional Conditions []*ackv1alpha1.Condition `json:"conditions"` + // A timestamp that indicates when the training job was created. + // +kubebuilder:validation:Optional + CreationTime *metav1.Time `json:"creationTime,omitempty"` // Evaluation status of Debugger rules for debugging on a training job. 
// +kubebuilder:validation:Optional DebugRuleEvaluationStatuses []*DebugRuleEvaluationStatus `json:"debugRuleEvaluationStatuses,omitempty"` diff --git a/apis/v1alpha1/zz_generated.deepcopy.go b/apis/v1alpha1/zz_generated.deepcopy.go index f3292c33..59ed0bdb 100644 --- a/apis/v1alpha1/zz_generated.deepcopy.go +++ b/apis/v1alpha1/zz_generated.deepcopy.go @@ -12773,6 +12773,10 @@ func (in *TrainingJobStatus) DeepCopyInto(out *TrainingJobStatus) { } } } + if in.CreationTime != nil { + in, out := &in.CreationTime, &out.CreationTime + *out = (*in).DeepCopy() + } if in.DebugRuleEvaluationStatuses != nil { in, out := &in.DebugRuleEvaluationStatuses, &out.DebugRuleEvaluationStatuses *out = make([]*DebugRuleEvaluationStatus, len(*in)) diff --git a/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml b/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml index e4913b44..48305bcb 100644 --- a/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -598,6 +598,11 @@ spec: - type type: object type: array + creationTime: + description: A timestamp that indicates when the training job was + created. + format: date-time + type: string debugRuleEvaluationStatuses: description: Evaluation status of Debugger rules for debugging on a training job. 
diff --git a/generator.yaml b/generator.yaml index 00872904..8318ee1c 100644 --- a/generator.yaml +++ b/generator.yaml @@ -191,6 +191,11 @@ resources: from: operation: DescribeTrainingJob path: LastModifiedTime + CreationTime: + is_read_only: true + from: + operation: DescribeTrainingJob + path: CreationTime AlgorithmSpecification.MetricDefinitions: compare: is_ignored: true diff --git a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml index e4913b44..48305bcb 100644 --- a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -598,6 +598,11 @@ spec: - type type: object type: array + creationTime: + description: A timestamp that indicates when the training job was + created. + format: date-time + type: string debugRuleEvaluationStatuses: description: Evaluation status of Debugger rules for debugging on a training job. diff --git a/pkg/resource/training_job/custom_set_update_input.go b/pkg/resource/training_job/custom_set_update_input.go index 963fe09e..20b9f472 100644 --- a/pkg/resource/training_job/custom_set_update_input.go +++ b/pkg/resource/training_job/custom_set_update_input.go @@ -61,7 +61,7 @@ func buildProfilerRuleConfigUpdateInput(desired *resource, latest *resource, inp } // handleProfilerRemoval sets the input parameters to disable the profiler. -func handleProfilerRemoval(input *svcsdk.UpdateTrainingJobInput) { +func (rm *resourceManager) handleProfilerRemoval(input *svcsdk.UpdateTrainingJobInput) { input.SetProfilerRuleConfigurations(nil) profilerConfig := svcsdk.ProfilerConfigForUpdate{} profilerConfig.SetDisableProfiler(true) @@ -72,18 +72,18 @@ func handleProfilerRemoval(input *svcsdk.UpdateTrainingJobInput) { // a type that is compatible with the AWS API. // Sagemaker and kubernetes types are not the same so the input has to be reconstructed. 
func convertProfileRuleType(rule *svcapitypes.ProfilerRuleConfiguration) *svcsdk.ProfilerRuleConfiguration { - rule := &svcsdk.ProfilerRuleConfiguration{} + smRule := &svcsdk.ProfilerRuleConfiguration{} if rule.InstanceType != nil { - rule.SetInstanceType(*rule.InstanceType) + smRule.SetInstanceType(*rule.InstanceType) } if rule.LocalPath != nil { - rule.SetLocalPath(*rule.LocalPath) + smRule.SetLocalPath(*rule.LocalPath) } if rule.RuleConfigurationName != nil { - rule.SetRuleConfigurationName(*rule.RuleConfigurationName) + smRule.SetRuleConfigurationName(*rule.RuleConfigurationName) } if rule.RuleEvaluatorImage != nil { - rule.SetRuleEvaluatorImage(*rule.RuleEvaluatorImage) + smRule.SetRuleEvaluatorImage(*rule.RuleEvaluatorImage) } if rule.RuleParameters != nil { f1elemf4 := map[string]*string{} @@ -92,13 +92,13 @@ func convertProfileRuleType(rule *svcapitypes.ProfilerRuleConfiguration) *svcsdk f1elemf4val = *f1elemf4valiter f1elemf4[f1elemf4key] = &f1elemf4val } - rule.SetRuleParameters(f1elemf4) + smRule.SetRuleParameters(f1elemf4) } if rule.S3OutputPath != nil { - rule.SetS3OutputPath(*rule.S3OutputPath) + smRule.SetS3OutputPath(*rule.S3OutputPath) } if rule.VolumeSizeInGB != nil { - rule.SetVolumeSizeInGB(*rule.VolumeSizeInGB) + smRule.SetVolumeSizeInGB(*rule.VolumeSizeInGB) } - return rule + return smRule } diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index d23cf748..b7bc50aa 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -114,7 +114,7 @@ func (rm *resourceManager) customSetOutput(r *resource) { func (rm *resourceManager) isWarmPoolUpdatable(latest *resource) error { trainingJobStatus := latest.ko.Status.TrainingJobStatus if ackcompare.IsNil(latest.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) { - return ackerr.TerminalError("warm pool does not exist") + return ackerr.NewTerminalError(errors.New("warm pool does not exist")) } if ackcompare.IsNotNil(trainingJobStatus) { if 
*trainingJobStatus == svcsdk.TrainingJobStatusInProgress { @@ -126,14 +126,16 @@ func (rm *resourceManager) isWarmPoolUpdatable(latest *resource) error { if wp_modifying { return nil } else { - return ackerr.TerminalError("warm pool is in a non updateable state") + return ackerr.NewTerminalError(errors.New("warm pool is in a non updateable state")) } } else { - return nil // Sometimes the API (briefly) does not return the WP status even if it completes. + // Sometimes the API (briefly) does not return the WP status even if it completes. + // This only occurs for a short time after training job has reached Completed state. + return requeueBeforeUpdate } } else { // Training Job is in 'Failed'|'Stopping'|'Stopped' (Terminal) - return ackerr.TerminalError("warm pool is in a non updateable state") + return ackerr.NewTerminalError(errors.New("warm pool is in a non updateable state")) } } @@ -176,7 +178,7 @@ func (rm *resourceManager) isProfilerRemoved(desired *resource, latest *resource // 2. Customer only updates Profiler Config: Set the profiler rule configuration to nil to avoid validation error. // 3. Customer only updates Rule Configurations: Recreate the input for profiler Rule and set Profiler config to nil. // safer to do this because the "only add" behavior might reappear. 
-func customSetUpdateInput(desired *resource, latest *resource, delta *ackcompare.Delta, input *svcsdk.UpdateTrainingJobInput) error { +func (rm *resourceManager) customSetUpdateInput(desired *resource, latest *resource, delta *ackcompare.Delta, input *svcsdk.UpdateTrainingJobInput) error { if !delta.DifferentAt("Spec.ProfilerConfig") { input.SetProfilerConfig(nil) } diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 7f7d65be..5d978b78 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -131,6 +131,11 @@ func (rm *resourceManager) sdkFind( } else { ko.Spec.CheckpointConfig = nil } + if resp.CreationTime != nil { + ko.Status.CreationTime = &metav1.Time{*resp.CreationTime} + } else { + ko.Status.CreationTime = nil + } if resp.DebugHookConfig != nil { f5 := &svcapitypes.DebugHookConfig{} if resp.DebugHookConfig.CollectionConfigurations != nil { diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed.yaml index c086d9a3..1f637fbc 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed.yaml @@ -109,3 +109,5 @@ status: secondaryStatus: Completed trainingJobStatus: Completed profilingStatus: Disabled + creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: "2021-10-12T05:55:25.548Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed_debugger_variation.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed_debugger_variation.yaml index 6b78b3d2..768276c3 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed_debugger_variation.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/completed_debugger_variation.yaml @@ -150,4 +150,6 @@ status: ruleEvaluationStatus: Completed 
secondaryStatus: Completed trainingJobStatus: Completed - profilingStatus: Enabled \ No newline at end of file + profilingStatus: Enabled + creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: "2021-10-12T05:52:46.108Z" \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/conditions_clear_on_success.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/conditions_clear_on_success.yaml index b4431657..af42172b 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/conditions_clear_on_success.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/conditions_clear_on_success.yaml @@ -109,3 +109,5 @@ status: secondaryStatus: Downloading trainingJobStatus: InProgress profilingStatus: Disabled + creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: "2021-10-12T05:52:46.108Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created.yaml index ace5bc9c..72604ecc 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created.yaml @@ -107,3 +107,5 @@ status: secondaryStatus: Downloading trainingJobStatus: InProgress profilingStatus: Disabled + creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: "2021-10-12T05:52:46.108Z" \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_debugger_variation.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_debugger_variation.yaml index 006d0382..a11d3d18 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_debugger_variation.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_debugger_variation.yaml @@ -151,3 +151,5 @@ status: secondaryStatus: Starting trainingJobStatus: InProgress 
profilingStatus: Enabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_instance_groups.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_instance_groups.yaml index 4e6d9d16..602eb225 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_instance_groups.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/created_instance_groups.yaml @@ -114,4 +114,6 @@ status: type: ACK.ResourceSynced secondaryStatus: Downloading trainingJobStatus: InProgress - profilingStatus: Disabled \ No newline at end of file + profilingStatus: Disabled + creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: "2021-10-12T05:52:46.108Z" \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/late_initialize.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/late_initialize.yaml index 7d41233d..fa34a587 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/late_initialize.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/late_initialize.yaml @@ -111,3 +111,5 @@ status: secondaryStatus: Downloading trainingJobStatus: InProgress profilingStatus: Disabled + creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: "2021-10-12T05:52:46.108Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping.yaml index bbd786d2..b0ee322c 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping.yaml @@ -107,3 +107,5 @@ status: secondaryStatus: Starting trainingJobStatus: Stopping profilingStatus: Disabled + creationTime: "2021-10-13T05:12:45.869Z" + lastModifiedTime: 
"2021-10-13T05:12:58.956Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping_debugger_variation.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping_debugger_variation.yaml index 0558f186..288106dc 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping_debugger_variation.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/stopping_debugger_variation.yaml @@ -153,3 +153,5 @@ status: secondaryStatus: Starting trainingJobStatus: Stopping profilingStatus: Enabled + creationTime: "2021-10-13T05:48:47.342Z" + lastModifiedTime: "2021-10-13T05:49:11.155Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml index 9819560f..bbe90085 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml @@ -109,4 +109,6 @@ status: status: Available secondaryStatus: Completed trainingJobStatus: Completed - profilingStatus: Disabled \ No newline at end of file + profilingStatus: Disabled + creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: "2021-10-12T05:52:46.108Z" \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml index f7acc768..7d85d078 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_inuse.yaml @@ -109,4 +109,6 @@ status: status: InUse secondaryStatus: Starting trainingJobStatus: InProgress - profilingStatus: Disabled \ No newline at end of file + profilingStatus: Disabled + creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: 
"2021-10-12T05:52:46.108Z" \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml index 028300ac..cf2e2583 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml @@ -107,4 +107,6 @@ status: type: ACK.ResourceSynced secondaryStatus: Completed trainingJobStatus: Completed - profilingStatus: Disabled \ No newline at end of file + profilingStatus: Disabled + creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: "2021-10-12T05:52:46.108Z" \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml index 2cd053eb..df232296 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_reused.yaml @@ -111,4 +111,6 @@ status: resourceRetainedBillableTimeInSeconds: 69 secondaryStatus: Completed trainingJobStatus: Completed - profilingStatus: Disabled \ No newline at end of file + profilingStatus: Disabled + creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: "2021-10-12T05:52:46.108Z" \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml index b0d56c11..d36fd070 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_terminated.yaml @@ -111,4 +111,6 @@ status: secondaryStatus: Completed trainingJobStatus: Completed profilingStatus: Disabled + 
creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: "2021-10-12T05:52:46.108Z" From 63e716fde93bd10b50f903ac842e5abbca650bed Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 2 Dec 2022 02:17:17 +0000 Subject: [PATCH 43/71] updated unit tests --- .../update/describe/bothprofiler_update.json | 224 ++++++++++++++++++ .../describe/profiler_config_update.json | 206 ++++++++++++++++ .../update/describe/profiler_removal.json | 206 ++++++++++++++++ .../update/describe/profilerrule_update.json | 224 ++++++++++++++++++ .../sdkapi/update/describe/warmpool.json | 161 +++++++++++++ .../training_job/testdata/test_suite.yaml | 18 +- .../update/observed/removeProfilerBoth.yaml | 9 + .../update/observed/removeProfilerConfig.yaml | 10 + .../update/observed/removeProfilerRule.yaml | 11 +- .../update/observed/updateProfilerBoth.yaml | 12 + .../update/observed/updateProfilerConfig.yaml | 9 + .../update/observed/updateProfilerRule.yaml | 13 + .../v1alpha1/update/observed/updateWP.yaml | 9 +- 13 files changed, 1109 insertions(+), 3 deletions(-) create mode 100644 pkg/resource/training_job/testdata/sdkapi/update/describe/bothprofiler_update.json create mode 100644 pkg/resource/training_job/testdata/sdkapi/update/describe/profiler_config_update.json create mode 100644 pkg/resource/training_job/testdata/sdkapi/update/describe/profiler_removal.json create mode 100644 pkg/resource/training_job/testdata/sdkapi/update/describe/profilerrule_update.json create mode 100644 pkg/resource/training_job/testdata/sdkapi/update/describe/warmpool.json diff --git a/pkg/resource/training_job/testdata/sdkapi/update/describe/bothprofiler_update.json b/pkg/resource/training_job/testdata/sdkapi/update/describe/bothprofiler_update.json new file mode 100644 index 00000000..333c3b70 --- /dev/null +++ b/pkg/resource/training_job/testdata/sdkapi/update/describe/bothprofiler_update.json @@ -0,0 +1,224 @@ +{ + "AlgorithmSpecification": { + "AlgorithmName": null, + "EnableSageMakerMetricsTimeSeries": false, 
+ "MetricDefinitions": [ + { + "Name": "train:mae", + "Regex": ".*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:auc", + "Regex": ".*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:merror", + "Regex": ".*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:auc", + "Regex": ".*\\[[0-9]+\\].*#011train-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mae", + "Regex": ".*\\[[0-9]+\\].*#011validation-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:error", + "Regex": ".*\\[[0-9]+\\].*#011validation-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:merror", + "Regex": ".*\\[[0-9]+\\].*#011validation-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:logloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:rmse", + "Regex": ".*\\[[0-9]+\\].*#011train-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:logloss", + "Regex": ".*\\[[0-9]+\\].*#011train-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011train-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:rmse", + "Regex": ".*\\[[0-9]+\\].*#011validation-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011validation-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:error", + "Regex": ".*\\[[0-9]+\\].*#011train-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": 
"train:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011train-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:map", + "Regex": ".*\\[[0-9]+\\].*#011train-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:map", + "Regex": ".*\\[[0-9]+\\].*#011validation-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + } + ], + "TrainingImage": "433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "TrainingInputMode": "File" + }, + "AutoMLJobArn": null, + "BillableTimeInSeconds": null, + "CheckpointConfig": null, + "CreationTime": "2021-10-13T03:49:20.337Z", + "DebugHookConfig": null, + "DebugRuleConfigurations": null, + "DebugRuleEvaluationStatuses": null, + "EnableInterContainerTrafficEncryption": false, + "EnableManagedSpotTraining": false, + "EnableNetworkIsolation": false, + "Environment": null, + "ExperimentConfig": null, + "FailureReason": null, + "FinalMetricDataList": null, + "HyperParameters": { + "eta": "0.2", + "gamma": "4", + "max_depth": "5", + "min_child_weight": "6", + "num_round": "51", + "objective": "reg:squarederror", + "silent": "0", + "subsample": "0.7" + }, + "InputDataConfig": [ + { + "ChannelName": "train", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "FileSystemDataSource": null, + "S3DataSource": { + "AttributeNames": null, + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train" + } + }, + "InputMode": null, + "RecordWrapperType": "None", + "ShuffleConfig": null + }, + { + "ChannelName": "validation", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "FileSystemDataSource": null, + "S3DataSource": { + "AttributeNames": null, + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation" + } + }, + "InputMode": null, + 
"RecordWrapperType": "None", + "ShuffleConfig": null + } + ], + "LabelingJobArn": null, + "LastModifiedTime": "2021-10-13T03:49:20.576Z", + "ModelArtifacts": null, + "OutputDataConfig": { + "KmsKeyId": "", + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/debugger/output" + }, + "ProfilerConfig": { + "ProfilingIntervalInMilliseconds": 200, + "ProfilingParameters": null, + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/" + }, + "ProfilerRuleConfigurations": [ + { + "InstanceType": null, + "LocalPath": null, + "RuleConfigurationName": "ProfilerReport", + "RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "RuleParameters": { + "rule_to_invoke": "ProfilerReport" + }, + "S3OutputPath": null, + "VolumeSizeInGB": 0 + }, + { + "InstanceType": null, + "LocalPath": null, + "RuleConfigurationName": "CPUBottleneck", + "RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "RuleParameters": { + "rule_to_invoke": "CPUBottleneck" + }, + "S3OutputPath": null, + "VolumeSizeInGB": 0 + } + ], + "ProfilerRuleEvaluationStatuses": [ + { + "LastModifiedTime": "2021-10-13T03:49:20.337Z", + "RuleConfigurationName": "ProfilerReport", + "RuleEvaluationJobArn": null, + "RuleEvaluationStatus": "InProgress", + "StatusDetails": null + }, + { + "LastModifiedTime": "2021-10-13T03:49:20.337Z", + "RuleConfigurationName": "CPUBottleneck", + "RuleEvaluationJobArn": null, + "RuleEvaluationStatus": "InProgress", + "StatusDetails": null + } + ], + "ProfilingStatus": "Enabled", + "ResourceConfig": { + "InstanceCount": 1, + "InstanceType": "ml.m4.xlarge", + "VolumeKmsKeyId": null, + "VolumeSizeInGB": 5 + }, + "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker-ExecutionRole-20210920T111639", + "SecondaryStatus": "Downloading", + "SecondaryStatusTransitions": [ + { + "EndTime": null, + "StartTime": "2021-10-13T03:49:20.337Z", + "Status": "Starting", + 
"StatusMessage": "Starting the training job" + } + ], + "StoppingCondition": { + "MaxRuntimeInSeconds": 86400, + "MaxWaitTimeInSeconds": null + }, + "TensorBoardOutputConfig": null, + "TrainingEndTime": null, + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test", + "TrainingJobName": "training-test", + "TrainingJobStatus": "InProgress", + "TrainingStartTime": null, + "TrainingTimeInSeconds": null, + "TuningJobArn": null, + "VpcConfig": null +} \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/sdkapi/update/describe/profiler_config_update.json b/pkg/resource/training_job/testdata/sdkapi/update/describe/profiler_config_update.json new file mode 100644 index 00000000..a2436cd9 --- /dev/null +++ b/pkg/resource/training_job/testdata/sdkapi/update/describe/profiler_config_update.json @@ -0,0 +1,206 @@ +{ + "AlgorithmSpecification": { + "AlgorithmName": null, + "EnableSageMakerMetricsTimeSeries": false, + "MetricDefinitions": [ + { + "Name": "train:mae", + "Regex": ".*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:auc", + "Regex": ".*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:merror", + "Regex": ".*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:auc", + "Regex": ".*\\[[0-9]+\\].*#011train-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mae", + "Regex": ".*\\[[0-9]+\\].*#011validation-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:error", + "Regex": ".*\\[[0-9]+\\].*#011validation-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:merror", + "Regex": ".*\\[[0-9]+\\].*#011validation-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:logloss", + "Regex": 
".*\\[[0-9]+\\].*#011validation-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:rmse", + "Regex": ".*\\[[0-9]+\\].*#011train-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:logloss", + "Regex": ".*\\[[0-9]+\\].*#011train-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011train-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:rmse", + "Regex": ".*\\[[0-9]+\\].*#011validation-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011validation-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:error", + "Regex": ".*\\[[0-9]+\\].*#011train-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011train-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:map", + "Regex": ".*\\[[0-9]+\\].*#011train-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:map", + "Regex": ".*\\[[0-9]+\\].*#011validation-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + } + ], + "TrainingImage": "433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "TrainingInputMode": "File" + }, + "AutoMLJobArn": null, + "BillableTimeInSeconds": null, + "CheckpointConfig": null, + "CreationTime": "2021-10-13T03:49:20.337Z", + "DebugHookConfig": null, + "DebugRuleConfigurations": null, + "DebugRuleEvaluationStatuses": null, + "EnableInterContainerTrafficEncryption": false, + "EnableManagedSpotTraining": false, + "EnableNetworkIsolation": false, + "Environment": null, + "ExperimentConfig": null, + "FailureReason": null, + "FinalMetricDataList": null, + "HyperParameters": { + "eta": "0.2", 
+ "gamma": "4", + "max_depth": "5", + "min_child_weight": "6", + "num_round": "51", + "objective": "reg:squarederror", + "silent": "0", + "subsample": "0.7" + }, + "InputDataConfig": [ + { + "ChannelName": "train", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "FileSystemDataSource": null, + "S3DataSource": { + "AttributeNames": null, + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train" + } + }, + "InputMode": null, + "RecordWrapperType": "None", + "ShuffleConfig": null + }, + { + "ChannelName": "validation", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "FileSystemDataSource": null, + "S3DataSource": { + "AttributeNames": null, + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation" + } + }, + "InputMode": null, + "RecordWrapperType": "None", + "ShuffleConfig": null + } + ], + "LabelingJobArn": null, + "LastModifiedTime": "2021-10-13T03:49:20.576Z", + "ModelArtifacts": null, + "OutputDataConfig": { + "KmsKeyId": "", + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/debugger/output" + }, + "ProfilerConfig": { + "ProfilingIntervalInMilliseconds": 200, + "ProfilingParameters": null, + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/" + }, + "ProfilerRuleConfigurations": [ + { + "InstanceType": null, + "LocalPath": null, + "RuleConfigurationName": "ProfilerReport", + "RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "RuleParameters": { + "rule_to_invoke": "ProfilerReport" + }, + "S3OutputPath": null, + "VolumeSizeInGB": 0 + } + ], + "ProfilerRuleEvaluationStatuses": [ + { + "LastModifiedTime": "2021-10-13T03:49:20.337Z", + "RuleConfigurationName": "ProfilerReport", + 
"RuleEvaluationJobArn": null, + "RuleEvaluationStatus": "InProgress", + "StatusDetails": null + } + ], + "ProfilingStatus": "Enabled", + "ResourceConfig": { + "InstanceCount": 1, + "InstanceType": "ml.m4.xlarge", + "VolumeKmsKeyId": null, + "VolumeSizeInGB": 5 + }, + "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker-ExecutionRole-20210920T111639", + "SecondaryStatus": "Downloading", + "SecondaryStatusTransitions": [ + { + "EndTime": null, + "StartTime": "2021-10-13T03:49:20.337Z", + "Status": "Starting", + "StatusMessage": "Starting the training job" + } + ], + "StoppingCondition": { + "MaxRuntimeInSeconds": 86400, + "MaxWaitTimeInSeconds": null + }, + "TensorBoardOutputConfig": null, + "TrainingEndTime": null, + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test", + "TrainingJobName": "training-test", + "TrainingJobStatus": "InProgress", + "TrainingStartTime": null, + "TrainingTimeInSeconds": null, + "TuningJobArn": null, + "VpcConfig": null +} \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/sdkapi/update/describe/profiler_removal.json b/pkg/resource/training_job/testdata/sdkapi/update/describe/profiler_removal.json new file mode 100644 index 00000000..564e12f0 --- /dev/null +++ b/pkg/resource/training_job/testdata/sdkapi/update/describe/profiler_removal.json @@ -0,0 +1,206 @@ +{ + "AlgorithmSpecification": { + "AlgorithmName": null, + "EnableSageMakerMetricsTimeSeries": false, + "MetricDefinitions": [ + { + "Name": "train:mae", + "Regex": ".*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:auc", + "Regex": ".*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:merror", + "Regex": ".*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:auc", + "Regex": 
".*\\[[0-9]+\\].*#011train-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mae", + "Regex": ".*\\[[0-9]+\\].*#011validation-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:error", + "Regex": ".*\\[[0-9]+\\].*#011validation-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:merror", + "Regex": ".*\\[[0-9]+\\].*#011validation-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:logloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:rmse", + "Regex": ".*\\[[0-9]+\\].*#011train-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:logloss", + "Regex": ".*\\[[0-9]+\\].*#011train-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011train-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:rmse", + "Regex": ".*\\[[0-9]+\\].*#011validation-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011validation-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:error", + "Regex": ".*\\[[0-9]+\\].*#011train-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011train-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:map", + "Regex": ".*\\[[0-9]+\\].*#011train-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:map", + "Regex": ".*\\[[0-9]+\\].*#011validation-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + } + ], + "TrainingImage": "433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + 
"TrainingInputMode": "File" + }, + "AutoMLJobArn": null, + "BillableTimeInSeconds": null, + "CheckpointConfig": null, + "CreationTime": "2021-10-13T03:49:20.337Z", + "DebugHookConfig": null, + "DebugRuleConfigurations": null, + "DebugRuleEvaluationStatuses": null, + "EnableInterContainerTrafficEncryption": false, + "EnableManagedSpotTraining": false, + "EnableNetworkIsolation": false, + "Environment": null, + "ExperimentConfig": null, + "FailureReason": null, + "FinalMetricDataList": null, + "HyperParameters": { + "eta": "0.2", + "gamma": "4", + "max_depth": "5", + "min_child_weight": "6", + "num_round": "51", + "objective": "reg:squarederror", + "silent": "0", + "subsample": "0.7" + }, + "InputDataConfig": [ + { + "ChannelName": "train", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "FileSystemDataSource": null, + "S3DataSource": { + "AttributeNames": null, + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train" + } + }, + "InputMode": null, + "RecordWrapperType": "None", + "ShuffleConfig": null + }, + { + "ChannelName": "validation", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "FileSystemDataSource": null, + "S3DataSource": { + "AttributeNames": null, + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation" + } + }, + "InputMode": null, + "RecordWrapperType": "None", + "ShuffleConfig": null + } + ], + "LabelingJobArn": null, + "LastModifiedTime": "2021-10-13T03:49:20.576Z", + "ModelArtifacts": null, + "OutputDataConfig": { + "KmsKeyId": "", + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/debugger/output" + }, + "ProfilerConfig": { + "ProfilingIntervalInMilliseconds": 500, + "ProfilingParameters": null, + "S3OutputPath": 
"s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/" + }, + "ProfilerRuleConfigurations": [ + { + "InstanceType": null, + "LocalPath": null, + "RuleConfigurationName": "ProfilerReport", + "RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "RuleParameters": { + "rule_to_invoke": "ProfilerReport" + }, + "S3OutputPath": null, + "VolumeSizeInGB": 0 + } + ], + "ProfilerRuleEvaluationStatuses": [ + { + "LastModifiedTime": "2021-10-13T03:49:20.337Z", + "RuleConfigurationName": "ProfilerReport", + "RuleEvaluationJobArn": null, + "RuleEvaluationStatus": "InProgress", + "StatusDetails": null + } + ], + "ProfilingStatus": "Disabled", + "ResourceConfig": { + "InstanceCount": 1, + "InstanceType": "ml.m4.xlarge", + "VolumeKmsKeyId": null, + "VolumeSizeInGB": 5 + }, + "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker-ExecutionRole-20210920T111639", + "SecondaryStatus": "Downloading", + "SecondaryStatusTransitions": [ + { + "EndTime": null, + "StartTime": "2021-10-13T03:49:20.337Z", + "Status": "Starting", + "StatusMessage": "Starting the training job" + } + ], + "StoppingCondition": { + "MaxRuntimeInSeconds": 86400, + "MaxWaitTimeInSeconds": null + }, + "TensorBoardOutputConfig": null, + "TrainingEndTime": null, + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test", + "TrainingJobName": "training-test", + "TrainingJobStatus": "InProgress", + "TrainingStartTime": null, + "TrainingTimeInSeconds": null, + "TuningJobArn": null, + "VpcConfig": null +} \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/sdkapi/update/describe/profilerrule_update.json b/pkg/resource/training_job/testdata/sdkapi/update/describe/profilerrule_update.json new file mode 100644 index 00000000..344a5074 --- /dev/null +++ b/pkg/resource/training_job/testdata/sdkapi/update/describe/profilerrule_update.json @@ -0,0 +1,224 @@ +{ + "AlgorithmSpecification": { + "AlgorithmName": 
null, + "EnableSageMakerMetricsTimeSeries": false, + "MetricDefinitions": [ + { + "Name": "train:mae", + "Regex": ".*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:auc", + "Regex": ".*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:merror", + "Regex": ".*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:auc", + "Regex": ".*\\[[0-9]+\\].*#011train-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mae", + "Regex": ".*\\[[0-9]+\\].*#011validation-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:error", + "Regex": ".*\\[[0-9]+\\].*#011validation-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:merror", + "Regex": ".*\\[[0-9]+\\].*#011validation-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:logloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:rmse", + "Regex": ".*\\[[0-9]+\\].*#011train-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:logloss", + "Regex": ".*\\[[0-9]+\\].*#011train-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011train-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:rmse", + "Regex": ".*\\[[0-9]+\\].*#011validation-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011validation-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:error", + "Regex": ".*\\[[0-9]+\\].*#011train-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mlogloss", + "Regex": 
".*\\[[0-9]+\\].*#011validation-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011train-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:map", + "Regex": ".*\\[[0-9]+\\].*#011train-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:map", + "Regex": ".*\\[[0-9]+\\].*#011validation-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + } + ], + "TrainingImage": "433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "TrainingInputMode": "File" + }, + "AutoMLJobArn": null, + "BillableTimeInSeconds": null, + "CheckpointConfig": null, + "CreationTime": "2021-10-13T03:49:20.337Z", + "DebugHookConfig": null, + "DebugRuleConfigurations": null, + "DebugRuleEvaluationStatuses": null, + "EnableInterContainerTrafficEncryption": false, + "EnableManagedSpotTraining": false, + "EnableNetworkIsolation": false, + "Environment": null, + "ExperimentConfig": null, + "FailureReason": null, + "FinalMetricDataList": null, + "HyperParameters": { + "eta": "0.2", + "gamma": "4", + "max_depth": "5", + "min_child_weight": "6", + "num_round": "51", + "objective": "reg:squarederror", + "silent": "0", + "subsample": "0.7" + }, + "InputDataConfig": [ + { + "ChannelName": "train", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "FileSystemDataSource": null, + "S3DataSource": { + "AttributeNames": null, + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train" + } + }, + "InputMode": null, + "RecordWrapperType": "None", + "ShuffleConfig": null + }, + { + "ChannelName": "validation", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "FileSystemDataSource": null, + "S3DataSource": { + "AttributeNames": null, + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": 
"s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation" + } + }, + "InputMode": null, + "RecordWrapperType": "None", + "ShuffleConfig": null + } + ], + "LabelingJobArn": null, + "LastModifiedTime": "2021-10-13T03:49:20.576Z", + "ModelArtifacts": null, + "OutputDataConfig": { + "KmsKeyId": "", + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/debugger/output" + }, + "ProfilerConfig": { + "ProfilingIntervalInMilliseconds": 500, + "ProfilingParameters": null, + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/" + }, + "ProfilerRuleConfigurations": [ + { + "InstanceType": null, + "LocalPath": null, + "RuleConfigurationName": "ProfilerReport", + "RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "RuleParameters": { + "rule_to_invoke": "ProfilerReport" + }, + "S3OutputPath": null, + "VolumeSizeInGB": 0 + }, + { + "InstanceType": null, + "LocalPath": null, + "RuleConfigurationName": "CPUBottleneck", + "RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "RuleParameters": { + "rule_to_invoke": "CPUBottleneck" + }, + "S3OutputPath": null, + "VolumeSizeInGB": 0 + } + ], + "ProfilerRuleEvaluationStatuses": [ + { + "LastModifiedTime": "2021-10-13T03:49:20.337Z", + "RuleConfigurationName": "ProfilerReport", + "RuleEvaluationJobArn": null, + "RuleEvaluationStatus": "InProgress", + "StatusDetails": null + }, + { + "LastModifiedTime": "2021-10-13T03:49:20.337Z", + "RuleConfigurationName": "CPUBottleneck", + "RuleEvaluationJobArn": null, + "RuleEvaluationStatus": "InProgress", + "StatusDetails": null + } + ], + "ProfilingStatus": "Enabled", + "ResourceConfig": { + "InstanceCount": 1, + "InstanceType": "ml.m4.xlarge", + "VolumeKmsKeyId": null, + "VolumeSizeInGB": 5 + }, + "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker-ExecutionRole-20210920T111639", + "SecondaryStatus": "Downloading", + 
"SecondaryStatusTransitions": [ + { + "EndTime": null, + "StartTime": "2021-10-13T03:49:20.337Z", + "Status": "Starting", + "StatusMessage": "Starting the training job" + } + ], + "StoppingCondition": { + "MaxRuntimeInSeconds": 86400, + "MaxWaitTimeInSeconds": null + }, + "TensorBoardOutputConfig": null, + "TrainingEndTime": null, + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test", + "TrainingJobName": "training-test", + "TrainingJobStatus": "InProgress", + "TrainingStartTime": null, + "TrainingTimeInSeconds": null, + "TuningJobArn": null, + "VpcConfig": null +} \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/sdkapi/update/describe/warmpool.json b/pkg/resource/training_job/testdata/sdkapi/update/describe/warmpool.json new file mode 100644 index 00000000..d85c1a57 --- /dev/null +++ b/pkg/resource/training_job/testdata/sdkapi/update/describe/warmpool.json @@ -0,0 +1,161 @@ +{ + "AlgorithmSpecification": { + "EnableSageMakerMetricsTimeSeries": false, + "MetricDefinitions": [ + { + "Name": "train:mae", + "Regex": ".*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:auc", + "Regex": ".*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:merror", + "Regex": ".*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:auc", + "Regex": ".*\\[[0-9]+\\].*#011train-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mae", + "Regex": ".*\\[[0-9]+\\].*#011validation-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:error", + "Regex": ".*\\[[0-9]+\\].*#011validation-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:merror", + "Regex": ".*\\[[0-9]+\\].*#011validation-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:logloss", + 
"Regex": ".*\\[[0-9]+\\].*#011validation-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:rmse", + "Regex": ".*\\[[0-9]+\\].*#011train-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:logloss", + "Regex": ".*\\[[0-9]+\\].*#011train-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011train-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:rmse", + "Regex": ".*\\[[0-9]+\\].*#011validation-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011validation-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:error", + "Regex": ".*\\[[0-9]+\\].*#011train-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:mlogloss", + "Regex": ".*\\[[0-9]+\\].*#011validation-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:ndcg", + "Regex": ".*\\[[0-9]+\\].*#011train-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "train:map", + "Regex": ".*\\[[0-9]+\\].*#011train-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + }, + { + "Name": "validation:map", + "Regex": ".*\\[[0-9]+\\].*#011validation-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*" + } + ], + "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/xgboost:1", + "TrainingInputMode": "File" + }, + "CreationTime": "2021-10-12T05:49:40.493Z", + "EnableInterContainerTrafficEncryption": false, + "EnableManagedSpotTraining": false, + "EnableNetworkIsolation": false, + "HyperParameters": { + "eta": "0.2", + "gamma": "4", + "max_depth": "5", + "min_child_weight": "6", + "num_class": "10", + "num_round": "10", + "objective": "multi:softmax", + "silent": "0" + }, + "InputDataConfig": [ + { + "ChannelName": "train", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": 
{ + "S3DataSource": { + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train" + } + }, + "RecordWrapperType": "None" + }, + { + "ChannelName": "validation", + "CompressionType": "None", + "ContentType": "text/csv", + "DataSource": { + "S3DataSource": { + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + "S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation" + } + }, + "RecordWrapperType": "None" + } + ], + "LastModifiedTime": "2021-10-12T05:52:46.108Z", + "OutputDataConfig": { + "KmsKeyId": "", + "S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output" + }, + "ProfilingStatus": "Disabled", + "ResourceConfig": { + "InstanceCount": 1, + "InstanceType": "ml.m4.xlarge", + "VolumeSizeInGB": 5, + "KeepAlivePeriodInSeconds": 70 + }, + "RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker", + "SecondaryStatus": "Completed", + "SecondaryStatusTransitions": [ + { + "EndTime": "2021-10-12T05:52:46.108Z", + "StartTime": "2021-10-12T05:49:40.493Z", + "Status": "Starting", + "StatusMessage": "Preparing the instances for training" + }, + { + "StartTime": "2021-10-12T05:52:46.108Z", + "Status": "Downloading", + "StatusMessage": "Downloading input data" + } + ], + "StoppingCondition": { + "MaxRuntimeInSeconds": 86400 + }, + "WarmPoolStatus": { + "Status":"Available" + }, + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test", + "TrainingJobName": "training-test", + "TrainingJobStatus": "Completed", + "TrainingStartTime": "2021-10-12T05:52:46.108Z", + "TrainingTimeInSeconds": 31 +} \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index 8d08b4b0..7fa704b3 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ 
b/pkg/resource/training_job/testdata/test_suite.yaml @@ -261,6 +261,8 @@ tests: svc_api: - operation: UpdateTrainingJobWithContext output_fixture: "sdkapi/update/update_sucess.json" + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/update/describe/profiler_removal.json" invoke: Update expect: latest_state: "v1alpha1/update/observed/removeProfilerBoth.yaml" @@ -273,6 +275,8 @@ tests: svc_api: - operation: UpdateTrainingJobWithContext output_fixture: "sdkapi/update/update_sucess.json" + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/update/describe/profiler_removal.json" invoke: Update expect: latest_state: "v1alpha1/update/observed/removeProfilerRule.yaml" @@ -285,6 +289,8 @@ tests: svc_api: - operation: UpdateTrainingJobWithContext output_fixture: "sdkapi/update/update_sucess.json" + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/update/describe/profiler_removal.json" invoke: Update expect: latest_state: "v1alpha1/update/observed/removeProfilerConfig.yaml" @@ -297,6 +303,8 @@ tests: svc_api: - operation: UpdateTrainingJobWithContext output_fixture: "sdkapi/update/update_sucess.json" + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/update/describe/bothprofiler_update.json" invoke: Update expect: latest_state: "v1alpha1/update/observed/updateProfilerBoth.yaml" @@ -309,6 +317,8 @@ tests: svc_api: - operation: UpdateTrainingJobWithContext output_fixture: "sdkapi/update/update_sucess.json" + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/update/describe/profiler_config_update.json" invoke: Update expect: latest_state: "v1alpha1/update/observed/updateProfilerConfig.yaml" @@ -320,7 +330,7 @@ tests: latest_state: "v1alpha1/update/latest/profilerStarting.yaml" invoke: Update expect: - error: controller cannot update while secondary status is in Starting state. + error: training job cannot be updated while secondary status is in Starting state. 
- name: "Update=ProfilerTerminal" description: "Tests if profiler returns a terminal error when TJ is updated in Completed." given: @@ -337,6 +347,8 @@ tests: svc_api: - operation: UpdateTrainingJobWithContext output_fixture: "sdkapi/update/update_sucess.json" + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/update/describe/warmpool.json" invoke: Update expect: latest_state: "v1alpha1/update/observed/updateWP.yaml" @@ -364,6 +376,8 @@ tests: svc_api: - operation: UpdateTrainingJobWithContext output_fixture: "sdkapi/update/update_sucess.json" + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/update/describe/profiler_config_update.json" invoke: Update expect: latest_state: "v1alpha1/update/observed/updateProfilerConfig.yaml" @@ -400,6 +414,8 @@ tests: svc_api: - operation: UpdateTrainingJobWithContext output_fixture: "sdkapi/update/update_sucess.json" + - operation: DescribeTrainingJobWithContext + output_fixture: "sdkapi/update/describe/profilerrule_update.json" invoke: Update expect: latest_state: "v1alpha1/update/observed/updateProfilerRule.yaml" diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml index d4a26189..20ba335d 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml @@ -43,3 +43,12 @@ status: message: TrainingJob is in InProgress status. 
status: "False" type: ACK.ResourceSynced + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Disabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml index 190c8fb3..8f6ce5e7 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml @@ -48,3 +48,13 @@ status: message: TrainingJob is in InProgress status. status: "False" type: ACK.ResourceSynced + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Disabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" + diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml index e532e87a..6006d15c 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml @@ -45,4 +45,13 @@ status: - lastTransitionTime: "0001-01-01T00:00:00Z" message: TrainingJob is in InProgress status. 
status: "False" - type: ACK.ResourceSynced \ No newline at end of file + type: ACK.ResourceSynced + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Disabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml index 0283c287..fe53e0cc 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml @@ -55,3 +55,15 @@ status: message: TrainingJob is in InProgress status. status: "False" type: ACK.ResourceSynced + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: CPUBottleneck + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml index 29c32faa..5ad5188b 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml @@ -52,3 +52,12 @@ status: message: TrainingJob is in InProgress status. 
status: "False" type: ACK.ResourceSynced + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml index 324c1d05..c15e3cdf 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml @@ -55,3 +55,16 @@ status: message: TrainingJob is in InProgress status. status: "False" type: ACK.ResourceSynced + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: CPUBottleneck + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" + diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml index c3ab737c..4b956906 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml @@ -43,4 +43,11 @@ status: - lastTransitionTime: "0001-01-01T00:00:00Z" message: Warm Pool Infrastructure is in Available status. 
status: "False" - type: ACK.ResourceSynced \ No newline at end of file + type: ACK.ResourceSynced + warmPoolStatus: + status: Available + secondaryStatus: Completed + trainingJobStatus: Completed + profilingStatus: Disabled + creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: "2021-10-12T05:52:46.108Z" \ No newline at end of file From d7f828a94c30f11f43dac1ce3ab472f657c68d6c Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 2 Dec 2022 04:33:35 +0000 Subject: [PATCH 44/71] updated test --- apis/v1alpha1/ack-generate-metadata.yaml | 4 ++-- pkg/resource/training_job/hooks.go | 8 -------- pkg/resource/training_job/sdk.go | 7 +++++++ .../training_job/sdk_update_post_build_request.go.tpl | 7 +++++++ test/e2e/tests/test_trainingjob_debugger.py | 3 +++ 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 2d1d31ae..0913b3e1 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,9 +1,9 @@ ack_generate_info: - build_date: "2022-12-02T00:59:13Z" + build_date: "2022-12-02T03:50:51Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc -api_directory_checksum: 368ed3f24a6e88c5d29d44cc0a7ce60e35152d50 +api_directory_checksum: 3c32189115c09455ec3762ba6d37263b3500e6ea api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index b7bc50aa..c23900bf 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -54,10 +54,6 @@ var ( errors.New("warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state."), ackrequeue.DefaultRequeueAfterDuration, ) - requeueBeforeUpdateStarting = ackrequeue.NeededAfter( - errors.New("training job cannot be updated while secondary status is in Starting state."), - 
ackrequeue.DefaultRequeueAfterDuration, - ) ) // customSetOutput sets the resource ResourceSynced condition to False if @@ -146,11 +142,7 @@ func (rm *resourceManager) isWarmPoolUpdatable(latest *resource) error { // customSetOutputUpdateProfiler decides whether the training job is ready/eligible for update // depending on the status. func (rm *resourceManager) customSetOutputUpdateProfiler(r *resource) error { - trainingSecondaryStatus := r.ko.Status.SecondaryStatus trainingJobStatus := r.ko.Status.TrainingJobStatus - if ackcompare.IsNotNil(trainingSecondaryStatus) && *trainingSecondaryStatus == svcsdk.SecondaryStatusStarting { - return requeueBeforeUpdateStarting - } if ackcompare.IsNotNil(trainingJobStatus) { for _, terminalStatus := range TrainingJobTerminalProfiler { if terminalStatus == *trainingJobStatus { diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 5d978b78..6073a897 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1141,6 +1141,13 @@ func (rm *resourceManager) sdkUpdate( if !warmpool_diff && !profiler_diff { return latest, ackerr.NewTerminalError(errors.New("only Warm Pool or Profiler can be updated")) } + trainingSecondaryStatus := latest.ko.Status.SecondaryStatus + if ackcompare.IsNotNil(trainingSecondaryStatus) && *trainingSecondaryStatus == svcsdk.SecondaryStatusStarting { + return nil, ackrequeue.NeededAfter( + errors.New("training job cannot be updated while secondary status is in Starting state."), + ackrequeue.DefaultRequeueAfterDuration, + ) + } if warmpool_diff { input.SetProfilerConfig(nil) input.SetProfilerRuleConfigurations(nil) diff --git a/templates/training_job/sdk_update_post_build_request.go.tpl b/templates/training_job/sdk_update_post_build_request.go.tpl index 70d3da9a..9bbb574b 100644 --- a/templates/training_job/sdk_update_post_build_request.go.tpl +++ b/templates/training_job/sdk_update_post_build_request.go.tpl @@ -6,6 +6,13 @@ if warmpool_diff && 
profiler_diff { if !warmpool_diff && !profiler_diff { return latest, ackerr.NewTerminalError(errors.New("only Warm Pool or Profiler can be updated")) } +trainingSecondaryStatus := latest.ko.Status.SecondaryStatus +if ackcompare.IsNotNil(trainingSecondaryStatus) && *trainingSecondaryStatus == svcsdk.SecondaryStatusStarting { + return nil, ackrequeue.NeededAfter( + errors.New("training job cannot be updated while secondary status is in Starting state."), + ackrequeue.DefaultRequeueAfterDuration, + ) +} if warmpool_diff { input.SetProfilerConfig(nil) input.SetProfilerRuleConfigurations(nil) diff --git a/test/e2e/tests/test_trainingjob_debugger.py b/test/e2e/tests/test_trainingjob_debugger.py index efd1a3bc..b9257d97 100644 --- a/test/e2e/tests/test_trainingjob_debugger.py +++ b/test/e2e/tests/test_trainingjob_debugger.py @@ -219,6 +219,9 @@ def test_update(self, xgboost_training_job_debugger): == NEW_PROFILER_INTERVAL ) + assert resource["status"]["lastModifiedTime"] != resource["status"]["creationTime"] + assert training_job_desc["LastModifiedTime"] != training_job_desc["CreationTime"] + # Check that you can delete a completed resource from k8s _, deleted = k8s.delete_custom_resource( reference, cfg.JOB_DELETE_WAIT_PERIODS, cfg.JOB_DELETE_WAIT_LENGTH From c75de8c2c868978fc84e41fc53ad45781bc92854 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 2 Dec 2022 17:35:15 +0000 Subject: [PATCH 45/71] changed delta --- apis/v1alpha1/ack-generate-metadata.yaml | 2 +- pkg/resource/training_job/custom_delta.go | 3 ++- templates/training_job/sdk_update_post_build_request.go.tpl | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 0913b3e1..ed1eb5b1 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-12-02T03:50:51Z" + build_date: "2022-12-02T17:34:04Z" build_hash: 
6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc diff --git a/pkg/resource/training_job/custom_delta.go b/pkg/resource/training_job/custom_delta.go index c63ab1df..8923fafa 100644 --- a/pkg/resource/training_job/custom_delta.go +++ b/pkg/resource/training_job/custom_delta.go @@ -27,7 +27,8 @@ func customSetDefaults( // Default value of VolumeSizeInGB is 0 defaultVolumeSizeInGB := aws.Int64(0) - if ackcompare.IsNotNil(a.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(b.ko.Spec.ProfilerRuleConfigurations) { + if ackcompare.IsNotNil(a.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(b.ko.Spec.ProfilerRuleConfigurations) && + len(a.ko.Spec.ProfilerRuleConfigurations) == len(b.ko.Spec.ProfilerRuleConfigurations) { for index := range a.ko.Spec.ProfilerRuleConfigurations { // Prevent out of bounds panics. if index == len(a.ko.Spec.ProfilerRuleConfigurations) || index == len(b.ko.Spec.ProfilerRuleConfigurations) { diff --git a/templates/training_job/sdk_update_post_build_request.go.tpl b/templates/training_job/sdk_update_post_build_request.go.tpl index 9bbb574b..b6097543 100644 --- a/templates/training_job/sdk_update_post_build_request.go.tpl +++ b/templates/training_job/sdk_update_post_build_request.go.tpl @@ -27,7 +27,7 @@ if profiler_diff { input.SetResourceConfig(nil) if rm.isProfilerRemoved(desired, latest) { rm.handleProfilerRemoval(input) - } else{ + } else { inp_err := rm.customSetUpdateInput(desired, latest, delta, input) if inp_err != nil { return nil, err From 872ecd2abeb64a5451c874e9f296f9dcdac0b279 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 2 Dec 2022 17:42:59 +0000 Subject: [PATCH 46/71] removed redundant nil check --- pkg/resource/training_job/hooks.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index c23900bf..f42d62c5 100644 --- a/pkg/resource/training_job/hooks.go +++ 
b/pkg/resource/training_job/hooks.go @@ -95,7 +95,7 @@ func (rm *resourceManager) customSetOutput(r *resource) { // Sometimes DescribeTrainingJob does not contain the warm pool status // In this condition the only possible status is Available or Terminated. - if ackcompare.IsNotNil(trainingJobStatus) && ackcompare.IsNil(r.ko.Status.WarmPoolStatus) { + if ackcompare.IsNil(r.ko.Status.WarmPoolStatus) { svccommon.SetSyncedCondition(r, aws.String("Available"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) } From 9910ffd93ebd5cb904db7b1b1c2366d3589fd0c1 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 2 Dec 2022 21:37:05 +0000 Subject: [PATCH 47/71] refactor: changed variable names --- .../training_job/custom_set_update_input.go | 40 +++++++++---------- pkg/resource/training_job/hooks.go | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pkg/resource/training_job/custom_set_update_input.go b/pkg/resource/training_job/custom_set_update_input.go index 20b9f472..e776bea4 100644 --- a/pkg/resource/training_job/custom_set_update_input.go +++ b/pkg/resource/training_job/custom_set_update_input.go @@ -30,7 +30,7 @@ import ( // Update training job is post operation wrt to the profiler parameters. // Because of this only NEW rules can be specified. // In this function we check to see if any new profiler configurstions have been added. 
-func buildProfilerRuleConfigUpdateInput(desired *resource, latest *resource, input *svcsdk.UpdateTrainingJobInput) error { +func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, latest *resource, input *svcsdk.UpdateTrainingJobInput) error { profilerRuleDesired := desired.ko.Spec.ProfilerRuleConfigurations profilerRuleLatest := latest.ko.Spec.ProfilerRuleConfigurations @@ -52,7 +52,7 @@ func buildProfilerRuleConfigUpdateInput(desired *resource, latest *resource, inp if ackcompare.IsNotNil(rule) && ackcompare.IsNotNil(rule.RuleConfigurationName) { _, present := ruleMap[*rule.RuleConfigurationName] if !present { - profilerRuleInput = append(profilerRuleInput, convertProfileRuleType(rule)) + profilerRuleInput = append(profilerRuleInput, rm.convertProfileRuleType(rule)) } } } @@ -71,34 +71,34 @@ func (rm *resourceManager) handleProfilerRemoval(input *svcsdk.UpdateTrainingJob // convertProfileRuleType converts the kubernetes object ProfilerRuleConfiguration into // a type that is compatible with the AWS API. // Sagemaker and kubernetes types are not the same so the input has to be reconstructed. 
-func convertProfileRuleType(rule *svcapitypes.ProfilerRuleConfiguration) *svcsdk.ProfilerRuleConfiguration { - smRule := &svcsdk.ProfilerRuleConfiguration{} - if rule.InstanceType != nil { - smRule.SetInstanceType(*rule.InstanceType) +func (rm *resourceManager) convertProfileRuleType(kubernetesObjectRule *svcapitypes.ProfilerRuleConfiguration) *svcsdk.ProfilerRuleConfiguration { + sagemakerAPIRule := &svcsdk.ProfilerRuleConfiguration{} + if kubernetesObjectRule.InstanceType != nil { + sagemakerAPIRule.SetInstanceType(*kubernetesObjectRule.InstanceType) } - if rule.LocalPath != nil { - smRule.SetLocalPath(*rule.LocalPath) + if kubernetesObjectRule.LocalPath != nil { + sagemakerAPIRule.SetLocalPath(*kubernetesObjectRule.LocalPath) } - if rule.RuleConfigurationName != nil { - smRule.SetRuleConfigurationName(*rule.RuleConfigurationName) + if kubernetesObjectRule.RuleConfigurationName != nil { + sagemakerAPIRule.SetRuleConfigurationName(*kubernetesObjectRule.RuleConfigurationName) } - if rule.RuleEvaluatorImage != nil { - smRule.SetRuleEvaluatorImage(*rule.RuleEvaluatorImage) + if kubernetesObjectRule.RuleEvaluatorImage != nil { + sagemakerAPIRule.SetRuleEvaluatorImage(*kubernetesObjectRule.RuleEvaluatorImage) } - if rule.RuleParameters != nil { + if kubernetesObjectRule.RuleParameters != nil { f1elemf4 := map[string]*string{} - for f1elemf4key, f1elemf4valiter := range rule.RuleParameters { + for f1elemf4key, f1elemf4valiter := range kubernetesObjectRule.RuleParameters { var f1elemf4val string f1elemf4val = *f1elemf4valiter f1elemf4[f1elemf4key] = &f1elemf4val } - smRule.SetRuleParameters(f1elemf4) + sagemakerAPIRule.SetRuleParameters(f1elemf4) } - if rule.S3OutputPath != nil { - smRule.SetS3OutputPath(*rule.S3OutputPath) + if kubernetesObjectRule.S3OutputPath != nil { + sagemakerAPIRule.SetS3OutputPath(*kubernetesObjectRule.S3OutputPath) } - if rule.VolumeSizeInGB != nil { - smRule.SetVolumeSizeInGB(*rule.VolumeSizeInGB) + if kubernetesObjectRule.VolumeSizeInGB != 
nil { + sagemakerAPIRule.SetVolumeSizeInGB(*kubernetesObjectRule.VolumeSizeInGB) } - return smRule + return sagemakerAPIRule } diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index f42d62c5..ddca85e4 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -177,7 +177,7 @@ func (rm *resourceManager) customSetUpdateInput(desired *resource, latest *resou if !delta.DifferentAt("Spec.ProfilerRuleConfigurations") { input.SetProfilerRuleConfigurations(nil) } else { - err := buildProfilerRuleConfigUpdateInput(desired, latest, input) + err := rm.buildProfilerRuleConfigUpdateInput(desired, latest, input) return err } From 8a6e212737cfdd2bb9de5e33dd7528b0940cd376 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Fri, 2 Dec 2022 23:11:51 +0000 Subject: [PATCH 48/71] updated update input logic --- ...m_set_update_input.go => custom_update.go} | 52 +++++++++++++++---- 1 file changed, 43 insertions(+), 9 deletions(-) rename pkg/resource/training_job/{custom_set_update_input.go => custom_update.go} (72%) diff --git a/pkg/resource/training_job/custom_set_update_input.go b/pkg/resource/training_job/custom_update.go similarity index 72% rename from pkg/resource/training_job/custom_set_update_input.go rename to pkg/resource/training_job/custom_update.go index e776bea4..29d580dc 100644 --- a/pkg/resource/training_job/custom_set_update_input.go +++ b/pkg/resource/training_job/custom_update.go @@ -20,7 +20,6 @@ import ( "errors" ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" - ackerr "github.com/aws-controllers-k8s/runtime/pkg/errors" svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" svcsdk "github.com/aws/aws-sdk-go/service/sagemaker" ) @@ -30,6 +29,11 @@ import ( // Update training job is post operation wrt to the profiler parameters. // Because of this only NEW rules can be specified. 
// In this function we check to see if any new profiler configurstions have been added. +// Four cases: +// 1. Rule gets added (handled normally) +// 2. Rule gets removed (error is returned) +// 3. Rule gets removed but others get added (error is returned) +// 4. Rule gets changed (error gets returned) func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, latest *resource, input *svcsdk.UpdateTrainingJobInput) error { profilerRuleDesired := desired.ko.Spec.ProfilerRuleConfigurations profilerRuleLatest := latest.ko.Spec.ProfilerRuleConfigurations @@ -37,17 +41,16 @@ func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, if ackcompare.IsNil(profilerRuleLatest) { return nil } - if len(profilerRuleDesired) < len(profilerRuleLatest) { - return ackerr.NewTerminalError(errors.New("cannot remove a profiler rule.")) + if len(profilerRuleDesired) <= len(profilerRuleLatest) { + return errors.New("cannot remove/modify a profiler rule.") } - ruleMap := map[string]int{} - profilerRuleInput := []*svcsdk.ProfilerRuleConfiguration{} - for _, rule := range profilerRuleLatest { - if ackcompare.IsNotNil(rule.RuleConfigurationName) { - ruleMap[*rule.RuleConfigurationName] = 1 - } + ruleMap, err := markNonUpdatableRules(profilerRuleDesired, profilerRuleLatest) + if err != nil { + return err } + profilerRuleInput := []*svcsdk.ProfilerRuleConfiguration{} + for _, rule := range profilerRuleDesired { if ackcompare.IsNotNil(rule) && ackcompare.IsNotNil(rule.RuleConfigurationName) { _, present := ruleMap[*rule.RuleConfigurationName] @@ -56,10 +59,41 @@ func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, } } } + // If the length of this slice is zero that only the contents of the profile rule have changed + if len(profilerRuleInput) == 0 { + return errors.New("cannot modify a profiler rule.") + } input.SetProfilerRuleConfigurations(profilerRuleInput) return nil } +// markNonUpdatableRules returns a map containing 
the rules that are not eligible for update. +// In addition it returns an error if a rule gets removed. +func markNonUpdatableRules(profilerRuleDesired []*svcapitypes.ProfilerRuleConfiguration, profilerRuleLatest []*svcapitypes.ProfilerRuleConfiguration) (map[string]int, error) { + commonRulesMap := map[string]int{} + latestRulesMap := map[string]int{} + for _, rule := range profilerRuleLatest { + if ackcompare.IsNotNil(rule.RuleConfigurationName) { + commonRulesMap[*rule.RuleConfigurationName] = 0 + latestRulesMap[*rule.RuleConfigurationName] = 0 + } + } + for _, rule := range profilerRuleDesired { + if ackcompare.IsNotNil(rule.RuleConfigurationName) { + commonRulesMap[*rule.RuleConfigurationName] = 1 + } + } + for _, val := range commonRulesMap { + // This means that there exists a rule in latest that is not present in desired + // which means that the input is invalid. + if val == 0 { + return nil, errors.New("cannot remove a profiler rule.") + } + } + + return latestRulesMap, nil +} + // handleProfilerRemoval sets the input parameters to disable the profiler. 
func (rm *resourceManager) handleProfilerRemoval(input *svcsdk.UpdateTrainingJobInput) { input.SetProfilerRuleConfigurations(nil) From 1c94a650afb07b51ffd6051545b7c29f6a61bf45 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Sat, 3 Dec 2022 00:04:31 +0000 Subject: [PATCH 49/71] minor refactor --- apis/v1alpha1/ack-generate-metadata.yaml | 2 +- pkg/resource/training_job/sdk.go | 5 ++--- templates/training_job/sdk_update_post_build_request.go.tpl | 5 ++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index ed1eb5b1..ae7a5a93 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-12-02T17:34:04Z" + build_date: "2022-12-03T00:03:52Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 6073a897..420da546 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1163,9 +1163,8 @@ func (rm *resourceManager) sdkUpdate( if rm.isProfilerRemoved(desired, latest) { rm.handleProfilerRemoval(input) } else { - inp_err := rm.customSetUpdateInput(desired, latest, delta, input) - if inp_err != nil { - return nil, err + if inp_err := rm.customSetUpdateInput(desired, latest, delta, input); inp_err != nil { + return nil, inp_err } } } diff --git a/templates/training_job/sdk_update_post_build_request.go.tpl b/templates/training_job/sdk_update_post_build_request.go.tpl index b6097543..4ee76d77 100644 --- a/templates/training_job/sdk_update_post_build_request.go.tpl +++ b/templates/training_job/sdk_update_post_build_request.go.tpl @@ -28,9 +28,8 @@ if profiler_diff { if rm.isProfilerRemoved(desired, latest) { rm.handleProfilerRemoval(input) } else { - inp_err := rm.customSetUpdateInput(desired, latest, delta, input) - if 
inp_err != nil { - return nil, err + if inp_err := rm.customSetUpdateInput(desired, latest, delta, input); inp_err != nil { + return nil, inp_err } } } From 67b7714bf2b21249baae5e331c62f4960fe034e5 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Sat, 3 Dec 2022 00:09:49 +0000 Subject: [PATCH 50/71] updated unit test --- .../training_job/testdata/test_suite.yaml | 18 ++++- .../update/desired/removeAddProfilerRule.yaml | 53 ++++++++++++++ .../update/desired/removeOneProfilerRule.yaml | 49 +++++++++++++ .../update/latest/profilerMultipleRules.yaml | 70 +++++++++++++++++++ 4 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/removeAddProfilerRule.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/desired/removeOneProfilerRule.yaml create mode 100644 pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerMultipleRules.yaml diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index 7fa704b3..63deb2b9 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -407,7 +407,7 @@ tests: expect: error: "resource is in terminal condition" - name: "Update=Update Profiler Rule" - description: "Attempt to update something else" + description: "Updating a profiler rule" given: desired_state: "v1alpha1/update/desired/updateProfilerRule.yaml" latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" @@ -420,6 +420,22 @@ tests: expect: latest_state: "v1alpha1/update/observed/updateProfilerRule.yaml" error: nil + - name: "Update=RemoveOneProfilerRule" + description: "Removing one profiler rule" + given: + desired_state: "v1alpha1/update/desired/removeOneProfilerRule.yaml" + latest_state: "v1alpha1/update/latest/profilerMultipleRules.yaml" + invoke: Update + expect: + error: "cannot remove/modify a profiler rule." 
+ - name: "Update=RemoveAddProfilerRule" + description: "Removing one profiler rule" + given: + desired_state: "v1alpha1/update/desired/removeAddProfilerRule.yaml" + latest_state: "v1alpha1/update/latest/profilerMultipleRules.yaml" + invoke: Update + expect: + error: "cannot remove/modify a profiler rule." diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeAddProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeAddProfilerRule.yaml new file mode 100644 index 00000000..e064f39f --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeAddProfilerRule.yaml @@ -0,0 +1,53 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None + profilerConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilingIntervalInMilliseconds: 500 + profilerRuleConfigurations: + - ruleConfigurationName: SuperProfiler + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + 
ruleParameters: + rule_to_invoke: SuperProfiler + - ruleConfigurationName: CPUBottleneck + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: CPUBottleneck +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeOneProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeOneProfilerRule.yaml new file mode 100644 index 00000000..f0fc4385 --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/desired/removeOneProfilerRule.yaml @@ -0,0 +1,49 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None + profilerConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilingIntervalInMilliseconds: 500 + profilerRuleConfigurations: + - ruleConfigurationName: CPUBottleneck + 
ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: CPUBottleneck +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: [] diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerMultipleRules.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerMultipleRules.yaml new file mode 100644 index 00000000..c15e3cdf --- /dev/null +++ b/pkg/resource/training_job/testdata/v1alpha1/update/latest/profilerMultipleRules.yaml @@ -0,0 +1,70 @@ +apiVersion: sagemaker.services.k8s.aws/v1alpha1 +kind: TrainingJob +metadata: + name: training-test +spec: + trainingJobName: training-test + hyperParameters: + max_depth: "5" + gamma: "4" + eta: "0.2" + min_child_weight: "6" + objective: "reg:squarederror" + subsample: "0.7" + num_round: "51" + algorithmSpecification: + trainingImage: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.2-1 + trainingInputMode: File + roleARN: arn:aws:iam::123456789012:role/service-role/AmazonSageMaker + outputDataConfig: + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/output + resourceConfig: + instanceCount: 1 + instanceType: ml.m4.xlarge + volumeSizeInGB: 5 + stoppingCondition: + maxRuntimeInSeconds: 86400 + inputDataConfig: + - channelName: train + dataSource: + s3DataSource: + s3DataType: S3Prefix + s3URI: s3://ack-sagemaker-bucket-592697580195/sagemaker/xgboost/train/ + s3DataDistributionType: FullyReplicated + contentType: text/libsvm + compressionType: None + profilerConfig: + profilingIntervalInMilliseconds: 500 + s3OutputPath: s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/ + profilerRuleConfigurations: + - ruleConfigurationName: ProfilerReport + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + 
ruleParameters: + rule_to_invoke: ProfilerReport + - ruleConfigurationName: CPUBottleneck + ruleEvaluatorImage: 895741380848.dkr.ecr.us-west-2.amazonaws.com/sagemaker-debugger-rules:latest + ruleParameters: + rule_to_invoke: CPUBottleneck +status: + ackResourceMetadata: + arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test + ownerAccountID: "" + region: "" + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. + status: "False" + type: ACK.ResourceSynced + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: CPUBottleneck + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" + From fa963fb48cf8dba029fcbd522ebcd37ebb8f4d09 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 5 Dec 2022 18:08:52 +0000 Subject: [PATCH 51/71] changed function to method --- pkg/resource/training_job/custom_update.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/resource/training_job/custom_update.go b/pkg/resource/training_job/custom_update.go index 29d580dc..06bb6b32 100644 --- a/pkg/resource/training_job/custom_update.go +++ b/pkg/resource/training_job/custom_update.go @@ -45,7 +45,7 @@ func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, return errors.New("cannot remove/modify a profiler rule.") } - ruleMap, err := markNonUpdatableRules(profilerRuleDesired, profilerRuleLatest) + ruleMap, err := rm.markNonUpdatableRules(profilerRuleDesired, profilerRuleLatest) if err != nil { return err } @@ -69,7 +69,7 @@ func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, // markNonUpdatableRules returns a map 
containing the rules that are not eligible for update. // In addition it returns an error if a rule gets removed. -func markNonUpdatableRules(profilerRuleDesired []*svcapitypes.ProfilerRuleConfiguration, profilerRuleLatest []*svcapitypes.ProfilerRuleConfiguration) (map[string]int, error) { +func (rm *resourceManager) markNonUpdatableRules(profilerRuleDesired []*svcapitypes.ProfilerRuleConfiguration, profilerRuleLatest []*svcapitypes.ProfilerRuleConfiguration) (map[string]int, error) { commonRulesMap := map[string]int{} latestRulesMap := map[string]int{} for _, rule := range profilerRuleLatest { From f46ca2abe0bd104cbbdc4aa35cc6018b662b7b28 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 5 Dec 2022 21:22:43 +0000 Subject: [PATCH 52/71] Using requeue on sucess instead of resource synced --- apis/v1alpha1/ack-generate-metadata.yaml | 4 ++-- apis/v1alpha1/generator.yaml | 2 ++ generator.yaml | 2 ++ pkg/resource/training_job/hooks.go | 18 ----------------- pkg/resource/training_job/manager_factory.go | 2 +- pkg/resource/training_job/sdk.go | 20 ++++++++++--------- .../create/observed/success_after_create.yaml | 5 ++++- ...ccess_after_create_debugger_variation.yaml | 5 ++++- .../readone/observed/warmpool_available.yaml | 4 ++-- .../readone/observed/warmpool_nostatus.yaml | 4 ++-- .../sdk_update_post_set_output.go.tpl | 10 ++++------ 11 files changed, 34 insertions(+), 42 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index ae7a5a93..55ea6485 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-12-03T00:03:52Z" + build_date: "2022-12-05T20:17:26Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc @@ -7,7 +7,7 @@ api_directory_checksum: 3c32189115c09455ec3762ba6d37263b3500e6ea api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - 
file_checksum: d56d093045a7f6f17808fa8ef603f2dfb9945519 + file_checksum: 54537eab4b25276641c237dd1de49a6c1bc06fb2 original_file_name: generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index 8318ee1c..99689007 100644 --- a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -112,6 +112,8 @@ resources: compare: is_ignored: true TrainingJob: + reconcile: + requeue_on_success_seconds: 300 exceptions: errors: 404: diff --git a/generator.yaml b/generator.yaml index 8318ee1c..99689007 100644 --- a/generator.yaml +++ b/generator.yaml @@ -112,6 +112,8 @@ resources: compare: is_ignored: true TrainingJob: + reconcile: + requeue_on_success_seconds: 300 exceptions: errors: 404: diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index ddca85e4..1d480ccb 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -86,24 +86,6 @@ func (rm *resourceManager) customSetOutput(r *resource) { svccommon.SetSyncedCondition(r, trainingJobStatus, &resourceName, &trainingJobModifyingStatuses) - warmpoolUsed := ackcompare.IsNotNil(r.ko.Spec.ResourceConfig) && ackcompare.IsNotNil(r.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) - - // Only requeue when warm pool is being used and when training job is in the completed state. - // WP will always have terminated status on error(Training Job or Warmpool). - if ackcompare.IsNotNil(trainingJobStatus) && *trainingJobStatus == svcsdk.TrainingJobStatusCompleted && - warmpoolUsed { - - // Sometimes DescribeTrainingJob does not contain the warm pool status - // In this condition the only possible status is Available or Terminated. 
- if ackcompare.IsNil(r.ko.Status.WarmPoolStatus) { - svccommon.SetSyncedCondition(r, aws.String("Available"), aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) - } - - if ackcompare.IsNotNil(r.ko.Status.WarmPoolStatus) && svccommon.IsModifyingStatus(r.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) { - svccommon.SetSyncedCondition(r, r.ko.Status.WarmPoolStatus.Status, aws.String("Warm Pool Infrastructure"), &WarmPoolModifyingStatuses) - } - } - } // isWarmPoolUpdateable returns a requeue or terminal error depending on the warmpool/training job state diff --git a/pkg/resource/training_job/manager_factory.go b/pkg/resource/training_job/manager_factory.go index effe1d13..ca5ef941 100644 --- a/pkg/resource/training_job/manager_factory.go +++ b/pkg/resource/training_job/manager_factory.go @@ -82,7 +82,7 @@ func (f *resourceManagerFactory) IsAdoptable() bool { // RequeueOnSuccessSeconds returns true if the resource should be requeued after specified seconds // Default is false which means resource will not be requeued after success. 
func (f *resourceManagerFactory) RequeueOnSuccessSeconds() int { - return 0 + return 300 } func newResourceManagerFactory() *resourceManagerFactory { diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 420da546..03130ad2 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1189,13 +1189,10 @@ func (rm *resourceManager) sdkUpdate( } rm.setStatusDefaults(ko) - observed, err := rm.sdkFind(ctx, latest) - if err != nil { - return observed, err - } - tmp_resource := &resource{ko} - tmp_resource.SetStatus(observed) - + return desired, ackrequeue.NeededAfter( + errors.New("training job is updating"), + ackrequeue.DefaultRequeueAfterDuration, + ) return &resource{ko}, nil } @@ -1420,8 +1417,13 @@ func (rm *resourceManager) updateConditions( recoverableCondition.Message = nil } } - // Required to avoid the "declared but not used" error in the default case - _ = syncCondition + if syncCondition == nil && onSuccess { + syncCondition = &ackv1alpha1.Condition{ + Type: ackv1alpha1.ConditionTypeResourceSynced, + Status: corev1.ConditionTrue, + } + ko.Status.Conditions = append(ko.Status.Conditions, syncCondition) + } if terminalCondition != nil || recoverableCondition != nil || syncCondition != nil { return &resource{ko}, true // updated } diff --git a/pkg/resource/training_job/testdata/v1alpha1/create/observed/success_after_create.yaml b/pkg/resource/training_job/testdata/v1alpha1/create/observed/success_after_create.yaml index d337c137..ea507d65 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/create/observed/success_after_create.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/create/observed/success_after_create.yaml @@ -55,4 +55,7 @@ status: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job ownerAccountID: "" region: "" - conditions: [] + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + status: "True" + type: ACK.ResourceSynced diff --git 
a/pkg/resource/training_job/testdata/v1alpha1/create/observed/success_after_create_debugger_variation.yaml b/pkg/resource/training_job/testdata/v1alpha1/create/observed/success_after_create_debugger_variation.yaml index 4961852f..b8673d05 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/create/observed/success_after_create_debugger_variation.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/create/observed/success_after_create_debugger_variation.yaml @@ -89,4 +89,7 @@ status: arn: arn:aws:sagemaker:us-west-2:123456789012:training-job/xgboost-training-job ownerAccountID: "" region: "" - conditions: [] + conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + status: "True" + type: ACK.ResourceSynced diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml index bbe90085..50461efb 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_available.yaml @@ -102,8 +102,8 @@ status: region: "" conditions: - lastTransitionTime: "0001-01-01T00:00:00Z" - message: Warm Pool Infrastructure is in Available status. - status: "False" + message: TrainingJob is in Completed status. + status: "True" type: ACK.ResourceSynced warmPoolStatus: status: Available diff --git a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml index cf2e2583..c9d23eb8 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/readone/observed/warmpool_nostatus.yaml @@ -102,8 +102,8 @@ status: region: "" conditions: - lastTransitionTime: "0001-01-01T00:00:00Z" - message: Warm Pool Infrastructure is in Available status. 
- status: "False" + message: TrainingJob is in Completed status. + status: "True" type: ACK.ResourceSynced secondaryStatus: Completed trainingJobStatus: Completed diff --git a/templates/training_job/sdk_update_post_set_output.go.tpl b/templates/training_job/sdk_update_post_set_output.go.tpl index f54805e3..a4e7d1a5 100644 --- a/templates/training_job/sdk_update_post_set_output.go.tpl +++ b/templates/training_job/sdk_update_post_set_output.go.tpl @@ -1,6 +1,4 @@ -observed, err := rm.sdkFind(ctx, latest) -if err != nil { - return observed, err -} -tmp_resource := &resource{ko} -tmp_resource.SetStatus(observed) +return desired, ackrequeue.NeededAfter( + errors.New("training job is updating"), + ackrequeue.DefaultRequeueAfterDuration, +) \ No newline at end of file From ca85f940f3174990fa74edd21f88f6c0ae82f7f9 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Mon, 5 Dec 2022 21:37:59 +0000 Subject: [PATCH 53/71] modified unit tests --- .../training_job/testdata/test_suite.yaml | 14 +++++++------- .../update/observed/removeProfilerBoth.yaml | 15 +++------------ .../update/observed/removeProfilerConfig.yaml | 15 +++------------ .../update/observed/removeProfilerRule.yaml | 15 +++------------ .../update/observed/updateProfilerBoth.yaml | 18 +++--------------- .../update/observed/updateProfilerConfig.yaml | 15 +++------------ .../update/observed/updateProfilerRule.yaml | 18 +++--------------- .../v1alpha1/update/observed/updateWP.yaml | 14 ++++---------- 8 files changed, 29 insertions(+), 95 deletions(-) diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index 63deb2b9..f0c79c8d 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -266,7 +266,7 @@ tests: invoke: Update expect: latest_state: "v1alpha1/update/observed/removeProfilerBoth.yaml" - error: nil + error: training job is updating - name: "Update=RemoveProfilerRule" desciption: "This 
test checks if the Controller can remove the profiler properly" given: @@ -280,7 +280,7 @@ tests: invoke: Update expect: latest_state: "v1alpha1/update/observed/removeProfilerRule.yaml" - error: nil + error: training job is updating - name: "Update=RemoveProfilerConfig" desciption: "This test checks if the Controller can remove the profiler properly" given: @@ -294,7 +294,7 @@ tests: invoke: Update expect: latest_state: "v1alpha1/update/observed/removeProfilerConfig.yaml" - error: nil + error: training job is updating - name: "Update=BothProfiler" description: "Update Both Profiler parameters" given: @@ -308,7 +308,7 @@ tests: invoke: Update expect: latest_state: "v1alpha1/update/observed/updateProfilerBoth.yaml" - error: nil + error: training job is updating - name: "Update=ProfilerConfig" description: "Update just the profiler config" given: @@ -322,7 +322,7 @@ tests: invoke: Update expect: latest_state: "v1alpha1/update/observed/updateProfilerConfig.yaml" - error: nil + error: training job is updating - name: "Update=ProfilerStarting" description: "Tests if profiler returns a requeue error when TJ is updated in Starting." given: @@ -381,7 +381,7 @@ tests: invoke: Update expect: latest_state: "v1alpha1/update/observed/updateProfilerConfig.yaml" - error: nil + error: training job is updating - name: "Update=AddWarmPool" description: "Attempt to add Warm Pool when a previous one does not exist." 
given: @@ -419,7 +419,7 @@ tests: invoke: Update expect: latest_state: "v1alpha1/update/observed/updateProfilerRule.yaml" - error: nil + error: "training job is updating" - name: "Update=RemoveOneProfilerRule" description: "Removing one profiler rule" given: diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml index 20ba335d..27233996 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml @@ -40,15 +40,6 @@ status: region: "" conditions: - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. - status: "False" - type: ACK.ResourceSynced - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationStatus: InProgress - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Disabled - creationTime: "2021-10-13T03:49:20.337Z" - lastModifiedTime: "2021-10-13T03:49:20.576Z" + message: training job is updating + status: "True" + type: ACK.Recoverable diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml index 8f6ce5e7..956348ad 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml @@ -45,16 +45,7 @@ status: region: "" conditions: - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. 
- status: "False" - type: ACK.ResourceSynced - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationStatus: InProgress - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Disabled - creationTime: "2021-10-13T03:49:20.337Z" - lastModifiedTime: "2021-10-13T03:49:20.576Z" + message: training job is updating + status: "True" + type: ACK.Recoverable diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml index 6006d15c..71893197 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml @@ -43,15 +43,6 @@ status: region: "" conditions: - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. - status: "False" - type: ACK.ResourceSynced - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationStatus: InProgress - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Disabled - creationTime: "2021-10-13T03:49:20.337Z" - lastModifiedTime: "2021-10-13T03:49:20.576Z" + message: training job is updating + status: "True" + type: ACK.Recoverable diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml index fe53e0cc..565a44e0 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml @@ -52,18 +52,6 @@ status: region: "" conditions: - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. 
- status: "False" - type: ACK.ResourceSynced - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationStatus: InProgress - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: CPUBottleneck - ruleEvaluationStatus: InProgress - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - creationTime: "2021-10-13T03:49:20.337Z" - lastModifiedTime: "2021-10-13T03:49:20.576Z" + message: training job is updating + status: "True" + type: ACK.Recoverable diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml index 5ad5188b..fe86cbab 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml @@ -49,15 +49,6 @@ status: region: "" conditions: - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. 
- status: "False" - type: ACK.ResourceSynced - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationStatus: InProgress - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - creationTime: "2021-10-13T03:49:20.337Z" - lastModifiedTime: "2021-10-13T03:49:20.576Z" + message: training job is updating + status: "True" + type: ACK.Recoverable diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml index c15e3cdf..71b75801 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml @@ -52,19 +52,7 @@ status: region: "" conditions: - lastTransitionTime: "0001-01-01T00:00:00Z" - message: TrainingJob is in InProgress status. - status: "False" - type: ACK.ResourceSynced - profilerRuleEvaluationStatuses: - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: ProfilerReport - ruleEvaluationStatus: InProgress - - lastModifiedTime: "0001-01-01T00:00:00Z" - ruleConfigurationName: CPUBottleneck - ruleEvaluationStatus: InProgress - secondaryStatus: Downloading - trainingJobStatus: InProgress - profilingStatus: Enabled - creationTime: "2021-10-13T03:49:20.337Z" - lastModifiedTime: "2021-10-13T03:49:20.576Z" + message: training job is updating + status: "True" + type: ACK.Recoverable diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml index 4b956906..697d4d4c 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml @@ -41,13 +41,7 @@ status: region: "" conditions: - lastTransitionTime: 
"0001-01-01T00:00:00Z" - message: Warm Pool Infrastructure is in Available status. - status: "False" - type: ACK.ResourceSynced - warmPoolStatus: - status: Available - secondaryStatus: Completed - trainingJobStatus: Completed - profilingStatus: Disabled - creationTime: "2021-10-12T05:49:40.493Z" - lastModifiedTime: "2021-10-12T05:52:46.108Z" \ No newline at end of file + message: training job is updating + status: "True" + type: ACK.Recoverable + \ No newline at end of file From 7e7398e1aa558417177bd1b0d54a0c41913cb80e Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 6 Dec 2022 03:08:25 +0000 Subject: [PATCH 54/71] fix: setting the status of desired --- apis/v1alpha1/ack-generate-metadata.yaml | 2 +- pkg/resource/training_job/sdk.go | 5 +++++ templates/training_job/sdk_update_post_set_output.go.tpl | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 55ea6485..452f6947 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-12-05T20:17:26Z" + build_date: "2022-12-06T02:38:08Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 03130ad2..8a1d7e98 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1189,6 +1189,11 @@ func (rm *resourceManager) sdkUpdate( } rm.setStatusDefaults(ko) + observed, err := rm.sdkFind(ctx, latest) + if err != nil { + return observed, err + } + desired.SetStatus(observed) return desired, ackrequeue.NeededAfter( errors.New("training job is updating"), ackrequeue.DefaultRequeueAfterDuration, diff --git a/templates/training_job/sdk_update_post_set_output.go.tpl b/templates/training_job/sdk_update_post_set_output.go.tpl index a4e7d1a5..fe991e04 100644 --- 
a/templates/training_job/sdk_update_post_set_output.go.tpl +++ b/templates/training_job/sdk_update_post_set_output.go.tpl @@ -1,3 +1,8 @@ +observed, err := rm.sdkFind(ctx, latest) +if err != nil { + return observed, err +} +desired.SetStatus(observed) return desired, ackrequeue.NeededAfter( errors.New("training job is updating"), ackrequeue.DefaultRequeueAfterDuration, From a1651c6880a4413fe0edd8eb9d192f27684ccc0a Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 6 Dec 2022 03:18:44 +0000 Subject: [PATCH 55/71] updating unit tests --- .../update/observed/removeProfilerBoth.yaml | 13 +++++++++++++ .../update/observed/removeProfilerConfig.yaml | 13 +++++++++++++ .../update/observed/removeProfilerRule.yaml | 13 +++++++++++++ .../update/observed/updateProfilerBoth.yaml | 16 ++++++++++++++++ .../update/observed/updateProfilerConfig.yaml | 13 +++++++++++++ .../update/observed/updateProfilerRule.yaml | 16 ++++++++++++++++ .../v1alpha1/update/observed/updateWP.yaml | 12 +++++++++++- 7 files changed, 95 insertions(+), 1 deletion(-) diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml index 27233996..9c27a1de 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerBoth.yaml @@ -39,7 +39,20 @@ status: ownerAccountID: "" region: "" conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced - lastTransitionTime: "0001-01-01T00:00:00Z" message: training job is updating status: "True" type: ACK.Recoverable + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Disabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml index 956348ad..bb1c0cf7 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerConfig.yaml @@ -44,8 +44,21 @@ status: ownerAccountID: "" region: "" conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced - lastTransitionTime: "0001-01-01T00:00:00Z" message: training job is updating status: "True" type: ACK.Recoverable + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Disabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml index 71893197..17b4d9f0 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/removeProfilerRule.yaml @@ -42,7 +42,20 @@ status: ownerAccountID: "" region: "" conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced - lastTransitionTime: "0001-01-01T00:00:00Z" message: training job is updating status: "True" type: ACK.Recoverable + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Disabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" \ No newline at end of file diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml index 565a44e0..35d32300 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerBoth.yaml @@ -51,7 +51,23 @@ status: ownerAccountID: "" region: "" conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced - lastTransitionTime: "0001-01-01T00:00:00Z" message: training job is updating status: "True" type: ACK.Recoverable + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: CPUBottleneck + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml index fe86cbab..4c8abe23 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerConfig.yaml @@ -48,7 +48,20 @@ status: ownerAccountID: "" region: "" conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced - lastTransitionTime: "0001-01-01T00:00:00Z" message: training job is updating status: "True" type: ACK.Recoverable + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml index 71b75801..37e8cf94 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateProfilerRule.yaml @@ -51,8 +51,24 @@ status: ownerAccountID: "" region: "" conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in InProgress status. 
+ status: "False" + type: ACK.ResourceSynced - lastTransitionTime: "0001-01-01T00:00:00Z" message: training job is updating status: "True" type: ACK.Recoverable + profilerRuleEvaluationStatuses: + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: ProfilerReport + ruleEvaluationStatus: InProgress + - lastModifiedTime: "0001-01-01T00:00:00Z" + ruleConfigurationName: CPUBottleneck + ruleEvaluationStatus: InProgress + secondaryStatus: Downloading + trainingJobStatus: InProgress + profilingStatus: Enabled + creationTime: "2021-10-13T03:49:20.337Z" + lastModifiedTime: "2021-10-13T03:49:20.576Z" diff --git a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml index 697d4d4c..85be996e 100644 --- a/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml +++ b/pkg/resource/training_job/testdata/v1alpha1/update/observed/updateWP.yaml @@ -40,8 +40,18 @@ status: ownerAccountID: "" region: "" conditions: + - lastTransitionTime: "0001-01-01T00:00:00Z" + message: TrainingJob is in Completed status. 
+ status: "True" + type: ACK.ResourceSynced - lastTransitionTime: "0001-01-01T00:00:00Z" message: training job is updating status: "True" type: ACK.Recoverable - \ No newline at end of file + warmPoolStatus: + status: Available + secondaryStatus: Completed + trainingJobStatus: Completed + profilingStatus: Disabled + creationTime: "2021-10-12T05:49:40.493Z" + lastModifiedTime: "2021-10-12T05:52:46.108Z" From 77438b9d8984702f600d13b2803a438901155a35 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Tue, 6 Dec 2022 18:21:36 +0000 Subject: [PATCH 56/71] fixed merge issues --- apis/v1alpha1/ack-generate-metadata.yaml | 6 +++--- apis/v1alpha1/generator.yaml | 9 +++++---- apis/v1alpha1/types.go | 6 +++--- generator.yaml | 9 +++++---- helm/crds/sagemaker.services.k8s.aws_processingjobs.yaml | 2 +- helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml | 2 +- helm/crds/sagemaker.services.k8s.aws_transformjobs.yaml | 2 +- pkg/resource/notebook_instance_lifecycle_config/sdk.go | 2 +- 8 files changed, 20 insertions(+), 18 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 452f6947..486c1b64 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,13 +1,13 @@ ack_generate_info: - build_date: "2022-12-06T02:38:08Z" + build_date: "2022-12-06T18:18:39Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc -api_directory_checksum: 3c32189115c09455ec3762ba6d37263b3500e6ea +api_directory_checksum: 859aa25cb7eb363de766f3279fd5b12664c31425 api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: - file_checksum: 54537eab4b25276641c237dd1de49a6c1bc06fb2 + file_checksum: 60bfaf57553c2e031a3e78bc8bb9c3535ad8ada8 original_file_name: generator.yaml last_modification: reason: API generation diff --git a/apis/v1alpha1/generator.yaml b/apis/v1alpha1/generator.yaml index 4ae602fa..5d307e1d 100644 --- 
a/apis/v1alpha1/generator.yaml +++ b/apis/v1alpha1/generator.yaml @@ -1009,11 +1009,12 @@ ignore: - ExecutionRoleIdentityConfig - HyperParameterTuningResourceConfig - InstanceMetadataServiceConfiguration + - CanvasAppSettings + - ExplainerConfig + - HyperParameterTuningJobStrategyConfig + - DisableProfiler field_paths: - StartPipelineExecutionInput.ClientRequestToken - CreatePipelineInput.ClientRequestToken - CreatePipelineInput.PipelineDefinitionS3Location - - CanvasAppSettings - - ExplainerConfig - - HyperParameterTuningJobStrategyConfig - - DisableProfiler \ No newline at end of file + \ No newline at end of file diff --git a/apis/v1alpha1/types.go b/apis/v1alpha1/types.go index ec90c093..979298d7 100644 --- a/apis/v1alpha1/types.go +++ b/apis/v1alpha1/types.go @@ -957,11 +957,11 @@ type Experiment struct { // Associates a SageMaker job as a trial component with an experiment and trial. // Specified when you call the following APIs: // -// - CreateProcessingJob +// * CreateProcessingJob // -// - CreateTrainingJob +// * CreateTrainingJob // -// - CreateTransformJob +// * CreateTransformJob type ExperimentConfig struct { ExperimentName *string `json:"experimentName,omitempty"` TrialComponentDisplayName *string `json:"trialComponentDisplayName,omitempty"` diff --git a/generator.yaml b/generator.yaml index 4ae602fa..5d307e1d 100644 --- a/generator.yaml +++ b/generator.yaml @@ -1009,11 +1009,12 @@ ignore: - ExecutionRoleIdentityConfig - HyperParameterTuningResourceConfig - InstanceMetadataServiceConfiguration + - CanvasAppSettings + - ExplainerConfig + - HyperParameterTuningJobStrategyConfig + - DisableProfiler field_paths: - StartPipelineExecutionInput.ClientRequestToken - CreatePipelineInput.ClientRequestToken - CreatePipelineInput.PipelineDefinitionS3Location - - CanvasAppSettings - - ExplainerConfig - - HyperParameterTuningJobStrategyConfig - - DisableProfiler \ No newline at end of file + \ No newline at end of file diff --git 
a/helm/crds/sagemaker.services.k8s.aws_processingjobs.yaml b/helm/crds/sagemaker.services.k8s.aws_processingjobs.yaml index 3b003865..cc089ae7 100644 --- a/helm/crds/sagemaker.services.k8s.aws_processingjobs.yaml +++ b/helm/crds/sagemaker.services.k8s.aws_processingjobs.yaml @@ -70,7 +70,7 @@ spec: experimentConfig: description: "Associates a SageMaker job as a trial component with an experiment and trial. Specified when you call the following APIs: - \n - CreateProcessingJob \n - CreateTrainingJob \n - CreateTransformJob" + \n * CreateProcessingJob \n * CreateTrainingJob \n * CreateTransformJob" properties: experimentName: type: string diff --git a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml index a2eb98a8..48305bcb 100644 --- a/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml +++ b/helm/crds/sagemaker.services.k8s.aws_trainingjobs.yaml @@ -209,7 +209,7 @@ spec: experimentConfig: description: "Associates a SageMaker job as a trial component with an experiment and trial. Specified when you call the following APIs: - \n - CreateProcessingJob \n - CreateTrainingJob \n - CreateTransformJob" + \n * CreateProcessingJob \n * CreateTrainingJob \n * CreateTransformJob" properties: experimentName: type: string diff --git a/helm/crds/sagemaker.services.k8s.aws_transformjobs.yaml b/helm/crds/sagemaker.services.k8s.aws_transformjobs.yaml index ab1923eb..3edd452d 100644 --- a/helm/crds/sagemaker.services.k8s.aws_transformjobs.yaml +++ b/helm/crds/sagemaker.services.k8s.aws_transformjobs.yaml @@ -83,7 +83,7 @@ spec: experimentConfig: description: "Associates a SageMaker job as a trial component with an experiment and trial. 
Specified when you call the following APIs: - \n - CreateProcessingJob \n - CreateTrainingJob \n - CreateTransformJob" + \n * CreateProcessingJob \n * CreateTrainingJob \n * CreateTransformJob" properties: experimentName: type: string diff --git a/pkg/resource/notebook_instance_lifecycle_config/sdk.go b/pkg/resource/notebook_instance_lifecycle_config/sdk.go index 17eece81..dd4def62 100644 --- a/pkg/resource/notebook_instance_lifecycle_config/sdk.go +++ b/pkg/resource/notebook_instance_lifecycle_config/sdk.go @@ -271,7 +271,7 @@ func (rm *resourceManager) sdkUpdate( ko := desired.ko.DeepCopy() rm.setStatusDefaults(ko) - // Done because controller finishes reconciling after update. + //Done because controller finishes reconciling after update. return nil, requeueWaitWhileUpdating return &resource{ko}, nil } From e86f002aa00f558256d3c4caae7860a43728ebc3 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 00:03:13 +0000 Subject: [PATCH 57/71] corrected spelling --- .../{update_sucess.json => update_success.json} | 0 .../training_job/testdata/test_suite.yaml | 16 ++++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) rename pkg/resource/training_job/testdata/sdkapi/update/{update_sucess.json => update_success.json} (100%) diff --git a/pkg/resource/training_job/testdata/sdkapi/update/update_sucess.json b/pkg/resource/training_job/testdata/sdkapi/update/update_success.json similarity index 100% rename from pkg/resource/training_job/testdata/sdkapi/update/update_sucess.json rename to pkg/resource/training_job/testdata/sdkapi/update/update_success.json diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index f0c79c8d..0a8a8ae8 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -260,7 +260,7 @@ tests: latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" svc_api: - operation: UpdateTrainingJobWithContext - 
output_fixture: "sdkapi/update/update_sucess.json" + output_fixture: "sdkapi/update/update_success.json" - operation: DescribeTrainingJobWithContext output_fixture: "sdkapi/update/describe/profiler_removal.json" invoke: Update @@ -274,7 +274,7 @@ tests: latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" svc_api: - operation: UpdateTrainingJobWithContext - output_fixture: "sdkapi/update/update_sucess.json" + output_fixture: "sdkapi/update/update_success.json" - operation: DescribeTrainingJobWithContext output_fixture: "sdkapi/update/describe/profiler_removal.json" invoke: Update @@ -288,7 +288,7 @@ tests: latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" svc_api: - operation: UpdateTrainingJobWithContext - output_fixture: "sdkapi/update/update_sucess.json" + output_fixture: "sdkapi/update/update_success.json" - operation: DescribeTrainingJobWithContext output_fixture: "sdkapi/update/describe/profiler_removal.json" invoke: Update @@ -302,7 +302,7 @@ tests: latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" svc_api: - operation: UpdateTrainingJobWithContext - output_fixture: "sdkapi/update/update_sucess.json" + output_fixture: "sdkapi/update/update_success.json" - operation: DescribeTrainingJobWithContext output_fixture: "sdkapi/update/describe/bothprofiler_update.json" invoke: Update @@ -316,7 +316,7 @@ tests: latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" svc_api: - operation: UpdateTrainingJobWithContext - output_fixture: "sdkapi/update/update_sucess.json" + output_fixture: "sdkapi/update/update_success.json" - operation: DescribeTrainingJobWithContext output_fixture: "sdkapi/update/describe/profiler_config_update.json" invoke: Update @@ -346,7 +346,7 @@ tests: latest_state: "v1alpha1/update/latest/WPHappy.yaml" svc_api: - operation: UpdateTrainingJobWithContext - output_fixture: "sdkapi/update/update_sucess.json" + output_fixture: "sdkapi/update/update_success.json" - operation: DescribeTrainingJobWithContext 
output_fixture: "sdkapi/update/describe/warmpool.json" invoke: Update @@ -375,7 +375,7 @@ tests: latest_state: "v1alpha1/update/latest/vanilla.yaml" svc_api: - operation: UpdateTrainingJobWithContext - output_fixture: "sdkapi/update/update_sucess.json" + output_fixture: "sdkapi/update/update_success.json" - operation: DescribeTrainingJobWithContext output_fixture: "sdkapi/update/describe/profiler_config_update.json" invoke: Update @@ -413,7 +413,7 @@ tests: latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" svc_api: - operation: UpdateTrainingJobWithContext - output_fixture: "sdkapi/update/update_sucess.json" + output_fixture: "sdkapi/update/update_success.json" - operation: DescribeTrainingJobWithContext output_fixture: "sdkapi/update/describe/profilerrule_update.json" invoke: Update From 3663ab0df0c7c6e2c15e9d034ed55907a26d9af1 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 00:13:37 +0000 Subject: [PATCH 58/71] updated descriptions --- .../training_job/testdata/test_suite.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index 0a8a8ae8..777dcadb 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -254,7 +254,7 @@ tests: description: "Tests for update operation" scenarios: - name: "Update=RemoveProfilerBothParams" - desciption: "This test checks if the Controller can remove the profiler properly" + description: "This test checks if the Controller can remove the profiler properly, if both profile parameters are removed in the spec." 
given: desired_state: "v1alpha1/update/desired/removeProfilerboth.yaml" latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" @@ -268,7 +268,7 @@ tests: latest_state: "v1alpha1/update/observed/removeProfilerBoth.yaml" error: training job is updating - name: "Update=RemoveProfilerRule" - desciption: "This test checks if the Controller can remove the profiler properly" + description: "This test checks if the Controller can remove the profiler properly, if the Profiler rule is removed in the spec." given: desired_state: "v1alpha1/update/desired/removeProfilerRule.yaml" latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" @@ -282,7 +282,7 @@ tests: latest_state: "v1alpha1/update/observed/removeProfilerRule.yaml" error: training job is updating - name: "Update=RemoveProfilerConfig" - desciption: "This test checks if the Controller can remove the profiler properly" + description: "This test checks if the Controller can remove the profiler properly, if the Profiler config is removed in the spec." given: desired_state: "v1alpha1/update/desired/removeProfilerConfig.yaml" latest_state: "v1alpha1/update/latest/profilerHappyBase.yaml" @@ -324,7 +324,7 @@ tests: latest_state: "v1alpha1/update/observed/updateProfilerConfig.yaml" error: training job is updating - name: "Update=ProfilerStarting" - description: "Tests if profiler returns a requeue error when TJ is updated in Starting." + description: "Tests if profiler returns a requeue error when Training Job is updated in Starting." given: desired_state: "v1alpha1/update/desired/updateProfileConfig.yaml" latest_state: "v1alpha1/update/latest/profilerStarting.yaml" @@ -332,7 +332,7 @@ tests: expect: error: training job cannot be updated while secondary status is in Starting state. - name: "Update=ProfilerTerminal" - description: "Tests if profiler returns a terminal error when TJ is updated in Completed." + description: "Tests if profiler returns a terminal error when Training Job is updated in Completed." 
given: desired_state: "v1alpha1/update/desired/updateProfileConfig.yaml" latest_state: "v1alpha1/update/latest/profilerTerminal.yaml" @@ -340,7 +340,7 @@ tests: expect: error: "resource is in terminal condition" - name: "Update=WarmPool" - description: "Update a warm pool" + description: "Update a warm pool successfully." given: desired_state: "v1alpha1/update/desired/updateWarmPool.yaml" latest_state: "v1alpha1/update/latest/WPHappy.yaml" @@ -353,7 +353,7 @@ tests: expect: latest_state: "v1alpha1/update/observed/updateWP.yaml" - name: "Update=WarmPoolInProgress" - description: "Return a requeue error if trainingjob is in InProgress state when a warm pool is being updated." + description: "Return a requeue error if trainingjob is in InProgress state." given: desired_state: "v1alpha1/update/desired/updateWarmPool.yaml" latest_state: "v1alpha1/update/latest/WPDownloading.yaml" @@ -399,7 +399,7 @@ tests: expect: error: "resource is in terminal condition" - name: "Update=Invalid update" - description: "Attempt to update something else" + description: "Attempt to update a parameter that is not allowed to be updated." given: desired_state: "v1alpha1/update/desired/invalidupdate.yaml" latest_state: "v1alpha1/update/latest/WPHappy.yaml" @@ -429,7 +429,7 @@ tests: expect: error: "cannot remove/modify a profiler rule." - name: "Update=RemoveAddProfilerRule" - description: "Removing one profiler rule" + description: "Removing a profiler rule but adding another." 
given: desired_state: "v1alpha1/update/desired/removeAddProfilerRule.yaml" latest_state: "v1alpha1/update/latest/profilerMultipleRules.yaml" From 8538820fffbbfcda2490d3b4f174ba089f3aedc6 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 03:38:32 +0000 Subject: [PATCH 59/71] changed error strings --- pkg/resource/training_job/custom_update.go | 2 +- pkg/resource/training_job/hooks.go | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/resource/training_job/custom_update.go b/pkg/resource/training_job/custom_update.go index 06bb6b32..60539203 100644 --- a/pkg/resource/training_job/custom_update.go +++ b/pkg/resource/training_job/custom_update.go @@ -42,7 +42,7 @@ func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, return nil } if len(profilerRuleDesired) <= len(profilerRuleLatest) { - return errors.New("cannot remove/modify a profiler rule.") + return errors.New("cannot remove/modify existing profiler rules.") } ruleMap, err := rm.markNonUpdatableRules(profilerRuleDesired, profilerRuleLatest) diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 1d480ccb..a277e0d8 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -51,7 +51,7 @@ var ( ) requeueBeforeUpdate = ackrequeue.NeededAfter( - errors.New("warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state."), + errors.New("warm pool cannot be updated while TrainingJob status is InProgress, requeuing until TrainingJob completes."), ackrequeue.DefaultRequeueAfterDuration, ) ) @@ -92,7 +92,7 @@ func (rm *resourceManager) customSetOutput(r *resource) { func (rm *resourceManager) isWarmPoolUpdatable(latest *resource) error { trainingJobStatus := latest.ko.Status.TrainingJobStatus if ackcompare.IsNil(latest.ko.Spec.ResourceConfig.KeepAlivePeriodInSeconds) { - return ackerr.NewTerminalError(errors.New("warm pool does not exist")) + return 
ackerr.NewTerminalError(errors.New("warm pool does not exist and can only be configured at creation time")) } if ackcompare.IsNotNil(trainingJobStatus) { if *trainingJobStatus == svcsdk.TrainingJobStatusInProgress { @@ -104,7 +104,7 @@ func (rm *resourceManager) isWarmPoolUpdatable(latest *resource) error { if wp_modifying { return nil } else { - return ackerr.NewTerminalError(errors.New("warm pool is in a non updateable state")) + return ackerr.NewTerminalError(errors.New("warm pool cannot be updated if has been terminated or reused")) } } else { // Sometimes the API (briefly) does not return the WP status even if it completes. @@ -113,7 +113,7 @@ func (rm *resourceManager) isWarmPoolUpdatable(latest *resource) error { } } else { // Training Job is in 'Failed'|'Stopping'|'Stopped' (Terminal) - return ackerr.NewTerminalError(errors.New("warm pool is in a non updateable state")) + return ackerr.NewTerminalError(errors.New("warm pool can only be updated if TrainingJob status is Completed. Warm pool will be terminated automatically if trainingjob has not completed successfully")) } } From 2a923516023c83763f4237a6d17127fb70b63a60 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 03:41:07 +0000 Subject: [PATCH 60/71] changed error strings --- pkg/resource/training_job/testdata/test_suite.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index 777dcadb..a836fdf2 100644 --- a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -359,7 +359,7 @@ tests: latest_state: "v1alpha1/update/latest/WPDownloading.yaml" invoke: Update expect: - error: warm pool cannot be updated in InProgress state, requeuing until TrainingJob reaches completed state. + error: "warm pool cannot be updated while TrainingJob status is InProgress, requeuing until TrainingJob completes." 
- name: "Update=WarmPoolTerminal" description: "Check if controller behaves correctly when WarmPool cannot be updated." given: @@ -427,7 +427,7 @@ tests: latest_state: "v1alpha1/update/latest/profilerMultipleRules.yaml" invoke: Update expect: - error: "cannot remove/modify a profiler rule." + error: "cannot remove/modify existing profiler rules." - name: "Update=RemoveAddProfilerRule" description: "Removing a profiler rule but adding another." given: @@ -435,7 +435,7 @@ tests: latest_state: "v1alpha1/update/latest/profilerMultipleRules.yaml" invoke: Update expect: - error: "cannot remove/modify a profiler rule." + error: "cannot remove/modify existing profiler rules." From 734089187465e2a3d189b6d32819cce838bd5ba8 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 03:57:48 +0000 Subject: [PATCH 61/71] control flow modifications --- apis/v1alpha1/ack-generate-metadata.yaml | 4 +-- pkg/resource/training_job/custom_delta.go | 29 ++++++------------- pkg/resource/training_job/custom_update.go | 11 +++---- pkg/resource/training_job/hooks.go | 22 +++++++------- pkg/resource/training_job/sdk.go | 8 ++--- .../sdk_update_post_build_request.go.tpl | 8 ++--- 6 files changed, 33 insertions(+), 49 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index 486c1b64..cab2e869 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,9 +1,9 @@ ack_generate_info: - build_date: "2022-12-06T18:18:39Z" + build_date: "2022-12-07T03:56:44Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc -api_directory_checksum: 859aa25cb7eb363de766f3279fd5b12664c31425 +api_directory_checksum: 48886052888c7166740636bc128afdb53d05a4cb api_version: v1alpha1 aws_sdk_go_version: v1.44.117 generator_config_info: diff --git a/pkg/resource/training_job/custom_delta.go b/pkg/resource/training_job/custom_delta.go index 8923fafa..0d7919f3 100644 
--- a/pkg/resource/training_job/custom_delta.go +++ b/pkg/resource/training_job/custom_delta.go @@ -30,10 +30,6 @@ func customSetDefaults( if ackcompare.IsNotNil(a.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(b.ko.Spec.ProfilerRuleConfigurations) && len(a.ko.Spec.ProfilerRuleConfigurations) == len(b.ko.Spec.ProfilerRuleConfigurations) { for index := range a.ko.Spec.ProfilerRuleConfigurations { - // Prevent out of bounds panics. - if index == len(a.ko.Spec.ProfilerRuleConfigurations) || index == len(b.ko.Spec.ProfilerRuleConfigurations) { - break - } if ackcompare.IsNil(a.ko.Spec.ProfilerRuleConfigurations[index].VolumeSizeInGB) && ackcompare.IsNotNil(b.ko.Spec.ProfilerRuleConfigurations[index].VolumeSizeInGB) { a.ko.Spec.ProfilerRuleConfigurations[index].VolumeSizeInGB = defaultVolumeSizeInGB } @@ -75,32 +71,25 @@ func customPostCompare(latest *resource, desired *resource, delta *ackcompare.De return } profilerStatus := latest.ko.Status.ProfilingStatus - profilerDisabled := false if ackcompare.IsNotNil(profilerStatus) { //Do not remove profiler if user wants to enable it if *profilerStatus == "Disabled" && !userInitiatesProfilerCheck(desired) { - profilerDisabled = true - } else { - return + // TODO: Replace remove delta with an ack version when its natively supported + if profilerConfigDiff { + removeDelta(delta, "Spec.ProfilerConfig") + } + if profilerRuleDiff { + removeDelta(delta, "Spec.ProfilerRuleConfigurations") + } } - } else { - return - } - // TODO: Replace remove delta with an ack version when its natively supported - if profilerConfigDiff && profilerDisabled { - removeDelta(delta, "Spec.ProfilerConfig") - } - if profilerRuleDiff && profilerDisabled { - removeDelta(delta, "Spec.ProfilerRuleConfigurations") } + } // userInitiatesProfilerCheck checks if the user enabled/re enabled the profiler. 
func userInitiatesProfilerCheck(desired *resource) bool { - profilerConfigPresent := ackcompare.IsNotNil(desired.ko.Spec.ProfilerConfig) - profilerRuleConfigPresent := ackcompare.IsNotNil(desired.ko.Spec.ProfilerRuleConfigurations) - return profilerConfigPresent && profilerRuleConfigPresent + return ackcompare.IsNotNil(desired.ko.Spec.ProfilerConfig) && ackcompare.IsNotNil(desired.ko.Spec.ProfilerRuleConfigurations) } // removeDelta Removes fieldName from the delta slice. diff --git a/pkg/resource/training_job/custom_update.go b/pkg/resource/training_job/custom_update.go index 60539203..60a33828 100644 --- a/pkg/resource/training_job/custom_update.go +++ b/pkg/resource/training_job/custom_update.go @@ -34,6 +34,7 @@ import ( // 2. Rule gets removed (error is returned) // 3. Rule gets removed but others get added (error is returned) // 4. Rule gets changed (error gets returned) +// 5. One/more rule gets changed and one/more rules get added : successful update and error in the next reconcilation loop func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, latest *resource, input *svcsdk.UpdateTrainingJobInput) error { profilerRuleDesired := desired.ko.Spec.ProfilerRuleConfigurations profilerRuleLatest := latest.ko.Spec.ProfilerRuleConfigurations @@ -73,15 +74,11 @@ func (rm *resourceManager) markNonUpdatableRules(profilerRuleDesired []*svcapity commonRulesMap := map[string]int{} latestRulesMap := map[string]int{} for _, rule := range profilerRuleLatest { - if ackcompare.IsNotNil(rule.RuleConfigurationName) { - commonRulesMap[*rule.RuleConfigurationName] = 0 - latestRulesMap[*rule.RuleConfigurationName] = 0 - } + commonRulesMap[*rule.RuleConfigurationName] = 0 + latestRulesMap[*rule.RuleConfigurationName] = 0 } for _, rule := range profilerRuleDesired { - if ackcompare.IsNotNil(rule.RuleConfigurationName) { - commonRulesMap[*rule.RuleConfigurationName] = 1 - } + commonRulesMap[*rule.RuleConfigurationName] = 1 } for _, val := range 
commonRulesMap { // This means that there exists a rule in latest that is not present in desired diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index a277e0d8..0ca53768 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -73,14 +73,13 @@ func (rm *resourceManager) customSetOutput(r *resource) { } } - for _, rule := range r.ko.Status.ProfilerRuleEvaluationStatuses { - if ackcompare.IsNotNil(r.ko.Status.ProfilingStatus) && *r.ko.Status.ProfilingStatus == "Disabled" { - // Sometimes rule evaluation status will stay in InProgress state. - break - } - if rule.RuleEvaluationStatus != nil && svccommon.IsModifyingStatus(rule.RuleEvaluationStatus, &ruleModifyingStatuses) { - svccommon.SetSyncedCondition(r, rule.RuleEvaluationStatus, aws.String("ProfilerRule"), &ruleModifyingStatuses) - return + // Sometimes rule evaluation status will stay in InProgress state. + if ackcompare.IsNotNil(r.ko.Status.ProfilingStatus) && *r.ko.Status.ProfilingStatus != "Disabled" { + for _, rule := range r.ko.Status.ProfilerRuleEvaluationStatuses { + if rule.RuleEvaluationStatus != nil && svccommon.IsModifyingStatus(rule.RuleEvaluationStatus, &ruleModifyingStatuses) { + svccommon.SetSyncedCondition(r, rule.RuleEvaluationStatus, aws.String("ProfilerRule"), &ruleModifyingStatuses) + return + } } } @@ -100,8 +99,7 @@ func (rm *resourceManager) isWarmPoolUpdatable(latest *resource) error { } if *trainingJobStatus == svcsdk.TrainingJobStatusCompleted { if ackcompare.IsNotNil(latest.ko.Status.WarmPoolStatus) { - wp_modifying := svccommon.IsModifyingStatus(latest.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) - if wp_modifying { + if svccommon.IsModifyingStatus(latest.ko.Status.WarmPoolStatus.Status, &WarmPoolModifyingStatuses) { return nil } else { return ackerr.NewTerminalError(errors.New("warm pool cannot be updated if has been terminated or reused")) @@ -121,9 +119,9 @@ func (rm *resourceManager) 
isWarmPoolUpdatable(latest *resource) error { } -// customSetOutputUpdateProfiler decides whether the training job is ready/eligible for update +// isProfilerUpdatable decides whether the training job is ready/eligible for update // depending on the status. -func (rm *resourceManager) customSetOutputUpdateProfiler(r *resource) error { +func (rm *resourceManager) isProfilerUpdatable(r *resource) error { trainingJobStatus := r.ko.Status.TrainingJobStatus if ackcompare.IsNotNil(trainingJobStatus) { for _, terminalStatus := range TrainingJobTerminalProfiler { diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 8a1d7e98..805667d7 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1156,15 +1156,15 @@ func (rm *resourceManager) sdkUpdate( } } if profiler_diff { - if up_err := rm.customSetOutputUpdateProfiler(latest); up_err != nil { - return nil, up_err + if err := rm.isProfilerUpdatable(latest); err != nil { + return nil, err } input.SetResourceConfig(nil) if rm.isProfilerRemoved(desired, latest) { rm.handleProfilerRemoval(input) } else { - if inp_err := rm.customSetUpdateInput(desired, latest, delta, input); inp_err != nil { - return nil, inp_err + if err := rm.customSetUpdateInput(desired, latest, delta, input); err != nil { + return nil, err } } } diff --git a/templates/training_job/sdk_update_post_build_request.go.tpl b/templates/training_job/sdk_update_post_build_request.go.tpl index 4ee76d77..2e69ecfa 100644 --- a/templates/training_job/sdk_update_post_build_request.go.tpl +++ b/templates/training_job/sdk_update_post_build_request.go.tpl @@ -21,15 +21,15 @@ if warmpool_diff { } } if profiler_diff { - if up_err := rm.customSetOutputUpdateProfiler(latest); up_err != nil { - return nil, up_err + if err := rm.isProfilerUpdatable(latest); err != nil { + return nil, err } input.SetResourceConfig(nil) if rm.isProfilerRemoved(desired, latest) { rm.handleProfilerRemoval(input) } else { - if inp_err := 
rm.customSetUpdateInput(desired, latest, delta, input); inp_err != nil { - return nil, inp_err + if err := rm.customSetUpdateInput(desired, latest, delta, input); err != nil { + return nil, err } } } From 819e9c021a13df3d6bf08085586a67cc69ddfcfb Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 04:14:17 +0000 Subject: [PATCH 62/71] logic improvements and clarifying comments --- pkg/resource/training_job/custom_delta.go | 3 +++ pkg/resource/training_job/custom_update.go | 22 +++++++++++++--------- pkg/resource/training_job/hooks.go | 1 + 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/pkg/resource/training_job/custom_delta.go b/pkg/resource/training_job/custom_delta.go index 0d7919f3..4f18b7c9 100644 --- a/pkg/resource/training_job/custom_delta.go +++ b/pkg/resource/training_job/custom_delta.go @@ -98,6 +98,9 @@ func removeDelta(delta *ackcompare.Delta, fieldName string) { differences := delta.Differences for index, diff := range differences { if diff.Path.Contains(fieldName) { + // differences[index+1:]... is a legal statement in go even if the slice is of length 1. + // The minimum index in the left part of [index+1:] is allowed to be equal to capacity. + // The left index will never be greater than the capacity. differences = append(differences[:index], differences[index+1:]...) 
delta.Differences = differences return diff --git a/pkg/resource/training_job/custom_update.go b/pkg/resource/training_job/custom_update.go index 60a33828..f177d048 100644 --- a/pkg/resource/training_job/custom_update.go +++ b/pkg/resource/training_job/custom_update.go @@ -46,7 +46,7 @@ func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, return errors.New("cannot remove/modify existing profiler rules.") } - ruleMap, err := rm.markNonUpdatableRules(profilerRuleDesired, profilerRuleLatest) + latestRules, err := rm.markNonUpdatableRules(profilerRuleDesired, profilerRuleLatest) if err != nil { return err } @@ -54,7 +54,7 @@ func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, for _, rule := range profilerRuleDesired { if ackcompare.IsNotNil(rule) && ackcompare.IsNotNil(rule.RuleConfigurationName) { - _, present := ruleMap[*rule.RuleConfigurationName] + _, present := latestRules[*rule.RuleConfigurationName] if !present { profilerRuleInput = append(profilerRuleInput, rm.convertProfileRuleType(rule)) } @@ -71,16 +71,20 @@ func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, // markNonUpdatableRules returns a map containing the rules that are not eligible for update. // In addition it returns an error if a rule gets removed. func (rm *resourceManager) markNonUpdatableRules(profilerRuleDesired []*svcapitypes.ProfilerRuleConfiguration, profilerRuleLatest []*svcapitypes.ProfilerRuleConfiguration) (map[string]int, error) { - commonRulesMap := map[string]int{} - latestRulesMap := map[string]int{} + latestRules := map[string]int{} for _, rule := range profilerRuleLatest { - commonRulesMap[*rule.RuleConfigurationName] = 0 - latestRulesMap[*rule.RuleConfigurationName] = 0 + latestRules[*rule.RuleConfigurationName] = 0 } + // If a Rule Configuration is present in both latest and desired, set it to one. 
for _, rule := range profilerRuleDesired { - commonRulesMap[*rule.RuleConfigurationName] = 1 + _, present := latestRules[*rule.RuleConfigurationName] + if present { + latestRules[*rule.RuleConfigurationName] = 1 + } } - for _, val := range commonRulesMap { + // If a value in the map is equal to 0, the user must have removed the rule because + // added rules would not be present in the map. + for _, val := range latestRules { // This means that there exists a rule in latest that is not present in desired // which means that the input is invalid. if val == 0 { @@ -88,7 +92,7 @@ func (rm *resourceManager) markNonUpdatableRules(profilerRuleDesired []*svcapity } } - return latestRulesMap, nil + return latestRules, nil } // handleProfilerRemoval sets the input parameters to disable the profiler. diff --git a/pkg/resource/training_job/hooks.go b/pkg/resource/training_job/hooks.go index 0ca53768..40a8a26e 100644 --- a/pkg/resource/training_job/hooks.go +++ b/pkg/resource/training_job/hooks.go @@ -134,6 +134,7 @@ func (rm *resourceManager) isProfilerUpdatable(r *resource) error { } // isProfilerRemoved checks if the profiler was removed. +// The profiler gets removed when ProfilerConfig or ProfilerRuleConfig (or both) are not present in the spec but were present before. 
func (rm *resourceManager) isProfilerRemoved(desired *resource, latest *resource) bool { if ackcompare.IsNil(desired.ko.Spec.ProfilerRuleConfigurations) && ackcompare.IsNotNil(latest.ko.Spec.ProfilerRuleConfigurations) { return true From 05a331fe4b306572f8e806946e607bbb209b5a05 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 16:58:06 +0000 Subject: [PATCH 63/71] new test structure --- test/e2e/tests/test_trainingjob_debugger.py | 70 ++++++++------------- 1 file changed, 26 insertions(+), 44 deletions(-) diff --git a/test/e2e/tests/test_trainingjob_debugger.py b/test/e2e/tests/test_trainingjob_debugger.py index b9257d97..0be5c55f 100644 --- a/test/e2e/tests/test_trainingjob_debugger.py +++ b/test/e2e/tests/test_trainingjob_debugger.py @@ -33,7 +33,7 @@ NEW_PROFILER_INTERVAL = 200 -@pytest.fixture(scope="function") +@pytest.fixture(scope="class") def xgboost_training_job_debugger(): resource_name = random_suffix_name("xgboost-trainingjob-debugger", 50) replacements = REPLACEMENT_VALUES.copy() @@ -111,9 +111,7 @@ def _assert_training_rule_eval_status_in_sync( resource_rule_type = sagemaker_rule_type[0].lower() + sagemaker_rule_type[1:] assert ( self._wait_sagemaker_training_rule_eval_status( - training_job_name, - sagemaker_rule_type, - expected_status, + training_job_name, sagemaker_rule_type, expected_status, ) == self._wait_resource_training_rule_eval_status( reference, resource_rule_type, expected_status @@ -121,7 +119,7 @@ def _assert_training_rule_eval_status_in_sync( == expected_status ) - def test_completed(self, xgboost_training_job_debugger): + def create_debugger_training(self, xgboost_training_job_debugger): (reference, resource, _) = xgboost_training_job_debugger assert k8s.get_resource_exists(reference) @@ -138,35 +136,7 @@ def test_completed(self, xgboost_training_job_debugger): ) assert resource_arn == training_job_arn - assert training_job_desc["TrainingJobStatus"] == cfg.JOB_STATUS_INPROGRESS - assert k8s.wait_on_condition(reference, 
"ACK.ResourceSynced", "False") - - assert_training_status_in_sync( - training_job_name, reference, cfg.JOB_STATUS_COMPLETED - ) - assert k8s.wait_on_condition(reference, "ACK.ResourceSynced", "False") - - # Assert debugger rule evaluation completed - self._assert_training_rule_eval_status_in_sync( - training_job_name, "DebugRule", reference, cfg.RULE_STATUS_COMPLETED - ) - - # Assert profiler rule evaluation completed - self._assert_training_rule_eval_status_in_sync( - training_job_name, "ProfilerRule", reference, cfg.RULE_STATUS_COMPLETED - ) - assert k8s.wait_on_condition(reference, "ACK.ResourceSynced", "True") - - resource_tags = resource["spec"].get("tags", None) - assert_tags_in_sync(training_job_arn, resource_tags) - - # Check that you can delete a completed resource from k8s - _, deleted = k8s.delete_custom_resource( - reference, cfg.JOB_DELETE_WAIT_PERIODS, cfg.JOB_DELETE_WAIT_LENGTH - ) - assert deleted is True - - def test_update(self, xgboost_training_job_debugger): + def update_debugger_trainingjob(self, xgboost_training_job_debugger): (reference, resource, spec) = xgboost_training_job_debugger assert k8s.get_resource_exists(reference) @@ -174,14 +144,6 @@ def test_update(self, xgboost_training_job_debugger): assert training_job_name is not None training_job_desc = get_sagemaker_training_job(training_job_name) - training_job_arn = training_job_desc["TrainingJobArn"] - - resource_arn = k8s.get_resource_arn(resource) - if resource_arn is None: - logging.error( - f"ARN for this resource is None, resource status is: {resource['status']}" - ) - assert resource_arn == training_job_arn assert training_job_desc["TrainingJobStatus"] == cfg.JOB_STATUS_INPROGRESS assert k8s.wait_on_condition(reference, "ACK.ResourceSynced", "False") @@ -219,11 +181,31 @@ def test_update(self, xgboost_training_job_debugger): == NEW_PROFILER_INTERVAL ) - assert resource["status"]["lastModifiedTime"] != resource["status"]["creationTime"] - assert 
training_job_desc["LastModifiedTime"] != training_job_desc["CreationTime"] + assert ( + resource["status"]["lastModifiedTime"] != resource["status"]["creationTime"] + ) + assert ( + training_job_desc["LastModifiedTime"] != training_job_desc["CreationTime"] + ) + def delete_debugger_trainingjob(self, xgboost_training_job_debugger): # Check that you can delete a completed resource from k8s + (reference, resource, _) = xgboost_training_job_debugger + + training_job_name = resource["spec"].get("trainingJobName", None) + + training_job_desc = get_sagemaker_training_job(training_job_name) + training_job_arn = training_job_desc["TrainingJobArn"] + + resource_tags = resource["spec"].get("tags", None) + assert_tags_in_sync(training_job_arn, resource_tags) + _, deleted = k8s.delete_custom_resource( reference, cfg.JOB_DELETE_WAIT_PERIODS, cfg.JOB_DELETE_WAIT_LENGTH ) assert deleted is True + + def test_driver(self, xgboost_training_job_debugger): + self.create_debugger_training(xgboost_training_job_debugger) + self.update_debugger_trainingjob(xgboost_training_job_debugger) + self.delete_debugger_trainingjob(xgboost_training_job_debugger) From 23852b56bc26e821b30793fb44e7c8d810520181 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 17:27:21 +0000 Subject: [PATCH 64/71] testfix --- test/e2e/tests/test_trainingjob_debugger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/tests/test_trainingjob_debugger.py b/test/e2e/tests/test_trainingjob_debugger.py index 0be5c55f..1c7d5203 100644 --- a/test/e2e/tests/test_trainingjob_debugger.py +++ b/test/e2e/tests/test_trainingjob_debugger.py @@ -185,7 +185,7 @@ def update_debugger_trainingjob(self, xgboost_training_job_debugger): resource["status"]["lastModifiedTime"] != resource["status"]["creationTime"] ) assert ( - training_job_desc["LastModifiedTime"] != training_job_desc["CreationTime"] + training_sm_desc["LastModifiedTime"] != training_sm_desc["CreationTime"] ) def 
delete_debugger_trainingjob(self, xgboost_training_job_debugger): From ed6a315c8133ce8b08bf4171558654a0ff11eaf6 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 18:49:16 +0000 Subject: [PATCH 65/71] another fix --- test/e2e/tests/test_trainingjob_debugger.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/e2e/tests/test_trainingjob_debugger.py b/test/e2e/tests/test_trainingjob_debugger.py index 1c7d5203..e19b9b47 100644 --- a/test/e2e/tests/test_trainingjob_debugger.py +++ b/test/e2e/tests/test_trainingjob_debugger.py @@ -156,7 +156,6 @@ def update_debugger_trainingjob(self, xgboost_training_job_debugger): assert_training_status_in_sync( training_job_name, reference, cfg.JOB_STATUS_COMPLETED ) - assert k8s.wait_on_condition(reference, "ACK.ResourceSynced", "False") # Assert debugger rule evaluation completed self._assert_training_rule_eval_status_in_sync( From dc04ed05641c0f661e6384eb452fa1167566dfaf Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 19:43:32 +0000 Subject: [PATCH 66/71] revert test modification --- test/e2e/tests/test_trainingjob_debugger.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/e2e/tests/test_trainingjob_debugger.py b/test/e2e/tests/test_trainingjob_debugger.py index e19b9b47..1c7d5203 100644 --- a/test/e2e/tests/test_trainingjob_debugger.py +++ b/test/e2e/tests/test_trainingjob_debugger.py @@ -156,6 +156,7 @@ def update_debugger_trainingjob(self, xgboost_training_job_debugger): assert_training_status_in_sync( training_job_name, reference, cfg.JOB_STATUS_COMPLETED ) + assert k8s.wait_on_condition(reference, "ACK.ResourceSynced", "False") # Assert debugger rule evaluation completed self._assert_training_rule_eval_status_in_sync( From d3454ce38dd492fd2c011868d4884f05418620f2 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 21:17:34 +0000 Subject: [PATCH 67/71] modified terminal conditions --- apis/v1alpha1/ack-generate-metadata.yaml | 2 +- pkg/resource/training_job/custom_update.go | 7 
++++--- pkg/resource/training_job/sdk.go | 4 ++-- templates/training_job/sdk_update_post_set_output.go.tpl | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/apis/v1alpha1/ack-generate-metadata.yaml b/apis/v1alpha1/ack-generate-metadata.yaml index cab2e869..a6298340 100755 --- a/apis/v1alpha1/ack-generate-metadata.yaml +++ b/apis/v1alpha1/ack-generate-metadata.yaml @@ -1,5 +1,5 @@ ack_generate_info: - build_date: "2022-12-07T03:56:44Z" + build_date: "2022-12-07T21:20:55Z" build_hash: 6e2ffbc3b16a30ac59be6719918c601c2c864064 go_version: go1.17.13 version: v0.20.1-3-g6e2ffbc diff --git a/pkg/resource/training_job/custom_update.go b/pkg/resource/training_job/custom_update.go index f177d048..3c9d2a5e 100644 --- a/pkg/resource/training_job/custom_update.go +++ b/pkg/resource/training_job/custom_update.go @@ -20,6 +20,7 @@ import ( "errors" ackcompare "github.com/aws-controllers-k8s/runtime/pkg/compare" + ackerr "github.com/aws-controllers-k8s/runtime/pkg/errors" svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1" svcsdk "github.com/aws/aws-sdk-go/service/sagemaker" ) @@ -43,7 +44,7 @@ func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, return nil } if len(profilerRuleDesired) <= len(profilerRuleLatest) { - return errors.New("cannot remove/modify existing profiler rules.") + return ackerr.NewTerminalError(errors.New("cannot remove/modify existing profiler rules.")) } latestRules, err := rm.markNonUpdatableRules(profilerRuleDesired, profilerRuleLatest) @@ -62,7 +63,7 @@ func (rm *resourceManager) buildProfilerRuleConfigUpdateInput(desired *resource, } // If the length of this slice is zero that only the contents of the profile rule have changed if len(profilerRuleInput) == 0 { - return errors.New("cannot modify a profiler rule.") + return ackerr.NewTerminalError(errors.New("cannot modify an existing profiler rule.")) } input.SetProfilerRuleConfigurations(profilerRuleInput) return nil @@ -88,7 
+89,7 @@ func (rm *resourceManager) markNonUpdatableRules(profilerRuleDesired []*svcapity // This means that there exists a rule in latest that is not present in desired // which means that the input is invalid. if val == 0 { - return nil, errors.New("cannot remove a profiler rule.") + return nil, ackerr.NewTerminalError(errors.New("cannot remove a profiler rule.")) } } diff --git a/pkg/resource/training_job/sdk.go b/pkg/resource/training_job/sdk.go index 805667d7..df775223 100644 --- a/pkg/resource/training_job/sdk.go +++ b/pkg/resource/training_job/sdk.go @@ -1193,8 +1193,8 @@ func (rm *resourceManager) sdkUpdate( if err != nil { return observed, err } - desired.SetStatus(observed) - return desired, ackrequeue.NeededAfter( + ko.Status = observed.ko.Status + return &resource{ko}, ackrequeue.NeededAfter( errors.New("training job is updating"), ackrequeue.DefaultRequeueAfterDuration, ) diff --git a/templates/training_job/sdk_update_post_set_output.go.tpl b/templates/training_job/sdk_update_post_set_output.go.tpl index fe991e04..b881c9fe 100644 --- a/templates/training_job/sdk_update_post_set_output.go.tpl +++ b/templates/training_job/sdk_update_post_set_output.go.tpl @@ -2,8 +2,8 @@ observed, err := rm.sdkFind(ctx, latest) if err != nil { return observed, err } -desired.SetStatus(observed) -return desired, ackrequeue.NeededAfter( +ko.Status = observed.ko.Status +return &resource{ko}, ackrequeue.NeededAfter( errors.New("training job is updating"), ackrequeue.DefaultRequeueAfterDuration, ) \ No newline at end of file From 68a1ffa0f182cf5a239f616a090a59e3a0d87bab Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 21:24:03 +0000 Subject: [PATCH 68/71] updated unit test --- pkg/resource/training_job/testdata/test_suite.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/resource/training_job/testdata/test_suite.yaml b/pkg/resource/training_job/testdata/test_suite.yaml index a836fdf2..ff2ba651 100644 --- 
a/pkg/resource/training_job/testdata/test_suite.yaml +++ b/pkg/resource/training_job/testdata/test_suite.yaml @@ -427,7 +427,7 @@ tests: latest_state: "v1alpha1/update/latest/profilerMultipleRules.yaml" invoke: Update expect: - error: "cannot remove/modify existing profiler rules." + error: "resource is in terminal condition" - name: "Update=RemoveAddProfilerRule" description: "Removing a profiler rule but adding another." given: @@ -435,7 +435,7 @@ tests: latest_state: "v1alpha1/update/latest/profilerMultipleRules.yaml" invoke: Update expect: - error: "cannot remove/modify existing profiler rules." + error: "resource is in terminal condition" From d35727fdf2a9882af1ae14dbaa865c5edc627028 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 21:46:57 +0000 Subject: [PATCH 69/71] changed error message and removed redundant check --- pkg/resource/hyper_parameter_tuning_job/custom_delta.go | 3 ++- pkg/resource/training_job/custom_update.go | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/resource/hyper_parameter_tuning_job/custom_delta.go b/pkg/resource/hyper_parameter_tuning_job/custom_delta.go index b5fbce5d..35dd710a 100644 --- a/pkg/resource/hyper_parameter_tuning_job/custom_delta.go +++ b/pkg/resource/hyper_parameter_tuning_job/custom_delta.go @@ -36,9 +36,10 @@ func customSetDefaults( // The code generator currently cannot ignore the field path for resourceConfig.KeepAlivePeriodInSeconds // without also ignoring Trainingjob. This block below should be removed once the code generator supports // removing fields like resourceConfig.KeepAlivePeriodInSeconds + // HPO will always return nil on the server side.
if ackcompare.IsNotNil(a.ko.Spec.TrainingJobDefinition) && ackcompare.IsNotNil(b.ko.Spec.TrainingJobDefinition) { if ackcompare.IsNotNil(a.ko.Spec.TrainingJobDefinition.ResourceConfig) && ackcompare.IsNotNil(b.ko.Spec.TrainingJobDefinition.ResourceConfig) { - if ackcompare.IsNotNil(a.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds) && ackcompare.IsNil(b.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds) { + if ackcompare.IsNotNil(a.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds) { a.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds = nil } } diff --git a/pkg/resource/training_job/custom_update.go b/pkg/resource/training_job/custom_update.go index 3c9d2a5e..068a8d04 100644 --- a/pkg/resource/training_job/custom_update.go +++ b/pkg/resource/training_job/custom_update.go @@ -89,7 +89,7 @@ func (rm *resourceManager) markNonUpdatableRules(profilerRuleDesired []*svcapity // This means that there exists a rule in latest that is not present in desired // which means that the input is invalid. if val == 0 { - return nil, ackerr.NewTerminalError(errors.New("cannot remove a profiler rule.")) + return nil, ackerr.NewTerminalError(errors.New("cannot remove an existing profiler rule.")) } } From 72bc374a4e84fe600535fb8dc64c125b8b533830 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 22:02:00 +0000 Subject: [PATCH 70/71] taking out hpo custom code --- .../custom_delta.go | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/pkg/resource/hyper_parameter_tuning_job/custom_delta.go b/pkg/resource/hyper_parameter_tuning_job/custom_delta.go index 35dd710a..79405c25 100644 --- a/pkg/resource/hyper_parameter_tuning_job/custom_delta.go +++ b/pkg/resource/hyper_parameter_tuning_job/custom_delta.go @@ -31,30 +31,6 @@ func customSetDefaults( } } } - // TODO: Remove the block below. - // The server side default of KeepAlivePeriodInSeconds is nil, when launching a HPO job. 
- // The code generator currently cannot ignore the field path for resourceConfig.KeepAlivePeriodInSeconds - // without also ignoring Trainingjob. This block below should be removed once the code generator supports - // removing fields like resourceConfig.KeepAlivePeriodInSeconds - // HPO will always return nil on the server side. - if ackcompare.IsNotNil(a.ko.Spec.TrainingJobDefinition) && ackcompare.IsNotNil(b.ko.Spec.TrainingJobDefinition) { - if ackcompare.IsNotNil(a.ko.Spec.TrainingJobDefinition.ResourceConfig) && ackcompare.IsNotNil(b.ko.Spec.TrainingJobDefinition.ResourceConfig) { - if ackcompare.IsNotNil(a.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds) { - a.ko.Spec.TrainingJobDefinition.ResourceConfig.KeepAlivePeriodInSeconds = nil - } - } - } - // HPO does not support custom warm pool durations and the Server side default will - // always be nil. - if ackcompare.IsNotNil(a.ko.Spec.TrainingJobDefinitions) { - for i, trainDefinition := range a.ko.Spec.TrainingJobDefinitions { - if ackcompare.IsNotNil(trainDefinition) { - if ackcompare.IsNotNil(trainDefinition.ResourceConfig) && ackcompare.IsNotNil(trainDefinition.ResourceConfig.KeepAlivePeriodInSeconds) { - a.ko.Spec.TrainingJobDefinitions[i].ResourceConfig.KeepAlivePeriodInSeconds = nil - } - } - } - } } From 4a12b3dd1904661b961c1269552a7eba5db23a49 Mon Sep 17 00:00:00 2001 From: ananth102 Date: Wed, 7 Dec 2022 22:18:22 +0000 Subject: [PATCH 71/71] eliminate api call --- test/e2e/tests/test_trainingjob_debugger.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/test/e2e/tests/test_trainingjob_debugger.py b/test/e2e/tests/test_trainingjob_debugger.py index 1c7d5203..60eccb80 100644 --- a/test/e2e/tests/test_trainingjob_debugger.py +++ b/test/e2e/tests/test_trainingjob_debugger.py @@ -192,13 +192,10 @@ def delete_debugger_trainingjob(self, xgboost_training_job_debugger): # Check that you can delete a completed resource from k8s (reference, resource, _) = 
xgboost_training_job_debugger - training_job_name = resource["spec"].get("trainingJobName", None) - - training_job_desc = get_sagemaker_training_job(training_job_name) - training_job_arn = training_job_desc["TrainingJobArn"] + resource_arn = k8s.get_resource_arn(resource) resource_tags = resource["spec"].get("tags", None) - assert_tags_in_sync(training_job_arn, resource_tags) + assert_tags_in_sync(resource_arn, resource_tags) _, deleted = k8s.delete_custom_resource( reference, cfg.JOB_DELETE_WAIT_PERIODS, cfg.JOB_DELETE_WAIT_LENGTH