Commit 050a186
MGMT-4475: validations to consider effective role (#2587)
- Role-based validations now take the suggested role into account when a host is set to auto-assign. This produces timely failures when the role about to be assigned is incompatible with a validation's logic.
- Role evaluation is now done in its own function within the monitor, instead of inside RefreshStatus.
- A feature flag has been added to allow shutting down the calculation.
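At the core of the change is the small `GetEffectiveRole` helper added in `internal/common/common.go` (see that file's diff below); call sites that previously read `host.Role` directly now go through it. A self-contained sketch of the rule, with the repo's types stubbed out for illustration:

```go
package main

import "fmt"

// Stand-ins for the repo's models.HostRole values (illustrative only).
type HostRole string

const (
	HostRoleAutoAssign HostRole = "auto-assign"
	HostRoleMaster     HostRole = "master"
)

type Host struct {
	Role          HostRole // role explicitly assigned (or "auto-assign")
	SuggestedRole HostRole // role the monitor suggests for auto-assign hosts
}

// getEffectiveRole mirrors common.GetEffectiveRole from the diff:
// an auto-assign host with a non-empty suggestion is treated as having
// the suggested role; everything else keeps its assigned role.
func getEffectiveRole(h *Host) HostRole {
	if h.Role == HostRoleAutoAssign && h.SuggestedRole != "" {
		return h.SuggestedRole
	}
	return h.Role
}

func main() {
	h := &Host{Role: HostRoleAutoAssign, SuggestedRole: HostRoleMaster}
	// Prints "master": validations can now fail early when the role the
	// host is about to receive is incompatible with their logic.
	fmt.Println(getEffectiveRole(h))
}
```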
1 parent 38b6857 commit 050a186

24 files changed: +168 -132 lines

internal/cluster/common.go

+3 -3

@@ -144,7 +144,7 @@ func getKnownMastersNodesIds(c *common.Cluster, db *gorm.DB) ([]*strfmt.UUID, er
 
 	allowedStatuses := []string{models.HostStatusKnown, models.HostStatusPreparingForInstallation}
 	for _, host := range cluster.Hosts {
-		if host.Role == models.HostRoleMaster && funk.ContainsString(allowedStatuses, swag.StringValue(host.Status)) {
+		if common.GetEffectiveRole(host) == models.HostRoleMaster && funk.ContainsString(allowedStatuses, swag.StringValue(host.Status)) {
 			masterNodesIds = append(masterNodesIds, host.ID)
 		}
 	}
@@ -154,7 +154,7 @@ func getKnownMastersNodesIds(c *common.Cluster, db *gorm.DB) ([]*strfmt.UUID, er
 func NumberOfWorkers(c *common.Cluster) int {
 	num := 0
 	for _, host := range c.Hosts {
-		if host.Role != models.HostRoleWorker || *host.Status == models.HostStatusDisabled {
+		if common.GetEffectiveRole(host) != models.HostRoleWorker || *host.Status == models.HostStatusDisabled {
 			continue
 		}
 		num += 1
@@ -173,7 +173,7 @@ func MapWorkersHostsByStatus(c *common.Cluster) map[string][]*models.Host {
 func mapHostsByStatus(c *common.Cluster, role models.HostRole) map[string][]*models.Host {
 	hostMap := make(map[string][]*models.Host)
 	for _, host := range c.Hosts {
-		if role != "" && host.Role != role {
+		if role != "" && common.GetEffectiveRole(host) != role {
 			continue
 		}
 		if _, ok := hostMap[swag.StringValue(host.Status)]; ok {

internal/cluster/validator.go

+5 -4

@@ -380,12 +380,12 @@ func (v *clusterValidator) sufficientMastersCount(c *clusterPreprocessContext) V
 	candidates := make([]*models.Host, 0)
 
 	for _, host := range hosts {
-		switch role := host.Role; role {
+		switch role := common.GetEffectiveRole(host); role {
 		case models.HostRoleMaster:
-			//add pre-assigned master hosts to the masters list
+			//add pre-assigned/suggested master hosts to the masters list
 			masters = append(masters, host)
 		case models.HostRoleWorker:
-			//add pre-assigned worker hosts to the worker list
+			//add pre-assigned/suggested worker hosts to the worker list
 			workers = append(workers, host)
 		default:
 			//auto-assign hosts and other types go to the candidate list
@@ -397,7 +397,8 @@ func (v *clusterValidator) sufficientMastersCount(c *clusterPreprocessContext) V
 		//if allocated masters count is less than the desired count, find eligable hosts
 		//from the candidate pool to match the master count criteria, up to 3
 		if len(masters) < minMastersNeededForInstallation {
-			if isValid, err := v.hostAPI.IsValidMasterCandidate(h, c.cluster, c.db, v.log); isValid && err == nil {
+			candidate := *h
+			if isValid, err := v.hostAPI.IsValidMasterCandidate(&candidate, c.cluster, c.db, v.log); isValid && err == nil {
 				masters = append(masters, h)
 				continue
 			}

internal/common/common.go

+7

@@ -99,6 +99,13 @@ func AreMastersSchedulable(cluster *Cluster) bool {
 	return swag.BoolValue(cluster.SchedulableMasters)
 }
 
+func GetEffectiveRole(host *models.Host) models.HostRole {
+	if host.Role == models.HostRoleAutoAssign && host.SuggestedRole != "" {
+		return host.SuggestedRole
+	}
+	return host.Role
+}
+
 func GetConsoleUrl(clusterName, baseDomain string) string {
 	return fmt.Sprintf("%s.%s.%s", consoleUrlPrefix, clusterName, baseDomain)
 }

internal/hardware/validator.go

+1 -1

@@ -168,7 +168,7 @@ func (v *validator) GetClusterHostRequirements(ctx context.Context, cluster *com
 		return nil, err
 	}
 
-	ocpRequirements, err := v.getOCPClusterHostRoleRequirementsForVersion(cluster, host.Role)
+	ocpRequirements, err := v.getOCPClusterHostRoleRequirementsForVersion(cluster, common.GetEffectiveRole(host))
 	if err != nil {
 		return nil, err
 	}

internal/host/common.go

+2 -2

@@ -100,7 +100,7 @@ func updateRole(log logrus.FieldLogger, h *models.Host, role models.HostRole, su
 		*h.ID, h.InfraEnvID, srcRole).Updates(fields).Error
 }
 
-func GetHostnameAndRoleByIP(ip string, hosts []*models.Host) (string, models.HostRole, error) {
+func GetHostnameAndEffectiveRoleByIP(ip string, hosts []*models.Host) (string, models.HostRole, error) {
 	for _, h := range hosts {
 		if h.Inventory == "" {
 			continue
@@ -117,7 +117,7 @@ func GetHostnameAndRoleByIP(ip string, hosts []*models.Hos
 			return "", "", err
 		}
 		if ip == parsedIP.String() {
-			return getRealHostname(h, inv), h.Role, nil
+			return getRealHostname(h, inv), common.GetEffectiveRole(h), nil
 		}
 	}
 }

internal/host/common_test.go

+2 -2

@@ -18,7 +18,7 @@ type hostNetProfile struct {
 	ip       string
 }
 
-var _ = Describe("GetHostnameAndRoleByIP", func() {
+var _ = Describe("GetHostnameAndEffectiveRoleByIP", func() {
 
 	hostRolesIpv4 := []hostNetProfile{{role: models.HostRoleMaster, hostname: "master-0", ip: "1.2.3.1"}, {role: models.HostRoleWorker, hostname: "worker-0", ip: "1.2.3.2"}}
 	hostrolesIpv6 := []hostNetProfile{{role: models.HostRoleMaster, hostname: "master-1", ip: "1001:db8::11"}, {role: models.HostRoleWorker, hostname: "worker-1", ip: "1001:db8::12"}}
@@ -80,7 +80,7 @@ var _ = Describe("GetHostnameAndRoleByIP", func() {
 			h := hostutil.GenerateTestHostWithNetworkAddress(strfmt.UUID(uuid.New().String()), infraEnvID, clusterID, v.role, models.HostStatusKnown, netAddr)
 			hosts = append(hosts, h)
 		}
-		hostname, role, err := GetHostnameAndRoleByIP(test.targetIP, hosts)
+		hostname, role, err := GetHostnameAndEffectiveRoleByIP(test.targetIP, hosts)
 		if test.expectedError != nil {
 			Expect(err).To(Equal(test.expectedError))
 		} else {

internal/host/host.go

+45 -23

@@ -100,6 +100,7 @@ type LogTimeoutConfig struct {
 type Config struct {
 	LogTimeoutConfig
 	EnableAutoReset         bool          `envconfig:"ENABLE_AUTO_RESET" default:"false"`
+	EnableAutoAssign        bool          `envconfig:"ENABLE_AUTO_ASSIGN" default:"true"`
 	ResetTimeout            time.Duration `envconfig:"RESET_CLUSTER_TIMEOUT" default:"3m"`
 	MonitorBatchSize        int           `envconfig:"HOST_MONITOR_BATCH_SIZE" default:"100"`
 	DisabledHostvalidations DisabledHostValidations `envconfig:"DISABLED_HOST_VALIDATIONS" default:""` // Which host validations to disable (should not run in preprocess)
@@ -134,6 +135,7 @@ type API interface {
 	IsInstallable(h *models.Host) bool
 	// auto assign host role
 	AutoAssignRole(ctx context.Context, h *models.Host, db *gorm.DB) (bool, error)
+	RefreshRole(ctx context.Context, h *models.Host, db *gorm.DB) error
 	IsValidMasterCandidate(h *models.Host, c *common.Cluster, db *gorm.DB, log logrus.FieldLogger) (bool, error)
 	SetUploadLogsAt(ctx context.Context, h *models.Host, db *gorm.DB) error
 	UpdateLogsProgress(ctx context.Context, h *models.Host, progress string) error
@@ -375,6 +377,28 @@ func (m *Manager) updateInventory(ctx context.Context, cluster *common.Cluster,
 	}).Error
 }
 
+func (m *Manager) refreshRoleInternal(ctx context.Context, h *models.Host, db *gorm.DB, forceRefresh bool) error {
+	//update suggested role, if not yet set
+	var suggestedRole models.HostRole
+	var err error
+	if m.Config.EnableAutoAssign || forceRefresh {
+		//because of possible hw changes, suggested role should be calculated
+		//periodically even if the suggested role is already set
+		if h.Role == models.HostRoleAutoAssign &&
+			funk.ContainsString(hostStatusesBeforeInstallation[:], *h.Status) {
+			if suggestedRole, err = m.autoRoleSelection(ctx, h, db); err == nil {
+				if h.SuggestedRole != suggestedRole {
+					if err = updateRole(m.log, h, h.Role, suggestedRole, db, string(h.Role)); err == nil {
+						h.SuggestedRole = suggestedRole
+						m.log.Infof("suggested role for host %s is %s", *h.ID, suggestedRole)
+					}
+				}
+			}
+		}
+	}
+	return err
+}
+
 func (m *Manager) refreshStatusInternal(ctx context.Context, h *models.Host, c *common.Cluster, i *common.InfraEnv, db *gorm.DB) error {
 	if db == nil {
 		db = m.db
@@ -409,15 +433,6 @@ func (m *Manager) refreshStatusInternal(ctx context.Context, h *models.Host, c *
 		}
 	}
 
-	//update suggested role, if not yet set
-	var suggestedRole models.HostRole
-	if h.Role == models.HostRoleAutoAssign && h.SuggestedRole == models.HostRoleAutoAssign &&
-		funk.ContainsString(hostStatusesBeforeInstallation[:], *h.Status) {
-		if suggestedRole, err = m.autoRoleSelection(ctx, h, db); err == nil {
-			_ = updateRole(m.log, h, h.Role, suggestedRole, db, string(h.Role))
-		}
-	}
-
 	err = m.sm.Run(TransitionTypeRefresh, newStateHost(h), &TransitionArgsRefreshHost{
 		ctx: ctx,
 		db:  db,
@@ -431,6 +446,13 @@ func (m *Manager) refreshStatusInternal(ctx context.Context, h *models.Host, c *
 	return nil
 }
 
+func (m *Manager) RefreshRole(ctx context.Context, h *models.Host, db *gorm.DB) error {
+	if db == nil {
+		db = m.db
+	}
+	return m.refreshRoleInternal(ctx, h, db, true)
+}
+
 func (m *Manager) RefreshStatus(ctx context.Context, h *models.Host, db *gorm.DB) error {
 	if db == nil {
 		db = m.db
@@ -1009,17 +1031,16 @@ func (m *Manager) AutoAssignRole(ctx context.Context, h *models.Host, db *gorm.D
 	if h.Role == models.HostRoleAutoAssign {
 		log := logutil.FromContext(ctx, m.log)
 		// If role is auto-assigned calculate the suggested roles
-		// This logic will moved to the monitor soon
-		suggestedRole, err := m.autoRoleSelection(ctx, h, db)
-		if err != nil {
+		// to make sure the suggestion is fresh
+		if err := m.RefreshRole(ctx, h, db); err != nil { //force refresh
 			return false, err
 		}
 
 		//copy the suggested role into the role and update the host record
-		log.Infof("suggested role %s for host %s cluster %s", suggestedRole, h.ID.String(), h.ClusterID.String())
-		if err := updateRole(m.log, h, suggestedRole, suggestedRole, db, string(models.HostRoleAutoAssign)); err != nil {
+		log.Infof("suggested role %s for host %s cluster %s", h.SuggestedRole, h.ID.String(), h.ClusterID.String())
+		if err := updateRole(m.log, h, h.SuggestedRole, h.SuggestedRole, db, string(models.HostRoleAutoAssign)); err != nil {
 			log.WithError(err).Errorf("failed to update role %s for host %s cluster %s",
-				suggestedRole, h.ID.String(), h.ClusterID.String())
+				h.SuggestedRole, h.ID.String(), h.ClusterID.String())
 			return true, err
 		}
 
@@ -1061,15 +1082,15 @@ func (m *Manager) selectRole(ctx context.Context, h *models.Host, db *gorm.DB) (
 			h.ID.String(), h.ClusterID.String())
 	}
 
-	// count already existing masters
-	mastersCount := 0
-	if err = db.Model(&models.Host{}).Where("cluster_id = ? and status != ? and role = ?",
-		h.ClusterID, models.HostStatusDisabled, models.HostRoleMaster).Count(&mastersCount).Error; err != nil {
+	// count already existing masters or hosts with suggested role of master
+	otherMastersCount := 0
+	if err = db.Model(&models.Host{}).Where("cluster_id = ? and id != ? and status != ? and (role = ? or suggested_role = ?)",
+		h.ClusterID, h.ID, models.HostStatusDisabled, models.HostRoleMaster, models.HostRoleMaster).Count(&otherMastersCount).Error; err != nil {
 		log.WithError(err).Errorf("failed to count masters in cluster %s", h.ClusterID.String())
 		return autoSelectedRole, err
 	}
 
-	if mastersCount < common.MinMasterHostsNeededForInstallation {
+	if otherMastersCount < common.MinMasterHostsNeededForInstallation {
 		h.Role = models.HostRoleMaster
 		vc, err = newValidationContext(h, nil, nil, db, m.hwValidator)
 		if err != nil {
@@ -1269,13 +1290,14 @@ func (m *Manager) captureConnectivityReportMetrics(ctx context.Context, openshif
 	}
 	for _, r := range connectivityReport.RemoteHosts {
 		for _, l3 := range r.L3Connectivity {
-			_, targetRole, err := GetHostnameAndRoleByIP(l3.RemoteIPAddress, hosts)
+			_, targetRole, err := GetHostnameAndEffectiveRoleByIP(l3.RemoteIPAddress, hosts)
 			if err != nil {
 				log.Warn(err)
 				continue
 			}
-			m.metricApi.NetworkLatencyBetweenHosts(openshiftVersion, h.Role, targetRole, l3.AverageRTTMs)
-			m.metricApi.PacketLossBetweenHosts(openshiftVersion, h.Role, targetRole, l3.PacketLossPercentage)
+			effectiveRole := common.GetEffectiveRole(h)
+			m.metricApi.NetworkLatencyBetweenHosts(openshiftVersion, effectiveRole, targetRole, l3.AverageRTTMs)
+			m.metricApi.PacketLossBetweenHosts(openshiftVersion, effectiveRole, targetRole, l3.PacketLossPercentage)
 		}
 	}
 }
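Two details of the reworked `selectRole` query are easy to miss: the host being evaluated is now excluded by `id != ?`, and hosts whose *suggested* role is master count toward the quorum alongside explicitly assigned masters. A sketch of the query in isolation (a hypothetical extraction, assuming the repo's gorm v1 and generated models; imports omitted):

```go
// countOtherMasters isolates the count from selectRole. Roughly:
//   SELECT count(*) FROM hosts
//   WHERE cluster_id = ? AND id != ? AND status != 'disabled'
//     AND (role = 'master' OR suggested_role = 'master');
func countOtherMasters(db *gorm.DB, h *models.Host) (int, error) {
	otherMastersCount := 0
	err := db.Model(&models.Host{}).
		Where("cluster_id = ? and id != ? and status != ? and (role = ? or suggested_role = ?)",
			h.ClusterID, h.ID, models.HostStatusDisabled, models.HostRoleMaster, models.HostRoleMaster).
		Count(&otherMastersCount).Error
	return otherMastersCount, err
}
```

Without the `id != ?` exclusion, a host whose own suggested_role is already master could count itself toward the minimum and flip to worker on re-evaluation.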

internal/host/host_test.go

+1

@@ -41,6 +41,7 @@ var (
 	defaultConfig = &Config{
 		ResetTimeout:            3 * time.Minute,
 		EnableAutoReset:         true,
+		EnableAutoAssign:        true,
 		MonitorBatchSize:        100,
 		DisabledHostvalidations: defaultDisabledHostValidations,
 	}

internal/host/hostcommands/install_cmd.go

+1 -1

@@ -102,7 +102,7 @@ func (i *installCmd) GetSteps(ctx context.Context, host *models.Host) ([]*models
 }
 
 func (i *installCmd) getFullInstallerCommand(cluster *common.Cluster, host *models.Host, infraEnv *common.InfraEnv, bootdevice string) (string, error) {
-	role := host.Role
+	role := common.GetEffectiveRole(host)
 	if host.Bootstrap {
 		role = models.HostRoleBootstrap
 	}

internal/host/hostcommands/logs_cmd.go

+1 -1

@@ -125,7 +125,7 @@ func (i *logsCmd) getNonBootstrapMastersIPsInHostCluster(ctx context.Context, ho
 func (i *logsCmd) getHostsIpsfromMachineCIDR(cluster common.Cluster) ([]string, error) {
 	var ips []string
 	for _, h := range cluster.Hosts {
-		if h.Bootstrap || h.Role == models.HostRoleWorker {
+		if h.Bootstrap || common.GetEffectiveRole(h) == models.HostRoleWorker {
 			continue
 		}
 		ip, err := network.GetPrimaryMachineCIDRIP(h, &cluster)

internal/host/hostrole_test.go

+1 -13

@@ -10,9 +10,7 @@ import (
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
 	"github.com/openshift/assisted-service/internal/common"
-	eventgen "github.com/openshift/assisted-service/internal/common/events"
 	"github.com/openshift/assisted-service/internal/events"
-	"github.com/openshift/assisted-service/internal/events/eventstest"
 	"github.com/openshift/assisted-service/internal/hardware"
 	"github.com/openshift/assisted-service/internal/host/hostutil"
 	"github.com/openshift/assisted-service/internal/operators"
@@ -77,14 +75,12 @@ var _ = Describe("Suggested-Role on Refresh", func() {
 		inventory      string
 		srcState       string
 		suggested_role models.HostRole
-		eventType      *string
 	}{
 		{
 			name:           "insufficient worker memory --> suggested as worker",
 			inventory:      hostutil.GenerateInventoryWithResourcesWithBytes(4, conversions.MibToBytes(150), conversions.MibToBytes(150), "worker"),
 			srcState:       models.HostStatusDiscovering,
 			suggested_role: models.HostRoleWorker,
-			eventType:      &eventgen.HostStatusUpdatedEventName,
 		},
 		{
 			name: "sufficient master memory --> suggested as master when masters < 3",
@@ -97,7 +93,6 @@ var _ = Describe("Suggested-Role on Refresh", func() {
 			inventory:      workerInventory(),
 			srcState:       models.HostStatusKnown,
 			suggested_role: models.HostRoleWorker,
-			eventType:      &eventgen.HostStatusUpdatedEventName,
 		},
 	}
 
@@ -114,15 +109,8 @@ var _ = Describe("Suggested-Role on Refresh", func() {
 			host.SuggestedRole = models.HostRoleAutoAssign
 			Expect(db.Create(&host).Error).ShouldNot(HaveOccurred())
 			mockDefaultClusterHostRequirements(mockHwValidator)
-			if t.eventType != nil {
-				mockEvents.EXPECT().SendHostEvent(gomock.Any(), eventstest.NewEventMatcher(
-					eventstest.WithNameMatcher(*t.eventType),
-					eventstest.WithHostIdMatcher(host.ID.String()),
-					eventstest.WithInfraEnvIdMatcher(host.InfraEnvID.String()),
-				))
-			}
 
-			err := hapi.RefreshStatus(ctx, &host, db)
+			err := hapi.RefreshRole(ctx, &host, db)
 			Expect(err).ToNot(HaveOccurred())
 
 			var resultHost models.Host

internal/host/hostutil/host_utils.go

+1 -1

@@ -144,7 +144,7 @@ func GetDiskByInstallationPath(disks []*models.Disk, installationPath string) *m
 }
 
 func IgnitionFileName(host *models.Host) string {
-	return fmt.Sprintf("%s-%s.ign", host.Role, host.ID)
+	return fmt.Sprintf("%s-%s.ign", common.GetEffectiveRole(host), host.ID)
}
 
 var allowedFlags = []string{"--append-karg", "--delete-karg", "-n", "--copy-network", "--network-dir", "--save-partlabel", "--save-partindex", "--image-url"}
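One user-visible consequence: the ignition file for an auto-assign host is now named after its effective role. A toy illustration with hypothetical values:

```go
package main

import "fmt"

func main() {
	// Values are hypothetical; effectiveRole is what
	// common.GetEffectiveRole(host) would return.
	effectiveRole := "worker"
	hostID := "1bb14786"
	fmt.Printf("%s-%s.ign\n", effectiveRole, hostID)
	// Output: worker-1bb14786.ign (previously auto-assign-1bb14786.ign)
}
```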

internal/host/mock_host_api.go

+14

(Generated mock file; GitHub does not render it by default.)

internal/host/monitor.go

+7

@@ -80,6 +80,13 @@ func (m *Manager) clusterHostMonitoring() int64 {
 			if err != nil {
 				log.WithError(err).Errorf("failed to refresh host %s state", *host.ID)
 			}
+			//the refreshed role will be taken into account
+			//on the next monitor cycle. The force flag is a workaround
+			//until the feature flag is removed
+			err = m.refreshRoleInternal(ctx, host, m.db, false)
+			if err != nil {
+				log.WithError(err).Errorf("failed to refresh host %s role", *host.ID)
+			}
 		}
 	}
 }
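The `ENABLE_AUTO_ASSIGN` flag is declared via the struct tags in the `Config` diff above, which match the `github.com/kelseyhightower/envconfig` conventions used for the neighboring fields. A hypothetical standalone sketch of how such a tag resolves, assuming that library:

```go
package main

import (
	"fmt"
	"log"

	"github.com/kelseyhightower/envconfig"
)

type Config struct {
	// Unset -> true (struct-tag default); ENABLE_AUTO_ASSIGN=false shuts
	// off the periodic suggested-role calculation in the monitor, while
	// RefreshRole can still force it via the forceRefresh argument.
	EnableAutoAssign bool `envconfig:"ENABLE_AUTO_ASSIGN" default:"true"`
}

func main() {
	var cfg Config
	// Process reads ENABLE_AUTO_ASSIGN from the environment, falling back
	// to the struct-tag default when the variable is not set.
	if err := envconfig.Process("", &cfg); err != nil {
		log.Fatal(err)
	}
	fmt.Println("auto-assign role suggestion enabled:", cfg.EnableAutoAssign)
}
```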
