github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/pkg/lorry/operations/replica/checkrole.go (about) 1 /* 2 Copyright (C) 2022-2023 ApeCloud Co., Ltd 3 4 This file is part of KubeBlocks project 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU Affero General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU Affero General Public License for more details. 15 16 You should have received a copy of the GNU Affero General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 package replica 21 22 import ( 23 "context" 24 "encoding/json" 25 "fmt" 26 "strings" 27 "time" 28 29 "github.com/go-logr/logr" 30 "github.com/pkg/errors" 31 "github.com/spf13/viper" 32 ctrl "sigs.k8s.io/controller-runtime" 33 34 "github.com/1aal/kubeblocks/pkg/constant" 35 "github.com/1aal/kubeblocks/pkg/lorry/dcs" 36 "github.com/1aal/kubeblocks/pkg/lorry/engines/register" 37 "github.com/1aal/kubeblocks/pkg/lorry/operations" 38 "github.com/1aal/kubeblocks/pkg/lorry/util" 39 ) 40 41 // AccessMode defines SVC access mode enums. 42 // +enum 43 type AccessMode string 44 45 type CheckRole struct { 46 operations.Base 47 logger logr.Logger 48 dcsStore dcs.DCS 49 OriRole string 50 CheckRoleFailedCount int 51 FailedEventReportFrequency int 52 ProbeTimeout time.Duration 53 DBRoles map[string]AccessMode 54 } 55 56 var checkrole operations.Operation = &CheckRole{} 57 58 func init() { 59 err := operations.Register(strings.ToLower(string(util.CheckRoleOperation)), checkrole) 60 if err != nil { 61 panic(err.Error()) 62 } 63 } 64 65 func (s *CheckRole) Init(ctx context.Context) error { 66 s.dcsStore = dcs.GetStore() 67 if s.dcsStore == nil { 68 return errors.New("dcs store init failed") 69 } 70 71 val := viper.GetString("KB_SERVICE_ROLES") 72 if val != "" { 73 if err := json.Unmarshal([]byte(val), &s.DBRoles); err != nil { 74 fmt.Println(errors.Wrap(err, "KB_DB_ROLES env format error").Error()) 75 } 76 } 77 78 s.FailedEventReportFrequency = viper.GetInt("KB_FAILED_EVENT_REPORT_FREQUENCY") 79 if s.FailedEventReportFrequency < 300 { 80 s.FailedEventReportFrequency = 300 81 } else if s.FailedEventReportFrequency > 3600 { 82 s.FailedEventReportFrequency = 3600 83 } 84 85 timeoutSeconds := util.DefaultProbeTimeoutSeconds 86 if viper.IsSet(constant.KBEnvRoleProbeTimeout) { 87 timeoutSeconds = viper.GetInt(constant.KBEnvRoleProbeTimeout) 88 } 89 // lorry utilizes the pod readiness probe to trigger role probe and 'timeoutSeconds' is directly copied from the 'probe.timeoutSeconds' field of pod. 90 // here we give 80% of the total time to role probe job and leave the remaining 20% to kubelet to handle the readiness probe related tasks. 91 s.ProbeTimeout = time.Duration(timeoutSeconds) * (800 * time.Millisecond) 92 s.logger = ctrl.Log.WithName("checkrole") 93 return nil 94 } 95 96 func (s *CheckRole) IsReadonly(ctx context.Context) bool { 97 return true 98 } 99 100 func (s *CheckRole) Do(ctx context.Context, req *operations.OpsRequest) (*operations.OpsResponse, error) { 101 manager, err := register.GetDBManager() 102 if err != nil { 103 return nil, errors.Wrap(err, "get manager failed") 104 } 105 106 resp := &operations.OpsResponse{ 107 Data: map[string]any{}, 108 } 109 resp.Data["operation"] = util.CheckRoleOperation 110 resp.Data["originalRole"] = s.OriRole 111 112 if !manager.IsDBStartupReady() { 113 resp.Data["message"] = "db not ready" 114 return resp, nil 115 } 116 117 cluster := s.dcsStore.GetClusterFromCache() 118 119 ctx1, cancel := context.WithTimeout(ctx, s.ProbeTimeout) 120 defer cancel() 121 role, err := manager.GetReplicaRole(ctx1, cluster) 122 if err != nil { 123 s.logger.Info("executing checkRole error", "error", err) 124 // do not return err, as it will cause readinessprobe to fail 125 err = nil 126 if s.CheckRoleFailedCount%s.FailedEventReportFrequency == 0 { 127 s.logger.Info("role checks failed continuously", "times", s.CheckRoleFailedCount) 128 // if err is not nil, send event through kubelet readinessprobe 129 err = util.SentEventForProbe(ctx, resp.Data) 130 } 131 s.CheckRoleFailedCount++ 132 return resp, err 133 } 134 135 s.CheckRoleFailedCount = 0 136 if isValid, message := s.roleValidate(role); !isValid { 137 resp.Data["message"] = message 138 return resp, nil 139 } 140 141 resp.Data["role"] = role 142 if s.OriRole == role { 143 return nil, nil 144 } 145 resp.Data["event"] = util.OperationSuccess 146 s.OriRole = role 147 err = util.SentEventForProbe(ctx, resp.Data) 148 return resp, err 149 } 150 151 // Component may have some internal roles that needn't be exposed to end user, 152 // and not configured in cluster definition, e.g. ETCD's Candidate. 153 // roleValidate is used to filter the internal roles and decrease the number 154 // of report events to reduce the possibility of event conflicts. 155 func (s *CheckRole) roleValidate(role string) (bool, string) { 156 // do not validate them when db roles setting is missing 157 if len(s.DBRoles) == 0 { 158 return true, "" 159 } 160 161 var msg string 162 isValid := false 163 for r := range s.DBRoles { 164 if strings.EqualFold(r, role) { 165 isValid = true 166 break 167 } 168 } 169 if !isValid { 170 msg = fmt.Sprintf("role %s is not configured in cluster definition %v", role, s.DBRoles) 171 } 172 return isValid, msg 173 }