github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/controllers/apps/operations/ops_util.go

/*
Copyright (C) 2022-2023 ApeCloud Co., Ltd

This file is part of KubeBlocks project

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package operations

import (
	"context"
	"fmt"
	"math"
	"reflect"
	"time"

	"golang.org/x/exp/slices"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"

	appsv1alpha1 "github.com/1aal/kubeblocks/apis/apps/v1alpha1"
	workloads "github.com/1aal/kubeblocks/apis/workloads/v1alpha1"
	"github.com/1aal/kubeblocks/controllers/apps/components"
	opsutil "github.com/1aal/kubeblocks/controllers/apps/operations/util"
	"github.com/1aal/kubeblocks/pkg/configuration/core"
	"github.com/1aal/kubeblocks/pkg/constant"
	intctrlutil "github.com/1aal/kubeblocks/pkg/controllerutil"
)

// componentFailedTimeout is the failure tolerance window: when a component has been failed
// for longer than this duration, the opsRequest is determined to have failed.
const componentFailedTimeout = 30 * time.Second

var _ error = &WaitForClusterPhaseErr{}

type WaitForClusterPhaseErr struct {
	clusterName   string
	currentPhase  appsv1alpha1.ClusterPhase
	expectedPhase []appsv1alpha1.ClusterPhase
}

func (e *WaitForClusterPhaseErr) Error() string {
	return fmt.Sprintf("wait for cluster %s to reach phase %v, current status is: %s", e.clusterName, e.expectedPhase, e.currentPhase)
}

type handleStatusProgressWithComponent func(reqCtx intctrlutil.RequestCtx,
	cli client.Client,
	opsRes *OpsResource,
	pgRes progressResource,
	compStatus *appsv1alpha1.OpsRequestComponentStatus) (expectProgressCount int32, succeedCount int32, err error)

type handleReconfigureOpsStatus func(cmStatus *appsv1alpha1.ConfigurationItemStatus) error
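// exampleNoopStatusProgress is an illustrative sketch (hypothetical helper, not used by the
// operator) of a handleStatusProgressWithComponent callback: it simply reports every declared
// replica of the component as both expected and completed. Real callbacks (restart, scaling,
// etc.) inspect the component's pods and record per-object progress details in compStatus.
// It assumes ClusterComponentSpec.Replicas holds the desired replica count.
func exampleNoopStatusProgress(reqCtx intctrlutil.RequestCtx,
	cli client.Client,
	opsRes *OpsResource,
	pgRes progressResource,
	compStatus *appsv1alpha1.OpsRequestComponentStatus) (int32, int32, error) {
	if pgRes.clusterComponent == nil {
		return 0, 0, nil
	}
	replicas := pgRes.clusterComponent.Replicas
	// expected count == completed count, i.e. the component is always considered done.
	return replicas, replicas, nil
}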
// reconcileActionWithComponentOps is performed after the action is done and is reconciled
// repeatedly until OpsRequest.status.phase is Succeed/Failed.
// It is the common function to reconcile the opsRequest status when the opsRequest affects
// the lifecycle of the cluster components.
func reconcileActionWithComponentOps(reqCtx intctrlutil.RequestCtx,
	cli client.Client,
	opsRes *OpsResource,
	opsMessageKey string,
	handleStatusProgress handleStatusProgressWithComponent,
) (appsv1alpha1.OpsPhase, time.Duration, error) {
	if opsRes == nil {
		return "", 0, nil
	}
	opsRequestPhase := appsv1alpha1.OpsRunningPhase
	clusterDef, err := getClusterDefByName(reqCtx.Ctx, cli,
		opsRes.Cluster.Spec.ClusterDefRef)
	if err != nil {
		return opsRequestPhase, 0, err
	}
	var (
		opsRequest               = opsRes.OpsRequest
		isFailed                 bool
		ok                       bool
		expectProgressCount      int32
		completedProgressCount   int32
		checkAllClusterComponent bool
		requeueTimeAfterFailed   time.Duration
	)
	componentNameMap := opsRequest.GetComponentNameSet()
	// if no components are specified, check the phases of all components of the cluster.
	if len(componentNameMap) == 0 {
		checkAllClusterComponent = true
	}
	patch := client.MergeFrom(opsRequest.DeepCopy())
	oldOpsRequestStatus := opsRequest.Status.DeepCopy()
	if opsRequest.Status.Components == nil {
		opsRequest.Status.Components = map[string]appsv1alpha1.OpsRequestComponentStatus{}
	}
	opsIsCompleted := opsRequestHasProcessed(reqCtx, cli, *opsRes)
	for k, v := range opsRes.Cluster.Status.Components {
		if _, ok = componentNameMap[k]; !ok && !checkAllClusterComponent {
			continue
		}
		var compStatus appsv1alpha1.OpsRequestComponentStatus
		if compStatus, ok = opsRequest.Status.Components[k]; !ok {
			compStatus = appsv1alpha1.OpsRequestComponentStatus{}
		}
		lastFailedTime := compStatus.LastFailedTime
		if components.IsFailedOrAbnormal(v.Phase) {
			isFailed = true
			if lastFailedTime.IsZero() {
				lastFailedTime = metav1.Now()
			}
			if time.Now().Before(lastFailedTime.Add(componentFailedTimeout)) {
				requeueTimeAfterFailed = componentFailedTimeout - time.Since(lastFailedTime.Time)
			}
		} else if !lastFailedTime.IsZero() {
			// reset lastFailedTime if the component is no longer failed
			lastFailedTime = metav1.Time{}
		}
		if compStatus.Phase != v.Phase {
			compStatus.Phase = v.Phase
			compStatus.LastFailedTime = lastFailedTime
		}
		clusterComponent := opsRes.Cluster.Spec.GetComponentByName(k)
		expectCount, completedCount, err := handleStatusProgress(reqCtx, cli, opsRes, progressResource{
			opsMessageKey:       opsMessageKey,
			clusterComponent:    clusterComponent,
			clusterComponentDef: clusterDef.GetComponentDefByName(clusterComponent.ComponentDefRef),
			opsIsCompleted:      opsIsCompleted,
		}, &compStatus)
		if err != nil {
			if intctrlutil.IsTargetError(err, intctrlutil.ErrorWaitCacheRefresh) {
				return opsRequestPhase, time.Second, nil
			}
			return opsRequestPhase, 0, err
		}
		expectProgressCount += expectCount
		completedProgressCount += completedCount
		opsRequest.Status.Components[k] = compStatus
	}
	opsRequest.Status.Progress = fmt.Sprintf("%d/%d", completedProgressCount, expectProgressCount)
	if !reflect.DeepEqual(opsRequest.Status, *oldOpsRequestStatus) {
		if err = cli.Status().Patch(reqCtx.Ctx, opsRequest, patch); err != nil {
			return opsRequestPhase, 0, err
		}
	}
	// check if the cluster has applied the changes of the opsRequest and wait for the cluster to finish processing the ops.
	if !opsIsCompleted {
		return opsRequestPhase, 0, nil
	}

	if isFailed {
		if requeueTimeAfterFailed != 0 {
			// the component failure may be temporary; wait for the component failure timeout.
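			// requeueTimeAfterFailed is the time remaining until componentFailedTimeout
			// elapses for the failed component; returning it here requeues the reconcile
			// so the failure is re-evaluated (and the ops marked Failed) once it expires.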
			return opsRequestPhase, requeueTimeAfterFailed, nil
		}
		return appsv1alpha1.OpsFailedPhase, 0, nil
	}
	if completedProgressCount != expectProgressCount {
		return opsRequestPhase, time.Second, nil
	}
	return appsv1alpha1.OpsSucceedPhase, 0, nil
}

// opsRequestHasProcessed checks if the opsRequest has been processed by the cluster.
func opsRequestHasProcessed(reqCtx intctrlutil.RequestCtx, cli client.Client, opsRes OpsResource) bool {
	if opsRes.ToClusterPhase == opsRes.Cluster.Status.Phase {
		return false
	}
	// if all pods of all components are at the latest revision, the ops has been processed
	rsmList := &workloads.ReplicatedStateMachineList{}
	if err := cli.List(reqCtx.Ctx, rsmList,
		client.InNamespace(opsRes.Cluster.Namespace),
		client.MatchingLabels{constant.AppInstanceLabelKey: opsRes.Cluster.Name}); err != nil {
		return false
	}
	for _, rsm := range rsmList.Items {
		isLatestRevision, err := components.IsComponentPodsWithLatestRevision(reqCtx.Ctx, cli, opsRes.Cluster, &rsm)
		if err != nil {
			return false
		}
		if !isLatestRevision {
			return false
		}
	}
	return true
}

// getClusterDefByName gets the ClusterDefinition object by the name.
func getClusterDefByName(ctx context.Context, cli client.Client, clusterDefName string) (*appsv1alpha1.ClusterDefinition, error) {
	clusterDef := &appsv1alpha1.ClusterDefinition{}
	if err := cli.Get(ctx, client.ObjectKey{Name: clusterDefName}, clusterDef); err != nil {
		return nil, err
	}
	return clusterDef, nil
}

// PatchOpsStatusWithOpsDeepCopy patches OpsRequest.status with the deepCopy opsRequest.
func PatchOpsStatusWithOpsDeepCopy(ctx context.Context,
	cli client.Client,
	opsRes *OpsResource,
	opsRequestDeepCopy *appsv1alpha1.OpsRequest,
	phase appsv1alpha1.OpsPhase,
	condition ...*metav1.Condition) error {

	opsRequest := opsRes.OpsRequest
	patch := client.MergeFrom(opsRequestDeepCopy)
	for _, v := range condition {
		if v == nil {
			continue
		}
		opsRequest.SetStatusCondition(*v)
		// emit an event
		eventType := corev1.EventTypeNormal
		if phase == appsv1alpha1.OpsFailedPhase {
			eventType = corev1.EventTypeWarning
		}
		opsRes.Recorder.Event(opsRequest, eventType, v.Reason, v.Message)
	}
	if opsRequest.IsComplete(phase) {
		opsRequest.Status.CompletionTimestamp = metav1.Time{Time: time.Now()}
		// when the OpsRequest is completed, remove it from the cluster annotation
		if err := DeleteOpsRequestAnnotationInCluster(ctx, cli, opsRes); err != nil {
			return err
		}
	}
	if phase == appsv1alpha1.OpsCreatingPhase && opsRequest.Status.Phase != phase {
		opsRequest.Status.StartTimestamp = metav1.Time{Time: time.Now()}
	}
	opsRequest.Status.Phase = phase
	return cli.Status().Patch(ctx, opsRequest, patch)
}

// PatchOpsStatus patches OpsRequest.status
func PatchOpsStatus(ctx context.Context,
	cli client.Client,
	opsRes *OpsResource,
	phase appsv1alpha1.OpsPhase,
	condition ...*metav1.Condition) error {
	return PatchOpsStatusWithOpsDeepCopy(ctx, cli, opsRes, opsRes.OpsRequest.DeepCopy(), phase, condition...)
}
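// exampleReconcileAction is an illustrative sketch (hypothetical helper, not used by the
// operator) of how an ops handler's reconcile step typically consumes
// reconcileActionWithComponentOps: pass a per-component progress callback (here the
// exampleNoopStatusProgress sketch above) and propagate the resulting phase and requeue
// interval. "example restart" is only a placeholder message key.
func exampleReconcileAction(reqCtx intctrlutil.RequestCtx, cli client.Client, opsRes *OpsResource) (appsv1alpha1.OpsPhase, time.Duration, error) {
	return reconcileActionWithComponentOps(reqCtx, cli, opsRes, "example restart", exampleNoopStatusProgress)
}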
// PatchClusterNotFound patches the ClusterNotFound condition to the OpsRequest.status.conditions.
func PatchClusterNotFound(ctx context.Context, cli client.Client, opsRes *OpsResource) error {
	message := fmt.Sprintf("spec.clusterRef %s is not found", opsRes.OpsRequest.Spec.ClusterRef)
	condition := appsv1alpha1.NewValidateFailedCondition(appsv1alpha1.ReasonClusterNotFound, message)
	return PatchOpsStatus(ctx, cli, opsRes, appsv1alpha1.OpsFailedPhase, condition)
}

// patchOpsHandlerNotSupported patches the OpsNotSupported condition to the OpsRequest.status.conditions.
func patchOpsHandlerNotSupported(ctx context.Context, cli client.Client, opsRes *OpsResource) error {
	message := fmt.Sprintf("spec.type %s is not supported by operator", opsRes.OpsRequest.Spec.Type)
	condition := appsv1alpha1.NewValidateFailedCondition(appsv1alpha1.ReasonOpsTypeNotSupported, message)
	return PatchOpsStatus(ctx, cli, opsRes, appsv1alpha1.OpsFailedPhase, condition)
}

// patchValidateErrorCondition patches the ValidateError condition to the OpsRequest.status.conditions.
func patchValidateErrorCondition(ctx context.Context, cli client.Client, opsRes *OpsResource, errMessage string) error {
	condition := appsv1alpha1.NewValidateFailedCondition(appsv1alpha1.ReasonValidateFailed, errMessage)
	return PatchOpsStatus(ctx, cli, opsRes, appsv1alpha1.OpsFailedPhase, condition)
}

// patchFastFailErrorCondition patches a new failed condition to the OpsRequest.status.conditions.
func patchFastFailErrorCondition(ctx context.Context, cli client.Client, opsRes *OpsResource, err error) error {
	condition := appsv1alpha1.NewFailedCondition(opsRes.OpsRequest, err)
	return PatchOpsStatus(ctx, cli, opsRes, appsv1alpha1.OpsFailedPhase, condition)
}

// GetOpsRecorderFromSlice gets the OpsRecorder from the slice by the opsRequest name.
func GetOpsRecorderFromSlice(opsRequestSlice []appsv1alpha1.OpsRecorder,
	opsRequestName string) (int, appsv1alpha1.OpsRecorder) {
	for i, v := range opsRequestSlice {
		if v.Name == opsRequestName {
			return i, v
		}
	}
	// if not found, return -1 and an empty OpsRecorder object
	return -1, appsv1alpha1.OpsRecorder{}
}

// patchOpsRequestToCreating patches OpsRequest.status.phase to Creating.
func patchOpsRequestToCreating(reqCtx intctrlutil.RequestCtx,
	cli client.Client,
	opsRes *OpsResource,
	opsDeepCopy *appsv1alpha1.OpsRequest,
	opsHandler OpsHandler) error {
	var condition *metav1.Condition
	validatePassCondition := appsv1alpha1.NewValidatePassedCondition(opsRes.OpsRequest.Name)
	condition, err := opsHandler.ActionStartedCondition(reqCtx, cli, opsRes)
	if err != nil {
		return err
	}
	return PatchOpsStatusWithOpsDeepCopy(reqCtx.Ctx, cli, opsRes, opsDeepCopy, appsv1alpha1.OpsCreatingPhase, validatePassCondition, condition)
}
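// exampleIsOpsRecorded is an illustrative sketch (hypothetical helper, not used by the
// operator) of the lookup pattern used by the annotation helpers below: read the recorded
// ops from the cluster annotation, then search the slice by the opsRequest name.
func exampleIsOpsRecorded(opsRes *OpsResource) (bool, error) {
	opsRequestSlice, err := opsutil.GetOpsRequestSliceFromCluster(opsRes.Cluster)
	if err != nil {
		return false, err
	}
	index, _ := GetOpsRecorderFromSlice(opsRequestSlice, opsRes.OpsRequest.Name)
	// GetOpsRecorderFromSlice returns -1 when the opsRequest is not recorded.
	return index != -1, nil
}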
// DeleteOpsRequestAnnotationInCluster removes the OpsRequest annotation from the cluster
// when OpsRequest.status.phase is Succeeded or Failed, which unlocks the cluster.
func DeleteOpsRequestAnnotationInCluster(ctx context.Context, cli client.Client, opsRes *OpsResource) error {
	var (
		opsRequestSlice []appsv1alpha1.OpsRecorder
		err             error
	)
	if opsRequestSlice, err = opsutil.GetOpsRequestSliceFromCluster(opsRes.Cluster); err != nil {
		return err
	}
	index, opsRecord := GetOpsRecorderFromSlice(opsRequestSlice, opsRes.OpsRequest.Name)
	if opsRecord.Name == "" {
		return nil
	}
	// delete the opsRequest information in Cluster.annotations
	opsRequestSlice = slices.Delete(opsRequestSlice, index, index+1)
	return opsutil.PatchClusterOpsAnnotations(ctx, cli, opsRes.Cluster, opsRequestSlice)
}

// addOpsRequestAnnotationToCluster adds the OpsRequest annotation to Cluster.metadata.annotations to acquire the lock.
func addOpsRequestAnnotationToCluster(ctx context.Context, cli client.Client, opsRes *OpsResource, opsBehaviour OpsBehaviour) error {
	var (
		opsRequestSlice []appsv1alpha1.OpsRecorder
		err             error
	)
	if opsBehaviour.ToClusterPhase == "" {
		return nil
	}
	// if the running opsRequest is being deleted, do not patch the opsRequest annotation on the cluster.
	if !opsRes.OpsRequest.DeletionTimestamp.IsZero() {
		return nil
	}
	if opsRequestSlice, err = opsutil.GetOpsRequestSliceFromCluster(opsRes.Cluster); err != nil {
		return err
	}
	// check whether the OpsRequest is already recorded
	if _, opsRecorder := GetOpsRecorderFromSlice(opsRequestSlice, opsRes.OpsRequest.Name); opsRecorder.Name != "" {
		return nil
	}
	if opsRequestSlice == nil {
		opsRequestSlice = make([]appsv1alpha1.OpsRecorder, 0)
	}
	opsRequestSlice = append(opsRequestSlice, appsv1alpha1.OpsRecorder{
		Name: opsRes.OpsRequest.Name,
		Type: opsRes.OpsRequest.Spec.Type,
	})
	return opsutil.UpdateClusterOpsAnnotations(ctx, cli, opsRes.Cluster, opsRequestSlice)
}

// isOpsRequestFailedPhase checks whether the OpsRequest phase is Failed.
func isOpsRequestFailedPhase(opsRequestPhase appsv1alpha1.OpsPhase) bool {
	return opsRequestPhase == appsv1alpha1.OpsFailedPhase
}

func updateReconfigureStatusByCM(reconfiguringStatus *appsv1alpha1.ReconfiguringStatus, tplName string,
	handleReconfigureStatus handleReconfigureOpsStatus) error {
	for i, cmStatus := range reconfiguringStatus.ConfigurationStatus {
		if cmStatus.Name == tplName {
			// update cmStatus
			return handleReconfigureStatus(&reconfiguringStatus.ConfigurationStatus[i])
		}
	}
	cmCount := len(reconfiguringStatus.ConfigurationStatus)
	reconfiguringStatus.ConfigurationStatus = append(reconfiguringStatus.ConfigurationStatus, appsv1alpha1.ConfigurationItemStatus{
		Name:          tplName,
		Status:        appsv1alpha1.ReasonReconfigurePersisting,
		SucceedCount:  core.NotStarted,
		ExpectedCount: core.Unconfirmed,
	})
	cmStatus := &reconfiguringStatus.ConfigurationStatus[cmCount]
	return handleReconfigureStatus(cmStatus)
}
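// exampleBumpReconfigureProgress is an illustrative sketch (hypothetical helper, not used by
// the operator) of a handleReconfigureOpsStatus callback as consumed by
// updateReconfigureStatusByCM via patchReconfigureOpsStatus below: it updates the
// per-template status counters in place. The succeed/expected values are placeholders
// supplied by the caller.
func exampleBumpReconfigureProgress(opsRes *OpsResource, tplName string, succeed, expected int32) error {
	return patchReconfigureOpsStatus(opsRes, tplName, func(cmStatus *appsv1alpha1.ConfigurationItemStatus) error {
		cmStatus.SucceedCount = succeed
		cmStatus.ExpectedCount = expected
		return nil
	})
}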
// patchReconfigureOpsStatus updates OpsRequest.Status.ConfigurationStatus while a Reconfigure operation is running.
//
// NOTES:
// opsStatus describes the status of the OpsRequest.
// reconfiguringStatus describes the status of the reconfiguring operation, which contains multiple configuration templates.
// cmStatus describes the status of a configmap; it is uniquely associated with a configuration template, which contains multiple keys, and each key is the name of a configuration file.
// execStatus describes the result of the execution of the state machine, which is designed to decide how to conduct the reconfiguring operation, such as whether to restart or how to send a signal to the process.
func patchReconfigureOpsStatus(
	opsRes *OpsResource,
	tplName string,
	handleReconfigureStatus handleReconfigureOpsStatus) error {
	var opsRequest = opsRes.OpsRequest
	if opsRequest.Status.ReconfiguringStatus == nil {
		opsRequest.Status.ReconfiguringStatus = &appsv1alpha1.ReconfiguringStatus{
			ConfigurationStatus: make([]appsv1alpha1.ConfigurationItemStatus, 0),
		}
	}

	reconfiguringStatus := opsRequest.Status.ReconfiguringStatus
	return updateReconfigureStatusByCM(reconfiguringStatus, tplName, handleReconfigureStatus)
}

// getSlowestReconfiguringProgress gets the progress of the slowest reconfiguring operation.
func getSlowestReconfiguringProgress(status []appsv1alpha1.ConfigurationItemStatus) string {
	slowest := appsv1alpha1.ConfigurationItemStatus{
		SucceedCount:  math.MaxInt32,
		ExpectedCount: -1,
	}

	for _, st := range status {
		if st.SucceedCount < slowest.SucceedCount {
			slowest = st
		}
	}
	return fmt.Sprintf("%d/%d", slowest.SucceedCount, slowest.ExpectedCount)
}

func getTargetResourcesOfLastComponent(lastConfiguration appsv1alpha1.LastConfiguration, compName string, resourceKey appsv1alpha1.ComponentResourceKey) []string {
	lastComponentConfigs := lastConfiguration.Components[compName]
	return lastComponentConfigs.TargetResources[resourceKey]
}

// cancelComponentOps is the common function to cancel an opsRequest that updates the component attributes.
func cancelComponentOps(ctx context.Context,
	cli client.Client,
	opsRes *OpsResource,
	updateComp func(lastConfig *appsv1alpha1.LastComponentConfiguration, comp *appsv1alpha1.ClusterComponentSpec) error) error {
	opsRequest := opsRes.OpsRequest
	lastCompInfos := opsRequest.Status.LastConfiguration.Components
	if lastCompInfos == nil {
		return nil
	}
	for index, comp := range opsRes.Cluster.Spec.ComponentSpecs {
		lastConfig, ok := lastCompInfos[comp.Name]
		if !ok {
			continue
		}
		if err := updateComp(&lastConfig, &comp); err != nil {
			return err
		}
		opsRes.Cluster.Spec.ComponentSpecs[index] = comp
		lastCompInfos[comp.Name] = lastConfig
	}
	return cli.Update(ctx, opsRes.Cluster)
}
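// exampleCancelHorizontalScaling is an illustrative sketch (hypothetical helper, not used by
// the operator) of an updateComp callback passed to cancelComponentOps: it restores the
// replica count recorded in status.lastConfiguration, roughly what a horizontal-scaling
// cancel would do. It assumes LastComponentConfiguration.Replicas is a *int32 snapshot of
// the pre-ops replica count.
func exampleCancelHorizontalScaling(ctx context.Context, cli client.Client, opsRes *OpsResource) error {
	return cancelComponentOps(ctx, cli, opsRes, func(lastConfig *appsv1alpha1.LastComponentConfiguration,
		comp *appsv1alpha1.ClusterComponentSpec) error {
		if lastConfig.Replicas != nil {
			comp.Replicas = *lastConfig.Replicas
		}
		return nil
	})
}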
// validateOpsWaitingPhase validates whether the current cluster phase is expected, and whether the waiting time exceeds the limit.
// Only requests in the `Pending` phase are validated.
func validateOpsWaitingPhase(cluster *appsv1alpha1.Cluster, ops *appsv1alpha1.OpsRequest, opsBehaviour OpsBehaviour) error {
	if len(opsBehaviour.FromClusterPhases) == 0 || ops.Status.Phase != appsv1alpha1.OpsPendingPhase {
		return nil
	}
	// check if the opsRequest can be executed in the current cluster phase unless this opsRequest is reentrant.
	if !slices.Contains(opsBehaviour.FromClusterPhases, cluster.Status.Phase) {
		// the entry condition is not met: the cluster is not in one of the expected phases,
		// so wait for it for up to TTLSecondsBeforeAbort seconds.
		// a nil TTLSecondsBeforeAbort (or an elapsed TTL) means we do not wait for the cluster to reach the expected phase.
		if ops.Spec.TTLSecondsBeforeAbort == nil || (time.Now().After(ops.GetCreationTimestamp().Add(time.Duration(*ops.Spec.TTLSecondsBeforeAbort) * time.Second))) {
			return fmt.Errorf("OpsRequest.spec.type=%s is forbidden when Cluster.status.phase=%s", ops.Spec.Type, cluster.Status.Phase)
		}

		return &WaitForClusterPhaseErr{
			clusterName:   cluster.Name,
			currentPhase:  cluster.Status.Phase,
			expectedPhase: opsBehaviour.FromClusterPhases,
		}
	}
	return nil
}
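// exampleHandleWaitingPhaseErr is an illustrative sketch (hypothetical helper, not used by
// the operator) of how a caller might distinguish the "keep waiting" case from a real
// validation failure when using validateOpsWaitingPhase: a WaitForClusterPhaseErr means the
// cluster has not reached one of opsBehaviour.FromClusterPhases yet and the TTL has not
// expired, so the caller requeues instead of failing the OpsRequest. The one-second requeue
// interval is an arbitrary value for illustration.
func exampleHandleWaitingPhaseErr(cluster *appsv1alpha1.Cluster, ops *appsv1alpha1.OpsRequest, opsBehaviour OpsBehaviour) (time.Duration, error) {
	err := validateOpsWaitingPhase(cluster, ops, opsBehaviour)
	if err == nil {
		return 0, nil
	}
	if _, ok := err.(*WaitForClusterPhaseErr); ok {
		// keep waiting: requeue and re-check the cluster phase later.
		return time.Second, nil
	}
	// any other error is a real validation failure.
	return 0, err
}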