github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/controllers/apps/operations/switchover_util.go (about) 1 /* 2 Copyright (C) 2022-2023 ApeCloud Co., Ltd 3 4 This file is part of KubeBlocks project 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU Affero General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU Affero General Public License for more details. 15 16 You should have received a copy of the GNU Affero General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 package operations 21 22 import ( 23 "context" 24 "encoding/json" 25 "fmt" 26 "strings" 27 28 "github.com/pkg/errors" 29 "golang.org/x/exp/slices" 30 batchv1 "k8s.io/api/batch/v1" 31 corev1 "k8s.io/api/core/v1" 32 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 33 "k8s.io/apimachinery/pkg/types" 34 "sigs.k8s.io/controller-runtime/pkg/client" 35 36 appsv1alpha1 "github.com/1aal/kubeblocks/apis/apps/v1alpha1" 37 "github.com/1aal/kubeblocks/controllers/apps/components" 38 "github.com/1aal/kubeblocks/pkg/common" 39 "github.com/1aal/kubeblocks/pkg/constant" 40 intctrlcomputil "github.com/1aal/kubeblocks/pkg/controller/component" 41 intctrlutil "github.com/1aal/kubeblocks/pkg/controllerutil" 42 ) 43 44 const ( 45 SwitchoverCheckJobKey = "CheckJob" 46 SwitchoverCheckRoleLabelKey = "CheckRoleLabel" 47 48 OpsReasonForSkipSwitchover = "SkipSwitchover" 49 ) 50 51 // needDoSwitchover checks whether we need to perform a switchover. 52 func needDoSwitchover(ctx context.Context, 53 cli client.Client, 54 cluster *appsv1alpha1.Cluster, 55 componentSpec *appsv1alpha1.ClusterComponentSpec, 56 switchover *appsv1alpha1.Switchover) (bool, error) { 57 // get the Pod object whose current role label is primary 58 pod, err := getPrimaryOrLeaderPod(ctx, cli, *cluster, componentSpec.Name, componentSpec.ComponentDefRef) 59 if err != nil { 60 return false, err 61 } 62 if pod == nil { 63 return false, nil 64 } 65 switch switchover.InstanceName { 66 case constant.KBSwitchoverCandidateInstanceForAnyPod: 67 return true, nil 68 default: 69 podList, err := components.GetComponentPodList(ctx, cli, *cluster, componentSpec.Name) 70 if err != nil { 71 return false, err 72 } 73 podParent, _ := common.ParseParentNameAndOrdinal(pod.Name) 74 siParent, o := common.ParseParentNameAndOrdinal(switchover.InstanceName) 75 if podParent != siParent || o < 0 || o >= int32(len(podList.Items)) { 76 return false, errors.New("switchover.InstanceName is invalid") 77 } 78 // If the current instance is already the primary, then no switchover will be performed. 79 if pod.Name == switchover.InstanceName { 80 return false, nil 81 } 82 } 83 return true, nil 84 } 85 86 // createSwitchoverJob creates a switchover job to do switchover. 87 func createSwitchoverJob(reqCtx intctrlutil.RequestCtx, 88 cli client.Client, 89 cluster *appsv1alpha1.Cluster, 90 componentSpec *appsv1alpha1.ClusterComponentSpec, 91 componentDef *appsv1alpha1.ClusterComponentDefinition, 92 switchover *appsv1alpha1.Switchover) error { 93 switchoverJob, err := renderSwitchoverCmdJob(reqCtx.Ctx, cli, cluster, componentSpec, componentDef, switchover) 94 if err != nil { 95 return err 96 } 97 // check the current generation switchoverJob whether exist 98 key := types.NamespacedName{Namespace: cluster.Namespace, Name: switchoverJob.Name} 99 exists, _ := intctrlutil.CheckResourceExists(reqCtx.Ctx, cli, key, &batchv1.Job{}) 100 if !exists { 101 // check the previous generation switchoverJob whether exist 102 ml := getSwitchoverCmdJobLabel(cluster.Name, componentSpec.Name) 103 previousJobs, err := getJobWithLabels(reqCtx.Ctx, cli, cluster, ml) 104 if err != nil { 105 return err 106 } 107 if len(previousJobs) > 0 { 108 // delete the previous generation switchoverJob 109 reqCtx.Log.V(1).Info("delete previous generation switchoverJob", "job", previousJobs[0].Name) 110 if err := cleanJobWithLabels(reqCtx.Ctx, cli, cluster, ml); err != nil { 111 return err 112 } 113 } 114 // create the current generation switchoverJob 115 if err := cli.Create(reqCtx.Ctx, switchoverJob); err != nil { 116 return err 117 } 118 return nil 119 } 120 return nil 121 } 122 123 // checkPodRoleLabelConsistency checks whether the pod role label is consistent with the specified role label after switchover. 124 func checkPodRoleLabelConsistency(ctx context.Context, 125 cli client.Client, 126 cluster *appsv1alpha1.Cluster, 127 componentSpec *appsv1alpha1.ClusterComponentSpec, 128 componentDef *appsv1alpha1.ClusterComponentDefinition, 129 switchover *appsv1alpha1.Switchover, 130 switchoverCondition *metav1.Condition) (bool, error) { 131 if switchover == nil || switchoverCondition == nil { 132 return false, nil 133 } 134 // get the Pod object whose current role label is primary 135 pod, err := getPrimaryOrLeaderPod(ctx, cli, *cluster, componentSpec.Name, componentDef.Name) 136 if err != nil { 137 return false, err 138 } 139 if pod == nil { 140 return false, nil 141 } 142 var switchoverMessageMap map[string]SwitchoverMessage 143 if err := json.Unmarshal([]byte(switchoverCondition.Message), &switchoverMessageMap); err != nil { 144 return false, err 145 } 146 147 for _, switchoverMessage := range switchoverMessageMap { 148 if switchoverMessage.ComponentName != componentSpec.Name { 149 continue 150 } 151 switch switchoverMessage.Switchover.InstanceName { 152 case constant.KBSwitchoverCandidateInstanceForAnyPod: 153 if pod.Name != switchoverMessage.OldPrimary { 154 return true, nil 155 } 156 default: 157 if pod.Name == switchoverMessage.Switchover.InstanceName { 158 return true, nil 159 } 160 } 161 } 162 return false, nil 163 } 164 165 // renderSwitchoverCmdJob renders and creates the switchover command jobs. 166 func renderSwitchoverCmdJob(ctx context.Context, 167 cli client.Client, 168 cluster *appsv1alpha1.Cluster, 169 componentSpec *appsv1alpha1.ClusterComponentSpec, 170 componentDef *appsv1alpha1.ClusterComponentDefinition, 171 switchover *appsv1alpha1.Switchover) (*batchv1.Job, error) { 172 if componentDef.SwitchoverSpec == nil || switchover == nil { 173 return nil, errors.New("switchover spec not found") 174 } 175 pod, err := getPrimaryOrLeaderPod(ctx, cli, *cluster, componentSpec.Name, componentDef.Name) 176 if err != nil { 177 return nil, err 178 } 179 if pod == nil { 180 return nil, errors.New("primary pod not found") 181 } 182 183 renderJobPodVolumes := func(scriptSpecSelectors []appsv1alpha1.ScriptSpecSelector) ([]corev1.Volume, []corev1.VolumeMount) { 184 volumes := make([]corev1.Volume, 0) 185 volumeMounts := make([]corev1.VolumeMount, 0) 186 187 // find current pod's volume which mapped to configMapRefs 188 findVolumes := func(tplSpec appsv1alpha1.ComponentTemplateSpec, scriptSpecSelector appsv1alpha1.ScriptSpecSelector) { 189 if tplSpec.Name != scriptSpecSelector.Name { 190 return 191 } 192 for _, podVolume := range pod.Spec.Volumes { 193 if podVolume.Name == tplSpec.VolumeName { 194 volumes = append(volumes, podVolume) 195 break 196 } 197 } 198 } 199 200 // filter out the corresponding script configMap volumes from the volumes of the current leader pod based on the scriptSpecSelectors defined by the user. 201 for _, scriptSpecSelector := range scriptSpecSelectors { 202 for _, scriptSpec := range componentDef.ScriptSpecs { 203 findVolumes(scriptSpec, scriptSpecSelector) 204 } 205 } 206 207 // find current pod's volumeMounts which mapped to volumes 208 for _, volume := range volumes { 209 for _, volumeMount := range pod.Spec.Containers[0].VolumeMounts { 210 if volumeMount.Name == volume.Name { 211 volumeMounts = append(volumeMounts, volumeMount) 212 break 213 } 214 } 215 } 216 217 return volumes, volumeMounts 218 } 219 220 renderJob := func(switchoverSpec *appsv1alpha1.SwitchoverSpec, switchoverEnvs []corev1.EnvVar) (*batchv1.Job, error) { 221 var ( 222 cmdExecutorConfig *appsv1alpha1.CmdExecutorConfig 223 scriptSpecSelectors []appsv1alpha1.ScriptSpecSelector 224 ) 225 switch switchover.InstanceName { 226 case constant.KBSwitchoverCandidateInstanceForAnyPod: 227 if switchoverSpec.WithoutCandidate != nil { 228 cmdExecutorConfig = switchoverSpec.WithoutCandidate.CmdExecutorConfig 229 scriptSpecSelectors = switchoverSpec.WithoutCandidate.ScriptSpecSelectors 230 } 231 default: 232 if switchoverSpec.WithCandidate != nil { 233 cmdExecutorConfig = switchoverSpec.WithCandidate.CmdExecutorConfig 234 scriptSpecSelectors = switchoverSpec.WithCandidate.ScriptSpecSelectors 235 } 236 } 237 if cmdExecutorConfig == nil { 238 return nil, errors.New("switchover action not found") 239 } 240 volumes, volumeMounts := renderJobPodVolumes(scriptSpecSelectors) 241 242 // jobName named with generation to distinguish different switchover jobs. 243 jobName := genSwitchoverJobName(cluster.Name, componentSpec.Name, cluster.Generation) 244 job := &batchv1.Job{ 245 ObjectMeta: metav1.ObjectMeta{ 246 Namespace: cluster.Namespace, 247 Name: jobName, 248 Labels: getSwitchoverCmdJobLabel(cluster.Name, componentSpec.Name), 249 }, 250 Spec: batchv1.JobSpec{ 251 Template: corev1.PodTemplateSpec{ 252 ObjectMeta: metav1.ObjectMeta{ 253 Namespace: cluster.Namespace, 254 Name: jobName, 255 }, 256 Spec: corev1.PodSpec{ 257 Volumes: volumes, 258 RestartPolicy: corev1.RestartPolicyNever, 259 Containers: []corev1.Container{ 260 { 261 Name: constant.KBSwitchoverJobContainerName, 262 Image: cmdExecutorConfig.Image, 263 ImagePullPolicy: corev1.PullIfNotPresent, 264 Command: cmdExecutorConfig.Command, 265 Args: cmdExecutorConfig.Args, 266 Env: switchoverEnvs, 267 VolumeMounts: volumeMounts, 268 }, 269 }, 270 }, 271 }, 272 }, 273 } 274 if len(cluster.Spec.Tolerations) > 0 { 275 job.Spec.Template.Spec.Tolerations = cluster.Spec.Tolerations 276 } 277 return job, nil 278 } 279 280 switchoverEnvs, err := buildSwitchoverEnvs(ctx, cli, cluster, componentSpec, componentDef, switchover) 281 if err != nil { 282 return nil, err 283 } 284 job, err := renderJob(componentDef.SwitchoverSpec, switchoverEnvs) 285 if err != nil { 286 return nil, err 287 } 288 return job, nil 289 } 290 291 // genSwitchoverJobName generates the switchover job name. 292 func genSwitchoverJobName(clusterName, componentName string, generation int64) string { 293 return fmt.Sprintf("%s-%s-%s-%d", constant.KBSwitchoverJobNamePrefix, clusterName, componentName, generation) 294 } 295 296 // getSupportSwitchoverWorkload returns the kinds that support switchover. 297 func getSupportSwitchoverWorkload() []appsv1alpha1.WorkloadType { 298 return []appsv1alpha1.WorkloadType{ 299 appsv1alpha1.Replication, 300 appsv1alpha1.Consensus, 301 } 302 } 303 304 // getSwitchoverCmdJobLabel gets the labels for job that execute the switchover commands. 305 func getSwitchoverCmdJobLabel(clusterName, componentName string) map[string]string { 306 return map[string]string{ 307 constant.AppInstanceLabelKey: clusterName, 308 constant.KBAppComponentLabelKey: componentName, 309 constant.AppManagedByLabelKey: constant.AppName, 310 constant.KBSwitchoverJobLabelKey: constant.KBSwitchoverJobLabelValue, 311 } 312 } 313 314 // buildSwitchoverCandidateEnv builds the candidate instance name environment variable for the switchover job. 315 func buildSwitchoverCandidateEnv( 316 cluster *appsv1alpha1.Cluster, 317 componentSpec *appsv1alpha1.ClusterComponentSpec, 318 switchover *appsv1alpha1.Switchover) []corev1.EnvVar { 319 svcName := strings.Join([]string{cluster.Name, componentSpec.Name, "headless"}, "-") 320 if switchover == nil { 321 return nil 322 } 323 if switchover.InstanceName == constant.KBSwitchoverCandidateInstanceForAnyPod { 324 return nil 325 } 326 return []corev1.EnvVar{ 327 { 328 Name: constant.KBSwitchoverCandidateName, 329 Value: switchover.InstanceName, 330 }, 331 { 332 Name: constant.KBSwitchoverCandidateFqdn, 333 Value: fmt.Sprintf("%s.%s", switchover.InstanceName, svcName), 334 }, 335 } 336 } 337 338 // buildSwitchoverEnvs builds the environment variables for the switchover job. 339 func buildSwitchoverEnvs(ctx context.Context, 340 cli client.Client, 341 cluster *appsv1alpha1.Cluster, 342 componentSpec *appsv1alpha1.ClusterComponentSpec, 343 componentDef *appsv1alpha1.ClusterComponentDefinition, 344 switchover *appsv1alpha1.Switchover) ([]corev1.EnvVar, error) { 345 if componentSpec == nil || switchover == nil || componentDef.SwitchoverSpec == nil { 346 return nil, errors.New("switchover spec not found") 347 } 348 // replace secret env and merge envs defined in SwitchoverSpec 349 replaceSwitchoverConnCredentialEnv(cluster.Name, componentDef.SwitchoverSpec) 350 var switchoverEnvs []corev1.EnvVar 351 switch switchover.InstanceName { 352 case constant.KBSwitchoverCandidateInstanceForAnyPod: 353 if componentDef.SwitchoverSpec.WithoutCandidate != nil { 354 switchoverEnvs = append(switchoverEnvs, componentDef.SwitchoverSpec.WithoutCandidate.CmdExecutorConfig.Env...) 355 } 356 default: 357 if componentDef.SwitchoverSpec.WithCandidate != nil { 358 switchoverEnvs = append(switchoverEnvs, componentDef.SwitchoverSpec.WithCandidate.CmdExecutorConfig.Env...) 359 } 360 } 361 362 // inject the old primary info into the environment variable 363 workloadEnvs, err := buildSwitchoverWorkloadEnvs(ctx, cli, cluster, componentSpec, componentDef) 364 if err != nil { 365 return nil, err 366 } 367 switchoverEnvs = append(switchoverEnvs, workloadEnvs...) 368 369 // inject the candidate instance name into the environment variable if specify the candidate instance 370 switchoverCandidateEnvs := buildSwitchoverCandidateEnv(cluster, componentSpec, switchover) 371 switchoverEnvs = append(switchoverEnvs, switchoverCandidateEnvs...) 372 return switchoverEnvs, nil 373 } 374 375 // replaceSwitchoverConnCredentialEnv replaces the connection credential environment variables for the switchover job. 376 func replaceSwitchoverConnCredentialEnv(clusterName string, switchoverSpec *appsv1alpha1.SwitchoverSpec) { 377 if switchoverSpec == nil { 378 return 379 } 380 namedValuesMap := intctrlcomputil.GetEnvReplacementMapForConnCredential(clusterName) 381 replaceEnvVars := func(cmdExecutorConfig *appsv1alpha1.CmdExecutorConfig) { 382 if cmdExecutorConfig != nil { 383 cmdExecutorConfig.Env = intctrlcomputil.ReplaceSecretEnvVars(namedValuesMap, cmdExecutorConfig.Env) 384 } 385 } 386 replaceEnvVars(switchoverSpec.WithCandidate.CmdExecutorConfig) 387 replaceEnvVars(switchoverSpec.WithoutCandidate.CmdExecutorConfig) 388 } 389 390 // buildSwitchoverWorkloadEnvs builds the replication or consensus workload environment variables for the switchover job. 391 func buildSwitchoverWorkloadEnvs(ctx context.Context, 392 cli client.Client, 393 cluster *appsv1alpha1.Cluster, 394 componentSpec *appsv1alpha1.ClusterComponentSpec, 395 componentDef *appsv1alpha1.ClusterComponentDefinition) ([]corev1.EnvVar, error) { 396 var workloadEnvs []corev1.EnvVar 397 pod, err := getPrimaryOrLeaderPod(ctx, cli, *cluster, componentSpec.Name, componentDef.Name) 398 if err != nil { 399 return nil, err 400 } 401 if pod == nil { 402 return nil, errors.New("primary pod not found") 403 } 404 svcName := strings.Join([]string{cluster.Name, componentSpec.Name, "headless"}, "-") 405 switch componentDef.WorkloadType { 406 case appsv1alpha1.Replication: 407 rsEnvs := []corev1.EnvVar{ 408 { 409 Name: constant.KBSwitchoverReplicationPrimaryPodIP, 410 Value: pod.Status.PodIP, 411 }, 412 { 413 Name: constant.KBSwitchoverReplicationPrimaryPodName, 414 Value: pod.Name, 415 }, 416 { 417 Name: constant.KBSwitchoverReplicationPrimaryPodFqdn, 418 Value: fmt.Sprintf("%s.%s", pod.Name, svcName), 419 }, 420 } 421 workloadEnvs = append(workloadEnvs, rsEnvs...) 422 case appsv1alpha1.Consensus: 423 csEnvs := []corev1.EnvVar{ 424 { 425 Name: constant.KBSwitchoverConsensusLeaderPodIP, 426 Value: pod.Status.PodIP, 427 }, 428 { 429 Name: constant.KBSwitchoverConsensusLeaderPodName, 430 Value: pod.Name, 431 }, 432 { 433 Name: constant.KBSwitchoverConsensusLeaderPodFqdn, 434 Value: fmt.Sprintf("%s.%s", pod.Name, svcName), 435 }, 436 } 437 workloadEnvs = append(workloadEnvs, csEnvs...) 438 } 439 // add tht first container's environment variables of the primary pod 440 workloadEnvs = append(workloadEnvs, pod.Spec.Containers[0].Env...) 441 return workloadEnvs, nil 442 } 443 444 // getJobWithLabels gets the job list with the specified labels. 445 func getJobWithLabels(ctx context.Context, 446 cli client.Client, 447 cluster *appsv1alpha1.Cluster, 448 matchLabels client.MatchingLabels) ([]batchv1.Job, error) { 449 jobList := &batchv1.JobList{} 450 if err := cli.List(ctx, jobList, client.InNamespace(cluster.Namespace), matchLabels); err != nil { 451 return nil, err 452 } 453 return jobList.Items, nil 454 } 455 456 // cleanJobWithLabels cleans up the job tasks with label that execute the switchover commands. 457 func cleanJobWithLabels(ctx context.Context, 458 cli client.Client, 459 cluster *appsv1alpha1.Cluster, 460 matchLabels client.MatchingLabels) error { 461 jobList, err := getJobWithLabels(ctx, cli, cluster, matchLabels) 462 if err != nil { 463 return err 464 } 465 for _, job := range jobList { 466 var ttl = int32(constant.KBJobTTLSecondsAfterFinished) 467 patch := client.MergeFrom(job.DeepCopy()) 468 job.Spec.TTLSecondsAfterFinished = &ttl 469 if err := cli.Patch(ctx, &job, patch); err != nil { 470 return err 471 } 472 } 473 return nil 474 } 475 476 // cleanJobByName cleans up the job task by name that execute the switchover commands. 477 func cleanJobByName(ctx context.Context, 478 cli client.Client, 479 cluster *appsv1alpha1.Cluster, 480 jobName string) error { 481 job := &batchv1.Job{} 482 key := types.NamespacedName{Namespace: cluster.Namespace, Name: jobName} 483 if err := cli.Get(ctx, key, job); err != nil { 484 return err 485 } 486 var ttl = int32(constant.KBJobTTLSecondsAfterFinished) 487 patch := client.MergeFrom(job.DeepCopy()) 488 job.Spec.TTLSecondsAfterFinished = &ttl 489 if err := cli.Patch(ctx, job, patch); err != nil { 490 return err 491 } 492 return nil 493 } 494 495 // checkJobSucceed checks the result of job execution. 496 // Returns: 497 // - bool: whether job exist, true exist 498 // - error: any error that occurred during the handling 499 func checkJobSucceed(ctx context.Context, 500 cli client.Client, 501 cluster *appsv1alpha1.Cluster, 502 jobName string) error { 503 key := types.NamespacedName{Namespace: cluster.Namespace, Name: jobName} 504 currentJob := batchv1.Job{} 505 exists, err := intctrlutil.CheckResourceExists(ctx, cli, key, ¤tJob) 506 if err != nil { 507 return err 508 } 509 if !exists { 510 return errors.New("job not exist, pls check.") 511 } 512 jobStatusConditions := currentJob.Status.Conditions 513 if len(jobStatusConditions) > 0 { 514 switch jobStatusConditions[0].Type { 515 case batchv1.JobComplete: 516 return nil 517 case batchv1.JobFailed: 518 return errors.New("job failed, pls check.") 519 default: 520 return intctrlutil.NewErrorf(intctrlutil.ErrorWaitCacheRefresh, "requeue to waiting for job %s finished.", key.Name) 521 } 522 } else { 523 return errors.New("job check conditions status failed") 524 } 525 } 526 527 // getPrimaryOrLeaderPod returns the leader or primary pod of the component. 528 func getPrimaryOrLeaderPod(ctx context.Context, cli client.Client, cluster appsv1alpha1.Cluster, compSpecName, compDefName string) (*corev1.Pod, error) { 529 var ( 530 err error 531 podList *corev1.PodList 532 ) 533 compDef, err := appsv1alpha1.GetComponentDefByCluster(ctx, cli, cluster, compDefName) 534 if err != nil { 535 return nil, err 536 } 537 if !slices.Contains(getSupportSwitchoverWorkload(), compDef.WorkloadType) { 538 return nil, errors.New("component does not support switchover") 539 } 540 switch compDef.WorkloadType { 541 case appsv1alpha1.Replication: 542 podList, err = components.GetComponentPodListWithRole(ctx, cli, cluster, compSpecName, constant.Primary) 543 case appsv1alpha1.Consensus: 544 podList, err = components.GetComponentPodListWithRole(ctx, cli, cluster, compSpecName, compDef.ConsensusSpec.Leader.Name) 545 } 546 if err != nil { 547 return nil, err 548 } 549 if len(podList.Items) != 1 { 550 return nil, errors.New("component pod list is empty or has more than one pod") 551 } 552 return &podList.Items[0], nil 553 }