github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/controllers/apps/operations/switchover.go (about)

     1  /*
     2  Copyright (C) 2022-2023 ApeCloud Co., Ltd
     3  
     4  This file is part of KubeBlocks project
     5  
     6  This program is free software: you can redistribute it and/or modify
     7  it under the terms of the GNU Affero General Public License as published by
     8  the Free Software Foundation, either version 3 of the License, or
     9  (at your option) any later version.
    10  
    11  This program is distributed in the hope that it will be useful
    12  but WITHOUT ANY WARRANTY; without even the implied warranty of
    13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14  GNU Affero General Public License for more details.
    15  
    16  You should have received a copy of the GNU Affero General Public License
    17  along with this program.  If not, see <http://www.gnu.org/licenses/>.
    18  */
    19  
    20  package operations
    21  
    22  import (
    23  	"encoding/json"
    24  	"fmt"
    25  	"reflect"
    26  	"time"
    27  
    28  	"github.com/pkg/errors"
    29  	"k8s.io/apimachinery/pkg/api/meta"
    30  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    31  	"k8s.io/client-go/tools/record"
    32  	"sigs.k8s.io/controller-runtime/pkg/client"
    33  
    34  	appsv1alpha1 "github.com/1aal/kubeblocks/apis/apps/v1alpha1"
    35  	intctrlutil "github.com/1aal/kubeblocks/pkg/controllerutil"
    36  )
    37  
// switchoverOpsHandler is the OpsHandler implementation that this package
// registers for switchover OpsRequests (see init).
type switchoverOpsHandler struct{}

// Compile-time check that switchoverOpsHandler satisfies OpsHandler.
var _ OpsHandler = switchoverOpsHandler{}
    41  
// SwitchoverMessage is the OpsRequest.Status.Condition.Message for switchover.
// ActionStartedCondition builds one value per component and JSON-encodes the
// resulting map (keyed by component name) into the condition message.
type SwitchoverMessage struct {
	// Switchover is the switchover entry copied from the OpsRequest spec.
	appsv1alpha1.Switchover
	// OldPrimary is the name of the pod returned by getPrimaryOrLeaderPod at
	// the time the condition was built.
	OldPrimary string
	// Cluster is the name of the cluster the OpsRequest targets.
	Cluster    string
}
    48  
    49  func init() {
    50  	switchoverBehaviour := OpsBehaviour{
    51  		FromClusterPhases:                  appsv1alpha1.GetClusterUpRunningPhases(),
    52  		ToClusterPhase:                     appsv1alpha1.UpdatingClusterPhase,
    53  		OpsHandler:                         switchoverOpsHandler{},
    54  		ProcessingReasonInClusterCondition: ProcessingReasonSwitchovering,
    55  	}
    56  
    57  	opsMgr := GetOpsManager()
    58  	opsMgr.RegisterOps(appsv1alpha1.SwitchoverType, switchoverBehaviour)
    59  }
    60  
    61  // ActionStartedCondition the started condition when handle the switchover request.
    62  func (r switchoverOpsHandler) ActionStartedCondition(reqCtx intctrlutil.RequestCtx, cli client.Client, opsRes *OpsResource) (*metav1.Condition, error) {
    63  	switchoverMessageMap := make(map[string]SwitchoverMessage)
    64  	for _, switchover := range opsRes.OpsRequest.Spec.SwitchoverList {
    65  		pod, err := getPrimaryOrLeaderPod(reqCtx.Ctx, cli, *opsRes.Cluster, switchover.ComponentName, opsRes.Cluster.Spec.GetComponentDefRefName(switchover.ComponentName))
    66  		if err != nil {
    67  			return nil, err
    68  		}
    69  		switchoverMessageMap[switchover.ComponentName] = SwitchoverMessage{
    70  			Switchover: switchover,
    71  			OldPrimary: pod.Name,
    72  			Cluster:    opsRes.Cluster.Name,
    73  		}
    74  	}
    75  	msg, err := json.Marshal(switchoverMessageMap)
    76  	if err != nil {
    77  		return nil, err
    78  	}
    79  	return appsv1alpha1.NewSwitchoveringCondition(opsRes.Cluster.Generation, string(msg)), nil
    80  }
    81  
    82  // Action to do the switchover operation.
    83  func (r switchoverOpsHandler) Action(reqCtx intctrlutil.RequestCtx, cli client.Client, opsRes *OpsResource) error {
    84  	return doSwitchoverComponents(reqCtx, cli, opsRes, opsRes.OpsRequest.Spec.SwitchoverList)
    85  }
    86  
    87  // ReconcileAction will be performed when action is done and loops till OpsRequest.status.phase is Succeed/Failed.
    88  // the Reconcile function for switchover opsRequest.
    89  func (r switchoverOpsHandler) ReconcileAction(reqCtx intctrlutil.RequestCtx, cli client.Client, opsRes *OpsResource) (appsv1alpha1.OpsPhase, time.Duration, error) {
    90  	var (
    91  		opsRequestPhase = appsv1alpha1.OpsRunningPhase
    92  	)
    93  
    94  	expectCount, actualCount, err := handleSwitchoverProgress(reqCtx, cli, opsRes)
    95  	if err != nil {
    96  		return "", 0, err
    97  	}
    98  
    99  	if expectCount == actualCount {
   100  		opsRequestPhase = appsv1alpha1.OpsSucceedPhase
   101  	}
   102  
   103  	return opsRequestPhase, time.Second, nil
   104  }
   105  
// SaveLastConfiguration is an empty implementation for switchover: it records
// nothing from Cluster.spec and always returns nil.
func (r switchoverOpsHandler) SaveLastConfiguration(reqCtx intctrlutil.RequestCtx, cli client.Client, opsRes *OpsResource) error {
	return nil
}
   111  
   112  // doSwitchoverComponents creates the switchover job for each component.
   113  func doSwitchoverComponents(reqCtx intctrlutil.RequestCtx, cli client.Client, opsRes *OpsResource, switchoverList []appsv1alpha1.Switchover) error {
   114  	var (
   115  		opsRequest          = opsRes.OpsRequest
   116  		oldOpsRequestStatus = opsRequest.Status.DeepCopy()
   117  	)
   118  	patch := client.MergeFrom(opsRequest.DeepCopy())
   119  	if opsRequest.Status.Components == nil {
   120  		opsRequest.Status.Components = make(map[string]appsv1alpha1.OpsRequestComponentStatus)
   121  	}
   122  	for _, switchover := range switchoverList {
   123  		compDef, err := appsv1alpha1.GetComponentDefByCluster(reqCtx.Ctx, cli, *opsRes.Cluster, switchover.ComponentName)
   124  		if err != nil {
   125  			return err
   126  		}
   127  		needSwitchover, err := needDoSwitchover(reqCtx.Ctx, cli, opsRes.Cluster, opsRes.Cluster.Spec.GetComponentByName(switchover.ComponentName), &switchover)
   128  		if err != nil {
   129  			return err
   130  		}
   131  		if !needSwitchover {
   132  			opsRequest.Status.Components[switchover.ComponentName] = appsv1alpha1.OpsRequestComponentStatus{
   133  				Phase:           appsv1alpha1.RunningClusterCompPhase,
   134  				Reason:          OpsReasonForSkipSwitchover,
   135  				Message:         fmt.Sprintf("This component %s is already in the expected state, skip the switchover operation", switchover.ComponentName),
   136  				ProgressDetails: []appsv1alpha1.ProgressStatusDetail{},
   137  			}
   138  			continue
   139  		} else {
   140  			opsRequest.Status.Components[switchover.ComponentName] = appsv1alpha1.OpsRequestComponentStatus{
   141  				Phase:           appsv1alpha1.UpdatingClusterCompPhase,
   142  				ProgressDetails: []appsv1alpha1.ProgressStatusDetail{},
   143  			}
   144  		}
   145  		if err := createSwitchoverJob(reqCtx, cli, opsRes.Cluster, opsRes.Cluster.Spec.GetComponentByName(switchover.ComponentName), compDef, &switchover); err != nil {
   146  			return err
   147  		}
   148  	}
   149  	if !reflect.DeepEqual(*oldOpsRequestStatus, opsRequest.Status) {
   150  		if err := cli.Status().Patch(reqCtx.Ctx, opsRequest, patch); err != nil {
   151  			return err
   152  		}
   153  	}
   154  	return nil
   155  }
   156  
   157  // handleSwitchoverProgress handles the component progressDetails during switchover.
   158  // Returns:
   159  // - expectCount: the expected count of switchover operations
   160  // - completedCount: the number of completed switchover operations
   161  // - error: any error that occurred during the handling
   162  func handleSwitchoverProgress(reqCtx intctrlutil.RequestCtx, cli client.Client, opsRes *OpsResource) (int32, int32, error) {
   163  	var (
   164  		expectCount         = int32(len(opsRes.OpsRequest.Spec.SwitchoverList))
   165  		completedCount      int32
   166  		opsRequest          = opsRes.OpsRequest
   167  		oldOpsRequestStatus = opsRequest.Status.DeepCopy()
   168  		compDef             *appsv1alpha1.ClusterComponentDefinition
   169  		consistency         bool
   170  		err                 error
   171  	)
   172  	patch := client.MergeFrom(opsRequest.DeepCopy())
   173  	succeedJobs := make([]string, 0, len(opsRes.OpsRequest.Spec.SwitchoverList))
   174  	for _, switchover := range opsRequest.Spec.SwitchoverList {
   175  		switchoverCondition := meta.FindStatusCondition(opsRes.OpsRequest.Status.Conditions, appsv1alpha1.ConditionTypeSwitchover)
   176  		if switchoverCondition == nil {
   177  			err = errors.New("switchover condition is nil")
   178  			break
   179  		}
   180  
   181  		// if the component do not need switchover, skip it
   182  		reason := opsRequest.Status.Components[switchover.ComponentName].Reason
   183  		if reason == OpsReasonForSkipSwitchover {
   184  			completedCount += 1
   185  			continue
   186  		}
   187  
   188  		// check the current component switchoverJob whether succeed
   189  		jobName := genSwitchoverJobName(opsRes.Cluster.Name, switchover.ComponentName, switchoverCondition.ObservedGeneration)
   190  		checkJobProcessDetail := appsv1alpha1.ProgressStatusDetail{
   191  			ObjectKey: getProgressObjectKey(SwitchoverCheckJobKey, jobName),
   192  			Status:    appsv1alpha1.ProcessingProgressStatus,
   193  		}
   194  		if err = checkJobSucceed(reqCtx.Ctx, cli, opsRes.Cluster, jobName); err != nil {
   195  			checkJobProcessDetail.Message = fmt.Sprintf("switchover job %s is not succeed", jobName)
   196  			setComponentSwitchoverProgressDetails(reqCtx.Recorder, opsRequest, appsv1alpha1.UpdatingClusterCompPhase, checkJobProcessDetail, switchover.ComponentName)
   197  			continue
   198  		} else {
   199  			checkJobProcessDetail.Message = fmt.Sprintf("switchover job %s is succeed", jobName)
   200  			checkJobProcessDetail.Status = appsv1alpha1.SucceedProgressStatus
   201  			setComponentSwitchoverProgressDetails(reqCtx.Recorder, opsRequest, appsv1alpha1.UpdatingClusterCompPhase, checkJobProcessDetail, switchover.ComponentName)
   202  		}
   203  
   204  		// check the current component pod role label whether correct
   205  		checkRoleLabelProcessDetail := appsv1alpha1.ProgressStatusDetail{
   206  			ObjectKey: getProgressObjectKey(SwitchoverCheckRoleLabelKey, switchover.ComponentName),
   207  			Status:    appsv1alpha1.ProcessingProgressStatus,
   208  			Message:   fmt.Sprintf("waiting for component %s pod role label consistency after switchover", switchover.ComponentName),
   209  		}
   210  		compDef, err = appsv1alpha1.GetComponentDefByCluster(reqCtx.Ctx, cli, *opsRes.Cluster, switchover.ComponentName)
   211  		if err != nil {
   212  			checkRoleLabelProcessDetail.Message = fmt.Sprintf("handleSwitchoverProgress get component %s definition failed", switchover.ComponentName)
   213  			checkRoleLabelProcessDetail.Status = appsv1alpha1.FailedProgressStatus
   214  			setComponentSwitchoverProgressDetails(reqCtx.Recorder, opsRequest, appsv1alpha1.UpdatingClusterCompPhase, checkRoleLabelProcessDetail, switchover.ComponentName)
   215  			continue
   216  		}
   217  		consistency, err = checkPodRoleLabelConsistency(reqCtx.Ctx, cli, opsRes.Cluster, opsRes.Cluster.Spec.GetComponentByName(switchover.ComponentName), compDef, &switchover, switchoverCondition)
   218  		if err != nil {
   219  			checkRoleLabelProcessDetail.Message = fmt.Sprintf("waiting for component %s pod role label consistency after switchover", switchover.ComponentName)
   220  			setComponentSwitchoverProgressDetails(reqCtx.Recorder, opsRequest, appsv1alpha1.UpdatingClusterCompPhase, checkRoleLabelProcessDetail, switchover.ComponentName)
   221  			continue
   222  		}
   223  
   224  		if !consistency {
   225  			err = intctrlutil.NewErrorf(intctrlutil.ErrorWaitCacheRefresh, "requeue to waiting for pod role label consistency.")
   226  			setComponentSwitchoverProgressDetails(reqCtx.Recorder, opsRequest, appsv1alpha1.UpdatingClusterCompPhase, checkRoleLabelProcessDetail, switchover.ComponentName)
   227  			continue
   228  		} else {
   229  			checkRoleLabelProcessDetail.Message = fmt.Sprintf("check component %s pod role label consistency after switchover is succeed", switchover.ComponentName)
   230  			checkRoleLabelProcessDetail.Status = appsv1alpha1.SucceedProgressStatus
   231  			setComponentSwitchoverProgressDetails(reqCtx.Recorder, opsRequest, appsv1alpha1.UpdatingClusterCompPhase, checkRoleLabelProcessDetail, switchover.ComponentName)
   232  		}
   233  
   234  		// component switchover is successful
   235  		completedCount += 1
   236  		succeedJobs = append(succeedJobs, jobName)
   237  		componentProcessDetail := appsv1alpha1.ProgressStatusDetail{
   238  			ObjectKey: switchover.ComponentName,
   239  			Message:   fmt.Sprintf("switchover job %s is succeed", jobName),
   240  			Status:    appsv1alpha1.SucceedProgressStatus,
   241  		}
   242  		setComponentSwitchoverProgressDetails(reqCtx.Recorder, opsRequest, appsv1alpha1.RunningClusterCompPhase, componentProcessDetail, switchover.ComponentName)
   243  	}
   244  
   245  	opsRequest.Status.Progress = fmt.Sprintf("%d/%d", completedCount, expectCount)
   246  	// patch OpsRequest.status.components
   247  	if !reflect.DeepEqual(*oldOpsRequestStatus, opsRequest.Status) {
   248  		if err := cli.Status().Patch(reqCtx.Ctx, opsRequest, patch); err != nil {
   249  			return expectCount, 0, err
   250  		}
   251  	}
   252  
   253  	if err != nil {
   254  		return expectCount, completedCount, err
   255  	}
   256  
   257  	if completedCount == expectCount {
   258  		for _, jobName := range succeedJobs {
   259  			if err := cleanJobByName(reqCtx.Ctx, cli, opsRes.Cluster, jobName); err != nil {
   260  				reqCtx.Log.Error(err, "clean switchover job failed", "jobName", jobName)
   261  				return expectCount, completedCount, err
   262  			}
   263  		}
   264  	}
   265  
   266  	return expectCount, completedCount, nil
   267  }
   268  
   269  // setComponentSwitchoverProgressDetails sets component switchover progress details.
   270  func setComponentSwitchoverProgressDetails(recorder record.EventRecorder,
   271  	opsRequest *appsv1alpha1.OpsRequest,
   272  	phase appsv1alpha1.ClusterComponentPhase,
   273  	processDetail appsv1alpha1.ProgressStatusDetail,
   274  	componentName string) {
   275  	componentProcessDetails := opsRequest.Status.Components[componentName].ProgressDetails
   276  	setComponentStatusProgressDetail(recorder, opsRequest, &componentProcessDetails, processDetail)
   277  	opsRequest.Status.Components[componentName] = appsv1alpha1.OpsRequestComponentStatus{
   278  		Phase:           phase,
   279  		ProgressDetails: componentProcessDetails,
   280  	}
   281  }