github.com/verrazzano/verrazzano-monitoring-operator@v0.0.30/verrazzano-backup-hook/utilities/k8s/k8sHelper.go (about)

     1  // Copyright (c) 2022, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package k8s
     5  
     6  import (
     7  	"bytes"
     8  	"context"
     9  	"encoding/json"
    10  	"fmt"
    11  	"github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/constants"
    12  	model "github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/types"
    13  	futil "github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/utilities"
    14  	vmofake "github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/utilities/k8s/fake"
    15  	"go.uber.org/zap"
    16  	apps "k8s.io/api/apps/v1"
    17  	v1 "k8s.io/api/core/v1"
    18  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    19  	"k8s.io/apimachinery/pkg/runtime/schema"
    20  	"k8s.io/apimachinery/pkg/types"
    21  	"k8s.io/client-go/kubernetes/scheme"
    22  	"k8s.io/client-go/tools/remotecommand"
    23  	"os"
    24  	"sigs.k8s.io/controller-runtime/pkg/client"
    25  	crtclient "sigs.k8s.io/controller-runtime/pkg/client"
    26  	"strconv"
    27  	"sync"
    28  	"time"
    29  )
    30  
    31  // PopulateConnData creates the connection object that's used to communicate to object store.
    32  func (k *K8sImpl) PopulateConnData(veleroNamespace, backupName string) (*model.ConnectionData, error) {
    33  	k.Log.Infof("Populating connection data from backup '%v' in namespace '%s'", backupName, veleroNamespace)
    34  
    35  	backup, err := k.GetBackup(veleroNamespace, backupName)
    36  	if err != nil {
    37  		return nil, err
    38  	}
    39  
    40  	if backup.Spec.StorageLocation == "default" {
    41  		k.Log.Infof("Default creds not supported. Custom credentaisl needs to be created before creating backup storage location")
    42  		return nil, err
    43  	}
    44  
    45  	k.Log.Infof("Detected Velero backup storage location '%s' in namespace '%s' used by backup '%s'", backup.Spec.StorageLocation, veleroNamespace, backupName)
    46  	bsl, err := k.GetBackupStorageLocation(veleroNamespace, backup.Spec.StorageLocation)
    47  	if err != nil {
    48  		return nil, err
    49  	}
    50  
    51  	secretData, err := k.GetObjectStoreCreds(bsl.Spec.Credential.Name, bsl.Metadata.Namespace, bsl.Spec.Credential.Key)
    52  	if err != nil {
    53  		return nil, err
    54  	}
    55  
    56  	var conData model.ConnectionData
    57  	conData.Secret = *secretData
    58  	conData.RegionName = bsl.Spec.Config.Region
    59  	conData.Endpoint = bsl.Spec.Config.S3URL
    60  	conData.BucketName = bsl.Spec.ObjectStorage.Bucket
    61  	conData.BackupName = backupName
    62  	// For now, we will look at the first POST hook in the first Hook in Velero Backup
    63  	conData.VeleroTimeout = backup.Spec.Hooks.Resources[0].Post[0].Exec.Timeout
    64  
    65  	return &conData, nil
    66  
    67  }
    68  
    69  // GetObjectStoreCreds fetches credentials from Velero Backup object store location.
    70  // This object will be pre-created before the execution of this hook
    71  func (k *K8sImpl) GetObjectStoreCreds(secretName, namespace, secretKey string) (*model.ObjectStoreSecret, error) {
    72  	secret := v1.Secret{}
    73  	if err := k.K8sClient.Get(context.TODO(), crtclient.ObjectKey{Name: secretName, Namespace: namespace}, &secret); err != nil {
    74  		k.Log.Errorf("Failed to retrieve secret '%s' due to : %v", secretName, err)
    75  		return nil, err
    76  	}
    77  
    78  	file, err := futil.CreateTempFileWithData(secret.Data[secretKey])
    79  	if err != nil {
    80  		return nil, err
    81  	}
    82  	defer os.Remove(file)
    83  
    84  	accessKey, secretAccessKey, err := futil.ReadTempCredsFile(file, k.CredentialProfile)
    85  	if err != nil {
    86  		k.Log.Error("Error while reading creds from file ", zap.Error(err))
    87  		return nil, err
    88  	}
    89  
    90  	var secretData model.ObjectStoreSecret
    91  	secretData.SecretName = secretName
    92  	secretData.SecretKey = secretKey
    93  	secretData.ObjectAccessKey = accessKey
    94  	secretData.ObjectSecretKey = secretAccessKey
    95  	return &secretData, nil
    96  }
    97  
    98  // GetBackupStorageLocation retrieves data from the Velero backup storage location
    99  func (k *K8sImpl) GetBackupStorageLocation(veleroNamespace, bslName string) (*model.VeleroBackupStorageLocation, error) {
   100  	k.Log.Infof("Fetching Velero backup storage location '%s' in namespace '%s'", bslName, veleroNamespace)
   101  	gvr := schema.GroupVersionResource{
   102  		Group:    "velero.io",
   103  		Version:  "v1",
   104  		Resource: "backupstoragelocations",
   105  	}
   106  	bslRecievd, err := k.DynamicK8sInterface.Resource(gvr).Namespace(veleroNamespace).Get(context.Background(), bslName, metav1.GetOptions{})
   107  	if err != nil {
   108  		return nil, err
   109  	}
   110  
   111  	if bslRecievd == nil {
   112  		k.Log.Infof("No Velero backup storage location in namespace '%s' was detected", veleroNamespace)
   113  		return nil, err
   114  	}
   115  
   116  	var bsl model.VeleroBackupStorageLocation
   117  	bdata, err := json.Marshal(bslRecievd)
   118  	if err != nil {
   119  		return nil, err
   120  	}
   121  	err = json.Unmarshal(bdata, &bsl)
   122  	if err != nil {
   123  		return nil, err
   124  	}
   125  	return &bsl, nil
   126  }
   127  
   128  // GetBackup Retrieves Velero backup object from the cluster
   129  func (k *K8sImpl) GetBackup(veleroNamespace, backupName string) (*model.VeleroBackup, error) {
   130  	k.Log.Infof("Fetching Velero backup '%s' in namespace '%s'", backupName, veleroNamespace)
   131  	gvr := schema.GroupVersionResource{
   132  		Group:    "velero.io",
   133  		Version:  "v1",
   134  		Resource: "backups",
   135  	}
   136  	backupFetched, err := k.DynamicK8sInterface.Resource(gvr).Namespace(veleroNamespace).Get(context.Background(), backupName, metav1.GetOptions{})
   137  	if err != nil {
   138  		return nil, err
   139  	}
   140  
   141  	if backupFetched == nil {
   142  		k.Log.Infof("No Velero backup in namespace '%s' was detected", veleroNamespace)
   143  		return nil, err
   144  	}
   145  
   146  	var backup model.VeleroBackup
   147  	bdata, err := json.Marshal(backupFetched)
   148  	if err != nil {
   149  		return nil, err
   150  	}
   151  	err = json.Unmarshal(bdata, &backup)
   152  	if err != nil {
   153  		return nil, err
   154  	}
   155  	return &backup, nil
   156  }
   157  
   158  // ScaleDeployment is used to scale a deployment to specific replica count
   159  // labelSelectors, namespace, deploymentName are used to identify deployments
   160  // and specific pods associated with them.
   161  func (k *K8sImpl) ScaleDeployment(labelSelector, namespace, deploymentName string, replicaCount int32) error {
   162  	k.Log.Infof("Scale deployment '%s' in namespace '%s", deploymentName, namespace)
   163  	var wg sync.WaitGroup
   164  	depPatch := apps.Deployment{}
   165  	if err := k.K8sClient.Get(context.TODO(), types.NamespacedName{Name: deploymentName, Namespace: namespace}, &depPatch); err != nil {
   166  		return err
   167  	}
   168  	currentValue := *depPatch.Spec.Replicas
   169  	desiredValue := replicaCount
   170  
   171  	if desiredValue == currentValue {
   172  		k.Log.Infof("Deployment scaling skipped as desired replicas is same as current replicas")
   173  		return nil
   174  	}
   175  
   176  	listOptions := metav1.ListOptions{LabelSelector: labelSelector}
   177  	pods, err := k.K8sInterface.CoreV1().Pods(namespace).List(context.TODO(), listOptions)
   178  	if err != nil {
   179  		return err
   180  	}
   181  	wg.Add(len(pods.Items))
   182  
   183  	mergeFromDep := client.MergeFrom(depPatch.DeepCopy())
   184  	depPatch.Spec.Replicas = &replicaCount
   185  	if err := k.K8sClient.Patch(context.TODO(), &depPatch, mergeFromDep); err != nil {
   186  		k.Log.Error("Unable to patch !!")
   187  		return err
   188  	}
   189  
   190  	timeout := futil.GetEnvWithDefault(constants.OpenSearchHealthCheckTimeoutKey, constants.OpenSearchHealthCheckTimeoutDefaultValue)
   191  
   192  	if desiredValue > currentValue {
   193  		//log.Info("Scaling up pods ...")
   194  		message := "Wait for pods to come up"
   195  		_, err := futil.WaitRandom(message, timeout, k.Log)
   196  		if err != nil {
   197  			return err
   198  		}
   199  
   200  		for _, item := range pods.Items {
   201  			k.Log.Debugf("Firing go routine to check on pod '%s'", item.Name)
   202  			go k.CheckPodStatus(item.Name, namespace, "up", timeout, &wg)
   203  		}
   204  	}
   205  
   206  	if desiredValue < currentValue {
   207  		k.Log.Info("Scaling down pods ...")
   208  		for _, item := range pods.Items {
   209  			k.Log.Debugf("Firing go routine to check on pod '%s'", item.Name)
   210  			go k.CheckPodStatus(item.Name, namespace, "down", timeout, &wg)
   211  		}
   212  	}
   213  
   214  	wg.Wait()
   215  	k.Log.Infof("Successfully scaled deployment '%s' in namespace '%s' from '%v' to '%v' replicas ", deploymentName, namespace, currentValue, replicaCount)
   216  	return nil
   217  
   218  }
   219  
   220  // CheckDeployment checks the existence of a deployment in anamespace
   221  func (k *K8sImpl) CheckDeployment(labelSelector, namespace string) (bool, error) {
   222  	k.Log.Infof("Checking deployment with labelselector '%v' exists in namespace '%s", labelSelector, namespace)
   223  	listOptions := metav1.ListOptions{LabelSelector: labelSelector}
   224  	deployment, err := k.K8sInterface.AppsV1().Deployments(namespace).List(context.TODO(), listOptions)
   225  	if err != nil {
   226  		return false, err
   227  	}
   228  
   229  	// There should be one deployment of kibana
   230  	if len(deployment.Items) == 1 {
   231  		return true, nil
   232  	}
   233  	return false, nil
   234  }
   235  
   236  // IsPodReady checks whether pod is Ready
   237  func (k *K8sImpl) IsPodReady(pod *v1.Pod) (bool, error) {
   238  	for _, condition := range pod.Status.Conditions {
   239  		if condition.Type == "Ready" && condition.Status == "True" {
   240  			k.Log.Infof("Pod '%s' in namespace '%s' is in '%s' state", pod.Name, pod.Namespace, condition.Type)
   241  			return true, nil
   242  		}
   243  	}
   244  	k.Log.Infof("Pod '%s' in namespace '%s' is still not Ready", pod.Name, pod.Namespace)
   245  	return false, nil
   246  }
   247  
   248  // CheckPodStatus checks the state of the pod depending on checkFlag
   249  func (k *K8sImpl) CheckPodStatus(podName, namespace, checkFlag string, timeout string, wg *sync.WaitGroup) error {
   250  	k.Log.Infof("Checking Pod '%s' status in namespace '%s", podName, namespace)
   251  	var timeSeconds float64
   252  	defer wg.Done()
   253  	timeParse, err := time.ParseDuration(timeout)
   254  	if err != nil {
   255  		k.Log.Errorf("Unable to parse time duration ", zap.Error(err))
   256  		return err
   257  	}
   258  	totalSeconds := timeParse.Seconds()
   259  	done := false
   260  	wait := false
   261  
   262  	for !done {
   263  		pod, err := k.K8sInterface.CoreV1().Pods(namespace).Get(context.TODO(), podName, metav1.GetOptions{})
   264  		if err != nil {
   265  			return err
   266  		}
   267  
   268  		if pod == nil && checkFlag == "down" {
   269  			// break loop when scaling down condition is met
   270  			k.Log.Infof("Pod '%s' has scaled down successfully", podName)
   271  			done = true
   272  		}
   273  
   274  		// If pod is found
   275  		if pod != nil {
   276  			switch checkFlag {
   277  			case "up":
   278  				// Check status and apply retry logic
   279  				if pod.Status.Phase != "Running" {
   280  					// Pod is not Running state so we need to wait.
   281  					wait = true
   282  				} else {
   283  					// break loop when scaling up condition is met
   284  					k.Log.Infof("Pod '%s' is in 'Running' state", pod.Name)
   285  					ok, err := k.IsPodReady(pod)
   286  					if err != nil {
   287  						return err
   288  					}
   289  					if ok {
   290  						// break loop pod is Running and pod is in Ready.
   291  						done = true
   292  					} else {
   293  						// Pod is in Running state but still not ready. Hence, we will wait.
   294  						wait = true
   295  					}
   296  				}
   297  
   298  			case "down":
   299  				wait = true
   300  			}
   301  
   302  			if wait {
   303  				fmt.Printf("timeSeconds = %v, totalSeconds = %v ", timeSeconds, totalSeconds)
   304  				if timeSeconds < totalSeconds {
   305  					message := fmt.Sprintf("Pod '%s' is in '%s' state", pod.Name, pod.Status.Phase)
   306  					duration, err := futil.WaitRandom(message, timeout, k.Log)
   307  					if err != nil {
   308  						return err
   309  					}
   310  					timeSeconds = timeSeconds + float64(duration)
   311  
   312  				} else {
   313  					return fmt.Errorf("Timeout '%s' exceeded. Pod '%s' is still not in running state", timeout, pod.Name)
   314  				}
   315  				// change wait to false after each wait
   316  				wait = false
   317  			}
   318  		}
   319  	}
   320  	return nil
   321  }
   322  
   323  // CheckAllPodsAfterRestore checks presence of pods part of Opensearch cluster implementation after restore
   324  func (k *K8sImpl) CheckAllPodsAfterRestore() error {
   325  	timeout := futil.GetEnvWithDefault(constants.OpenSearchHealthCheckTimeoutKey, constants.OpenSearchHealthCheckTimeoutDefaultValue)
   326  
   327  	message := "Waiting for Verrazzano Monitoring Operator to come up"
   328  	_, err := futil.WaitRandom(message, timeout, k.Log)
   329  	if err != nil {
   330  		return err
   331  	}
   332  
   333  	var wg sync.WaitGroup
   334  	k.Log.Infof("Checking pods with labelselector '%v' in namespace '%s", constants.IngestLabelSelector, constants.VerrazzanoSystemNamespace)
   335  	listOptions := metav1.ListOptions{LabelSelector: constants.IngestLabelSelector}
   336  	ingestPods, err := k.K8sInterface.CoreV1().Pods(constants.VerrazzanoSystemNamespace).List(context.TODO(), listOptions)
   337  	if err != nil {
   338  		return err
   339  	}
   340  
   341  	wg.Add(len(ingestPods.Items))
   342  	for _, pod := range ingestPods.Items {
   343  		k.Log.Debugf("Firing go routine to check on pod '%s'", pod.Name)
   344  		go k.CheckPodStatus(pod.Name, constants.VerrazzanoSystemNamespace, "up", timeout, &wg)
   345  	}
   346  
   347  	k.Log.Infof("Checking pods with labelselector '%v' in namespace '%s", constants.KibanaLabelSelector, constants.VerrazzanoSystemNamespace)
   348  	listOptions = metav1.ListOptions{LabelSelector: constants.KibanaLabelSelector}
   349  	kibanaPods, err := k.K8sInterface.CoreV1().Pods(constants.VerrazzanoSystemNamespace).List(context.TODO(), listOptions)
   350  	if err != nil {
   351  		return err
   352  	}
   353  
   354  	wg.Add(len(kibanaPods.Items))
   355  	for _, pod := range kibanaPods.Items {
   356  		k.Log.Debugf("Firing go routine to check on pod '%s'", pod.Name)
   357  		go k.CheckPodStatus(pod.Name, constants.VerrazzanoSystemNamespace, "up", timeout, &wg)
   358  	}
   359  
   360  	wg.Wait()
   361  	return nil
   362  }
   363  
   364  // ExecPod runs a remote command a pod, returning the stdout and stderr of the command.
   365  func (k *K8sImpl) ExecPod(pod *v1.Pod, container string, command []string) (string, string, error) {
   366  	stdout := &bytes.Buffer{}
   367  	stderr := &bytes.Buffer{}
   368  	request := k.K8sInterface.
   369  		CoreV1().
   370  		RESTClient().
   371  		Post().
   372  		Namespace(pod.Namespace).
   373  		Resource("pods").
   374  		Name(pod.Name).
   375  		SubResource("exec").
   376  		VersionedParams(&v1.PodExecOptions{
   377  			Container: container,
   378  			Command:   command,
   379  			Stdin:     false,
   380  			Stdout:    true,
   381  			Stderr:    true,
   382  			TTY:       true,
   383  		}, scheme.ParameterCodec)
   384  
   385  	var executor remotecommand.Executor
   386  	var err error
   387  	if futil.GetEnvWithDefault(constants.DevKey, constants.FalseString) == constants.TrueString {
   388  		executor, err = vmofake.NewPodExecutor(k.K8sConfig, "POST", request.URL())
   389  	} else {
   390  		executor, err = NewPodExecutor(k.K8sConfig, "POST", request.URL())
   391  	}
   392  
   393  	if err != nil {
   394  		return "", "", err
   395  	}
   396  	err = executor.Stream(remotecommand.StreamOptions{
   397  		Stdout: stdout,
   398  		Stderr: stderr,
   399  	})
   400  	if err != nil {
   401  		return "", "", fmt.Errorf("error running command %s on %v/%v: %v", command, pod.Namespace, pod.Name, err)
   402  	}
   403  
   404  	return stdout.String(), stderr.String(), nil
   405  }
   406  
   407  // UpdateKeystore Update Opensearch keystore with object store creds
   408  func (k *K8sImpl) UpdateKeystore(connData *model.ConnectionData, timeout string) (bool, error) {
   409  
   410  	var accessKeyCmd, secretKeyCmd []string
   411  	accessKeyCmd = append(accessKeyCmd, "/bin/sh", "-c", fmt.Sprintf("echo %s | %s", strconv.Quote(connData.Secret.ObjectAccessKey), constants.OpenSearchKeystoreAccessKeyCmd))
   412  	secretKeyCmd = append(secretKeyCmd, "/bin/sh", "-c", fmt.Sprintf("echo %s | %s", strconv.Quote(connData.Secret.ObjectSecretKey), constants.OpenSearchKeystoreSecretAccessKeyCmd))
   413  
   414  	// Updating keystore in other masters
   415  	listOptions := metav1.ListOptions{LabelSelector: constants.OpenSearchMasterLabel}
   416  	esMasterPods, err := k.K8sInterface.CoreV1().Pods(constants.VerrazzanoSystemNamespace).List(context.TODO(), listOptions)
   417  	if err != nil {
   418  		k.Log.Errorf("Unable to fetch list of opensearch master pods")
   419  		return false, err
   420  	}
   421  	for _, pod := range esMasterPods.Items {
   422  		err = k.ExecRetry(&pod, constants.OpenSearchMasterPodContainerName, timeout, accessKeyCmd) //nolint:gosec //#gosec G601
   423  		if err != nil {
   424  			k.Log.Errorf("Unable to exec into pod %s due to %v", pod.Name, err)
   425  			return false, err
   426  		}
   427  
   428  		err = k.ExecRetry(&pod, constants.OpenSearchMasterPodContainerName, timeout, secretKeyCmd) //nolint:gosec //#gosec G601
   429  		if err != nil {
   430  			k.Log.Errorf("Unable to exec into pod %s due to %v", pod.Name, err)
   431  			return false, err
   432  		}
   433  	}
   434  
   435  	// Updating keystore in data nodes
   436  	listOptions = metav1.ListOptions{LabelSelector: constants.OpenSearchDataLabel}
   437  	esDataPods, err := k.K8sInterface.CoreV1().Pods(constants.VerrazzanoSystemNamespace).List(context.TODO(), listOptions)
   438  	if err != nil {
   439  		k.Log.Errorf("Unable to fetch list of opensearch data pods")
   440  		return false, err
   441  	}
   442  
   443  	for _, pod := range esDataPods.Items {
   444  		err = k.ExecRetry(&pod, constants.OpenSearchDataPodContainerName, timeout, accessKeyCmd) //nolint:gosec //#gosec G601
   445  		if err != nil {
   446  			k.Log.Errorf("Unable to exec into pod %s due to %v", pod.Name, err)
   447  			return false, err
   448  		}
   449  
   450  		err = k.ExecRetry(&pod, constants.OpenSearchDataPodContainerName, timeout, secretKeyCmd) //nolint:gosec //#gosec G601
   451  		if err != nil {
   452  			k.Log.Errorf("Unable to exec into pod %s due to %v", pod.Name, err)
   453  			return false, err
   454  		}
   455  	}
   456  
   457  	return true, nil
   458  
   459  }
   460  
   461  func (k *K8sImpl) ExecRetry(pod *v1.Pod, container, timeout string, execCmd []string) error {
   462  	var timeSeconds float64
   463  	done := false
   464  
   465  	timeParse, err := time.ParseDuration(timeout)
   466  	if err != nil {
   467  		k.Log.Errorf("Unable to parse time duration ", zap.Error(err))
   468  		return err
   469  	}
   470  	totalSeconds := timeParse.Seconds()
   471  
   472  	for !done {
   473  		k.Log.Infof("Updating keystore in pod '%s'", pod.Name)
   474  		_, _, err = k.ExecPod(pod, container, execCmd) //nolint:gosec //#gosec G601
   475  		if err != nil {
   476  			if timeSeconds < totalSeconds {
   477  				message := fmt.Sprintf("Unable to exec into pod '%s'", pod.Name)
   478  				duration, err := futil.WaitRandom(message, timeout, k.Log)
   479  				if err != nil {
   480  					return err
   481  				}
   482  				timeSeconds = timeSeconds + float64(duration)
   483  			} else {
   484  				k.Log.Errorf("Global timeout '%s' exceeded. Unable to exec into pod", timeout)
   485  				return err
   486  			}
   487  		} else {
   488  			done = true
   489  		}
   490  	}
   491  	return nil
   492  }