github.com/verrazzano/verrazzano-monitoring-operator@v0.0.30/verrazzano-backup-hook/utilities/k8s/k8sHelper.go (about) 1 // Copyright (c) 2022, Oracle and/or its affiliates. 2 // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl. 3 4 package k8s 5 6 import ( 7 "bytes" 8 "context" 9 "encoding/json" 10 "fmt" 11 "github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/constants" 12 model "github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/types" 13 futil "github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/utilities" 14 vmofake "github.com/verrazzano/verrazzano-monitoring-operator/verrazzano-backup-hook/utilities/k8s/fake" 15 "go.uber.org/zap" 16 apps "k8s.io/api/apps/v1" 17 v1 "k8s.io/api/core/v1" 18 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 19 "k8s.io/apimachinery/pkg/runtime/schema" 20 "k8s.io/apimachinery/pkg/types" 21 "k8s.io/client-go/kubernetes/scheme" 22 "k8s.io/client-go/tools/remotecommand" 23 "os" 24 "sigs.k8s.io/controller-runtime/pkg/client" 25 crtclient "sigs.k8s.io/controller-runtime/pkg/client" 26 "strconv" 27 "sync" 28 "time" 29 ) 30 31 // PopulateConnData creates the connection object that's used to communicate to object store. 32 func (k *K8sImpl) PopulateConnData(veleroNamespace, backupName string) (*model.ConnectionData, error) { 33 k.Log.Infof("Populating connection data from backup '%v' in namespace '%s'", backupName, veleroNamespace) 34 35 backup, err := k.GetBackup(veleroNamespace, backupName) 36 if err != nil { 37 return nil, err 38 } 39 40 if backup.Spec.StorageLocation == "default" { 41 k.Log.Infof("Default creds not supported. Custom credentaisl needs to be created before creating backup storage location") 42 return nil, err 43 } 44 45 k.Log.Infof("Detected Velero backup storage location '%s' in namespace '%s' used by backup '%s'", backup.Spec.StorageLocation, veleroNamespace, backupName) 46 bsl, err := k.GetBackupStorageLocation(veleroNamespace, backup.Spec.StorageLocation) 47 if err != nil { 48 return nil, err 49 } 50 51 secretData, err := k.GetObjectStoreCreds(bsl.Spec.Credential.Name, bsl.Metadata.Namespace, bsl.Spec.Credential.Key) 52 if err != nil { 53 return nil, err 54 } 55 56 var conData model.ConnectionData 57 conData.Secret = *secretData 58 conData.RegionName = bsl.Spec.Config.Region 59 conData.Endpoint = bsl.Spec.Config.S3URL 60 conData.BucketName = bsl.Spec.ObjectStorage.Bucket 61 conData.BackupName = backupName 62 // For now, we will look at the first POST hook in the first Hook in Velero Backup 63 conData.VeleroTimeout = backup.Spec.Hooks.Resources[0].Post[0].Exec.Timeout 64 65 return &conData, nil 66 67 } 68 69 // GetObjectStoreCreds fetches credentials from Velero Backup object store location. 70 // This object will be pre-created before the execution of this hook 71 func (k *K8sImpl) GetObjectStoreCreds(secretName, namespace, secretKey string) (*model.ObjectStoreSecret, error) { 72 secret := v1.Secret{} 73 if err := k.K8sClient.Get(context.TODO(), crtclient.ObjectKey{Name: secretName, Namespace: namespace}, &secret); err != nil { 74 k.Log.Errorf("Failed to retrieve secret '%s' due to : %v", secretName, err) 75 return nil, err 76 } 77 78 file, err := futil.CreateTempFileWithData(secret.Data[secretKey]) 79 if err != nil { 80 return nil, err 81 } 82 defer os.Remove(file) 83 84 accessKey, secretAccessKey, err := futil.ReadTempCredsFile(file, k.CredentialProfile) 85 if err != nil { 86 k.Log.Error("Error while reading creds from file ", zap.Error(err)) 87 return nil, err 88 } 89 90 var secretData model.ObjectStoreSecret 91 secretData.SecretName = secretName 92 secretData.SecretKey = secretKey 93 secretData.ObjectAccessKey = accessKey 94 secretData.ObjectSecretKey = secretAccessKey 95 return &secretData, nil 96 } 97 98 // GetBackupStorageLocation retrieves data from the Velero backup storage location 99 func (k *K8sImpl) GetBackupStorageLocation(veleroNamespace, bslName string) (*model.VeleroBackupStorageLocation, error) { 100 k.Log.Infof("Fetching Velero backup storage location '%s' in namespace '%s'", bslName, veleroNamespace) 101 gvr := schema.GroupVersionResource{ 102 Group: "velero.io", 103 Version: "v1", 104 Resource: "backupstoragelocations", 105 } 106 bslRecievd, err := k.DynamicK8sInterface.Resource(gvr).Namespace(veleroNamespace).Get(context.Background(), bslName, metav1.GetOptions{}) 107 if err != nil { 108 return nil, err 109 } 110 111 if bslRecievd == nil { 112 k.Log.Infof("No Velero backup storage location in namespace '%s' was detected", veleroNamespace) 113 return nil, err 114 } 115 116 var bsl model.VeleroBackupStorageLocation 117 bdata, err := json.Marshal(bslRecievd) 118 if err != nil { 119 return nil, err 120 } 121 err = json.Unmarshal(bdata, &bsl) 122 if err != nil { 123 return nil, err 124 } 125 return &bsl, nil 126 } 127 128 // GetBackup Retrieves Velero backup object from the cluster 129 func (k *K8sImpl) GetBackup(veleroNamespace, backupName string) (*model.VeleroBackup, error) { 130 k.Log.Infof("Fetching Velero backup '%s' in namespace '%s'", backupName, veleroNamespace) 131 gvr := schema.GroupVersionResource{ 132 Group: "velero.io", 133 Version: "v1", 134 Resource: "backups", 135 } 136 backupFetched, err := k.DynamicK8sInterface.Resource(gvr).Namespace(veleroNamespace).Get(context.Background(), backupName, metav1.GetOptions{}) 137 if err != nil { 138 return nil, err 139 } 140 141 if backupFetched == nil { 142 k.Log.Infof("No Velero backup in namespace '%s' was detected", veleroNamespace) 143 return nil, err 144 } 145 146 var backup model.VeleroBackup 147 bdata, err := json.Marshal(backupFetched) 148 if err != nil { 149 return nil, err 150 } 151 err = json.Unmarshal(bdata, &backup) 152 if err != nil { 153 return nil, err 154 } 155 return &backup, nil 156 } 157 158 // ScaleDeployment is used to scale a deployment to specific replica count 159 // labelSelectors, namespace, deploymentName are used to identify deployments 160 // and specific pods associated with them. 161 func (k *K8sImpl) ScaleDeployment(labelSelector, namespace, deploymentName string, replicaCount int32) error { 162 k.Log.Infof("Scale deployment '%s' in namespace '%s", deploymentName, namespace) 163 var wg sync.WaitGroup 164 depPatch := apps.Deployment{} 165 if err := k.K8sClient.Get(context.TODO(), types.NamespacedName{Name: deploymentName, Namespace: namespace}, &depPatch); err != nil { 166 return err 167 } 168 currentValue := *depPatch.Spec.Replicas 169 desiredValue := replicaCount 170 171 if desiredValue == currentValue { 172 k.Log.Infof("Deployment scaling skipped as desired replicas is same as current replicas") 173 return nil 174 } 175 176 listOptions := metav1.ListOptions{LabelSelector: labelSelector} 177 pods, err := k.K8sInterface.CoreV1().Pods(namespace).List(context.TODO(), listOptions) 178 if err != nil { 179 return err 180 } 181 wg.Add(len(pods.Items)) 182 183 mergeFromDep := client.MergeFrom(depPatch.DeepCopy()) 184 depPatch.Spec.Replicas = &replicaCount 185 if err := k.K8sClient.Patch(context.TODO(), &depPatch, mergeFromDep); err != nil { 186 k.Log.Error("Unable to patch !!") 187 return err 188 } 189 190 timeout := futil.GetEnvWithDefault(constants.OpenSearchHealthCheckTimeoutKey, constants.OpenSearchHealthCheckTimeoutDefaultValue) 191 192 if desiredValue > currentValue { 193 //log.Info("Scaling up pods ...") 194 message := "Wait for pods to come up" 195 _, err := futil.WaitRandom(message, timeout, k.Log) 196 if err != nil { 197 return err 198 } 199 200 for _, item := range pods.Items { 201 k.Log.Debugf("Firing go routine to check on pod '%s'", item.Name) 202 go k.CheckPodStatus(item.Name, namespace, "up", timeout, &wg) 203 } 204 } 205 206 if desiredValue < currentValue { 207 k.Log.Info("Scaling down pods ...") 208 for _, item := range pods.Items { 209 k.Log.Debugf("Firing go routine to check on pod '%s'", item.Name) 210 go k.CheckPodStatus(item.Name, namespace, "down", timeout, &wg) 211 } 212 } 213 214 wg.Wait() 215 k.Log.Infof("Successfully scaled deployment '%s' in namespace '%s' from '%v' to '%v' replicas ", deploymentName, namespace, currentValue, replicaCount) 216 return nil 217 218 } 219 220 // CheckDeployment checks the existence of a deployment in anamespace 221 func (k *K8sImpl) CheckDeployment(labelSelector, namespace string) (bool, error) { 222 k.Log.Infof("Checking deployment with labelselector '%v' exists in namespace '%s", labelSelector, namespace) 223 listOptions := metav1.ListOptions{LabelSelector: labelSelector} 224 deployment, err := k.K8sInterface.AppsV1().Deployments(namespace).List(context.TODO(), listOptions) 225 if err != nil { 226 return false, err 227 } 228 229 // There should be one deployment of kibana 230 if len(deployment.Items) == 1 { 231 return true, nil 232 } 233 return false, nil 234 } 235 236 // IsPodReady checks whether pod is Ready 237 func (k *K8sImpl) IsPodReady(pod *v1.Pod) (bool, error) { 238 for _, condition := range pod.Status.Conditions { 239 if condition.Type == "Ready" && condition.Status == "True" { 240 k.Log.Infof("Pod '%s' in namespace '%s' is in '%s' state", pod.Name, pod.Namespace, condition.Type) 241 return true, nil 242 } 243 } 244 k.Log.Infof("Pod '%s' in namespace '%s' is still not Ready", pod.Name, pod.Namespace) 245 return false, nil 246 } 247 248 // CheckPodStatus checks the state of the pod depending on checkFlag 249 func (k *K8sImpl) CheckPodStatus(podName, namespace, checkFlag string, timeout string, wg *sync.WaitGroup) error { 250 k.Log.Infof("Checking Pod '%s' status in namespace '%s", podName, namespace) 251 var timeSeconds float64 252 defer wg.Done() 253 timeParse, err := time.ParseDuration(timeout) 254 if err != nil { 255 k.Log.Errorf("Unable to parse time duration ", zap.Error(err)) 256 return err 257 } 258 totalSeconds := timeParse.Seconds() 259 done := false 260 wait := false 261 262 for !done { 263 pod, err := k.K8sInterface.CoreV1().Pods(namespace).Get(context.TODO(), podName, metav1.GetOptions{}) 264 if err != nil { 265 return err 266 } 267 268 if pod == nil && checkFlag == "down" { 269 // break loop when scaling down condition is met 270 k.Log.Infof("Pod '%s' has scaled down successfully", podName) 271 done = true 272 } 273 274 // If pod is found 275 if pod != nil { 276 switch checkFlag { 277 case "up": 278 // Check status and apply retry logic 279 if pod.Status.Phase != "Running" { 280 // Pod is not Running state so we need to wait. 281 wait = true 282 } else { 283 // break loop when scaling up condition is met 284 k.Log.Infof("Pod '%s' is in 'Running' state", pod.Name) 285 ok, err := k.IsPodReady(pod) 286 if err != nil { 287 return err 288 } 289 if ok { 290 // break loop pod is Running and pod is in Ready. 291 done = true 292 } else { 293 // Pod is in Running state but still not ready. Hence, we will wait. 294 wait = true 295 } 296 } 297 298 case "down": 299 wait = true 300 } 301 302 if wait { 303 fmt.Printf("timeSeconds = %v, totalSeconds = %v ", timeSeconds, totalSeconds) 304 if timeSeconds < totalSeconds { 305 message := fmt.Sprintf("Pod '%s' is in '%s' state", pod.Name, pod.Status.Phase) 306 duration, err := futil.WaitRandom(message, timeout, k.Log) 307 if err != nil { 308 return err 309 } 310 timeSeconds = timeSeconds + float64(duration) 311 312 } else { 313 return fmt.Errorf("Timeout '%s' exceeded. Pod '%s' is still not in running state", timeout, pod.Name) 314 } 315 // change wait to false after each wait 316 wait = false 317 } 318 } 319 } 320 return nil 321 } 322 323 // CheckAllPodsAfterRestore checks presence of pods part of Opensearch cluster implementation after restore 324 func (k *K8sImpl) CheckAllPodsAfterRestore() error { 325 timeout := futil.GetEnvWithDefault(constants.OpenSearchHealthCheckTimeoutKey, constants.OpenSearchHealthCheckTimeoutDefaultValue) 326 327 message := "Waiting for Verrazzano Monitoring Operator to come up" 328 _, err := futil.WaitRandom(message, timeout, k.Log) 329 if err != nil { 330 return err 331 } 332 333 var wg sync.WaitGroup 334 k.Log.Infof("Checking pods with labelselector '%v' in namespace '%s", constants.IngestLabelSelector, constants.VerrazzanoSystemNamespace) 335 listOptions := metav1.ListOptions{LabelSelector: constants.IngestLabelSelector} 336 ingestPods, err := k.K8sInterface.CoreV1().Pods(constants.VerrazzanoSystemNamespace).List(context.TODO(), listOptions) 337 if err != nil { 338 return err 339 } 340 341 wg.Add(len(ingestPods.Items)) 342 for _, pod := range ingestPods.Items { 343 k.Log.Debugf("Firing go routine to check on pod '%s'", pod.Name) 344 go k.CheckPodStatus(pod.Name, constants.VerrazzanoSystemNamespace, "up", timeout, &wg) 345 } 346 347 k.Log.Infof("Checking pods with labelselector '%v' in namespace '%s", constants.KibanaLabelSelector, constants.VerrazzanoSystemNamespace) 348 listOptions = metav1.ListOptions{LabelSelector: constants.KibanaLabelSelector} 349 kibanaPods, err := k.K8sInterface.CoreV1().Pods(constants.VerrazzanoSystemNamespace).List(context.TODO(), listOptions) 350 if err != nil { 351 return err 352 } 353 354 wg.Add(len(kibanaPods.Items)) 355 for _, pod := range kibanaPods.Items { 356 k.Log.Debugf("Firing go routine to check on pod '%s'", pod.Name) 357 go k.CheckPodStatus(pod.Name, constants.VerrazzanoSystemNamespace, "up", timeout, &wg) 358 } 359 360 wg.Wait() 361 return nil 362 } 363 364 // ExecPod runs a remote command a pod, returning the stdout and stderr of the command. 365 func (k *K8sImpl) ExecPod(pod *v1.Pod, container string, command []string) (string, string, error) { 366 stdout := &bytes.Buffer{} 367 stderr := &bytes.Buffer{} 368 request := k.K8sInterface. 369 CoreV1(). 370 RESTClient(). 371 Post(). 372 Namespace(pod.Namespace). 373 Resource("pods"). 374 Name(pod.Name). 375 SubResource("exec"). 376 VersionedParams(&v1.PodExecOptions{ 377 Container: container, 378 Command: command, 379 Stdin: false, 380 Stdout: true, 381 Stderr: true, 382 TTY: true, 383 }, scheme.ParameterCodec) 384 385 var executor remotecommand.Executor 386 var err error 387 if futil.GetEnvWithDefault(constants.DevKey, constants.FalseString) == constants.TrueString { 388 executor, err = vmofake.NewPodExecutor(k.K8sConfig, "POST", request.URL()) 389 } else { 390 executor, err = NewPodExecutor(k.K8sConfig, "POST", request.URL()) 391 } 392 393 if err != nil { 394 return "", "", err 395 } 396 err = executor.Stream(remotecommand.StreamOptions{ 397 Stdout: stdout, 398 Stderr: stderr, 399 }) 400 if err != nil { 401 return "", "", fmt.Errorf("error running command %s on %v/%v: %v", command, pod.Namespace, pod.Name, err) 402 } 403 404 return stdout.String(), stderr.String(), nil 405 } 406 407 // UpdateKeystore Update Opensearch keystore with object store creds 408 func (k *K8sImpl) UpdateKeystore(connData *model.ConnectionData, timeout string) (bool, error) { 409 410 var accessKeyCmd, secretKeyCmd []string 411 accessKeyCmd = append(accessKeyCmd, "/bin/sh", "-c", fmt.Sprintf("echo %s | %s", strconv.Quote(connData.Secret.ObjectAccessKey), constants.OpenSearchKeystoreAccessKeyCmd)) 412 secretKeyCmd = append(secretKeyCmd, "/bin/sh", "-c", fmt.Sprintf("echo %s | %s", strconv.Quote(connData.Secret.ObjectSecretKey), constants.OpenSearchKeystoreSecretAccessKeyCmd)) 413 414 // Updating keystore in other masters 415 listOptions := metav1.ListOptions{LabelSelector: constants.OpenSearchMasterLabel} 416 esMasterPods, err := k.K8sInterface.CoreV1().Pods(constants.VerrazzanoSystemNamespace).List(context.TODO(), listOptions) 417 if err != nil { 418 k.Log.Errorf("Unable to fetch list of opensearch master pods") 419 return false, err 420 } 421 for _, pod := range esMasterPods.Items { 422 err = k.ExecRetry(&pod, constants.OpenSearchMasterPodContainerName, timeout, accessKeyCmd) //nolint:gosec //#gosec G601 423 if err != nil { 424 k.Log.Errorf("Unable to exec into pod %s due to %v", pod.Name, err) 425 return false, err 426 } 427 428 err = k.ExecRetry(&pod, constants.OpenSearchMasterPodContainerName, timeout, secretKeyCmd) //nolint:gosec //#gosec G601 429 if err != nil { 430 k.Log.Errorf("Unable to exec into pod %s due to %v", pod.Name, err) 431 return false, err 432 } 433 } 434 435 // Updating keystore in data nodes 436 listOptions = metav1.ListOptions{LabelSelector: constants.OpenSearchDataLabel} 437 esDataPods, err := k.K8sInterface.CoreV1().Pods(constants.VerrazzanoSystemNamespace).List(context.TODO(), listOptions) 438 if err != nil { 439 k.Log.Errorf("Unable to fetch list of opensearch data pods") 440 return false, err 441 } 442 443 for _, pod := range esDataPods.Items { 444 err = k.ExecRetry(&pod, constants.OpenSearchDataPodContainerName, timeout, accessKeyCmd) //nolint:gosec //#gosec G601 445 if err != nil { 446 k.Log.Errorf("Unable to exec into pod %s due to %v", pod.Name, err) 447 return false, err 448 } 449 450 err = k.ExecRetry(&pod, constants.OpenSearchDataPodContainerName, timeout, secretKeyCmd) //nolint:gosec //#gosec G601 451 if err != nil { 452 k.Log.Errorf("Unable to exec into pod %s due to %v", pod.Name, err) 453 return false, err 454 } 455 } 456 457 return true, nil 458 459 } 460 461 func (k *K8sImpl) ExecRetry(pod *v1.Pod, container, timeout string, execCmd []string) error { 462 var timeSeconds float64 463 done := false 464 465 timeParse, err := time.ParseDuration(timeout) 466 if err != nil { 467 k.Log.Errorf("Unable to parse time duration ", zap.Error(err)) 468 return err 469 } 470 totalSeconds := timeParse.Seconds() 471 472 for !done { 473 k.Log.Infof("Updating keystore in pod '%s'", pod.Name) 474 _, _, err = k.ExecPod(pod, container, execCmd) //nolint:gosec //#gosec G601 475 if err != nil { 476 if timeSeconds < totalSeconds { 477 message := fmt.Sprintf("Unable to exec into pod '%s'", pod.Name) 478 duration, err := futil.WaitRandom(message, timeout, k.Log) 479 if err != nil { 480 return err 481 } 482 timeSeconds = timeSeconds + float64(duration) 483 } else { 484 k.Log.Errorf("Global timeout '%s' exceeded. Unable to exec into pod", timeout) 485 return err 486 } 487 } else { 488 done = true 489 } 490 } 491 return nil 492 }