github.com/vmware/go-vcloud-director/v2@v2.24.0/govcd/cse.go (about)

     1  package govcd
     2  
     3  import (
     4  	"encoding/json"
     5  	"fmt"
     6  	semver "github.com/hashicorp/go-version"
     7  	"github.com/vmware/go-vcloud-director/v2/types/v56"
     8  	"github.com/vmware/go-vcloud-director/v2/util"
     9  	"strings"
    10  	"time"
    11  )
    12  
    13  // CseCreateKubernetesCluster creates a Kubernetes cluster with the data given as input (CseClusterSettings). If the given
    14  // timeout is 0, it waits forever for the cluster creation.
    15  //
    16  // If the timeout is reached and the cluster is not available (in "provisioned" state), it will return a non-nil CseKubernetesCluster
    17  // with only the cluster ID and an error. This means that the cluster will be left in VCD in any state, and it can be retrieved afterward
    18  // with Org.CseGetKubernetesClusterById and the returned ID.
    19  //
    20  // If the cluster is created correctly, returns all the available data in CseKubernetesCluster or an error if some of the fields
    21  // of the created cluster cannot be calculated or retrieved.
    22  func (org *Org) CseCreateKubernetesCluster(clusterData CseClusterSettings, timeout time.Duration) (*CseKubernetesCluster, error) {
    23  	clusterId, err := org.CseCreateKubernetesClusterAsync(clusterData)
    24  	if err != nil {
    25  		return nil, err
    26  	}
    27  
    28  	err = waitUntilClusterIsProvisioned(org.client, clusterId, timeout)
    29  	if err != nil {
    30  		return &CseKubernetesCluster{
    31  			client: org.client,
    32  			ID:     clusterId,
    33  		}, err
    34  	}
    35  
    36  	return getCseKubernetesClusterById(org.client, clusterId)
    37  }
    38  
    39  // CseCreateKubernetesClusterAsync creates a Kubernetes cluster with the data given as input (CseClusterSettings), but does not
    40  // wait for the creation process to finish, so it doesn't monitor for any errors during the process. It returns just the ID of
    41  // the created cluster. One can manually check the status of the cluster with VCDClient.CseGetKubernetesClusterById and the result of this method.
    42  func (org *Org) CseCreateKubernetesClusterAsync(clusterSettings CseClusterSettings) (string, error) {
    43  	if org == nil {
    44  		return "", fmt.Errorf("CseCreateKubernetesClusterAsync cannot be called on a nil Organization receiver")
    45  	}
    46  
    47  	tenantContext, err := org.getTenantContext()
    48  	if err != nil {
    49  		return "", fmt.Errorf("error creating the CSE Kubernetes cluster: %s", err)
    50  	}
    51  
    52  	cseSubcomponents, err := getCseComponentsVersions(clusterSettings.CseVersion)
    53  	if err != nil {
    54  		return "", err
    55  	}
    56  
    57  	internalSettings, err := clusterSettings.toCseClusterSettingsInternal(*org)
    58  	if err != nil {
    59  		return "", fmt.Errorf("error creating the CSE Kubernetes cluster: %s", err)
    60  	}
    61  
    62  	payload, err := internalSettings.getUnmarshalledRdePayload()
    63  	if err != nil {
    64  		return "", err
    65  	}
    66  
    67  	rde, err := createRdeAndGetFromTask(org.client, cseKubernetesClusterVendor, cseKubernetesClusterNamespace, cseSubcomponents.CapvcdRdeTypeVersion,
    68  		types.DefinedEntity{
    69  			EntityType: internalSettings.RdeType.ID,
    70  			Name:       internalSettings.Name,
    71  			Entity:     payload,
    72  		}, tenantContext)
    73  	if err != nil {
    74  		return "", fmt.Errorf("error creating the CSE Kubernetes cluster: %s", err)
    75  	}
    76  
    77  	return rde.DefinedEntity.ID, nil
    78  }
    79  
    80  // CseGetKubernetesClusterById retrieves a CSE Kubernetes cluster from VCD by its unique ID
    81  func (vcdClient *VCDClient) CseGetKubernetesClusterById(id string) (*CseKubernetesCluster, error) {
    82  	return getCseKubernetesClusterById(&vcdClient.Client, id)
    83  }
    84  
    85  // CseGetKubernetesClustersByName retrieves all the CSE Kubernetes clusters from VCD with the given name that belong to the receiver Organization.
    86  // Note: The clusters retrieved won't have a valid ETag to perform operations on them. Use VCDClient.CseGetKubernetesClusterById for that instead.
    87  func (org *Org) CseGetKubernetesClustersByName(cseVersion semver.Version, name string) ([]*CseKubernetesCluster, error) {
    88  	cseSubcomponents, err := getCseComponentsVersions(cseVersion)
    89  	if err != nil {
    90  		return nil, err
    91  	}
    92  
    93  	rdes, err := getRdesByName(org.client, cseKubernetesClusterVendor, cseKubernetesClusterNamespace, cseSubcomponents.CapvcdRdeTypeVersion, name)
    94  	if err != nil {
    95  		return nil, err
    96  	}
    97  	var clusters []*CseKubernetesCluster
    98  	for _, rde := range rdes {
    99  		if rde.DefinedEntity.Org != nil && rde.DefinedEntity.Org.ID == org.Org.ID {
   100  			cluster, err := cseConvertToCseKubernetesClusterType(rde)
   101  			if err != nil {
   102  				return nil, err
   103  			}
   104  			clusters = append(clusters, cluster)
   105  		}
   106  	}
   107  	return clusters, nil
   108  }
   109  
   110  // getCseKubernetesClusterById retrieves a CSE Kubernetes cluster from VCD by its unique ID
   111  func getCseKubernetesClusterById(client *Client, clusterId string) (*CseKubernetesCluster, error) {
   112  	rde, err := getRdeById(client, clusterId)
   113  	if err != nil {
   114  		return nil, err
   115  	}
   116  	return cseConvertToCseKubernetesClusterType(rde)
   117  }
   118  
   119  // Refresh gets the latest information about the receiver CSE Kubernetes cluster and updates its properties.
   120  // All cached fields such as the supported OVAs list (from CseKubernetesCluster.GetSupportedUpgrades) are also cleared.
   121  func (cluster *CseKubernetesCluster) Refresh() error {
   122  	refreshed, err := getCseKubernetesClusterById(cluster.client, cluster.ID)
   123  	if err != nil {
   124  		return fmt.Errorf("failed refreshing the CSE Kubernetes Cluster: %s", err)
   125  	}
   126  	*cluster = *refreshed
   127  	return nil
   128  }
   129  
   130  // GetKubeconfig retrieves the Kubeconfig from an existing CSE Kubernetes cluster that is in provisioned state.
   131  // If refresh=true, it retrieves the latest state of the cluster from VCD before requesting the Kubeconfig.
   132  func (cluster *CseKubernetesCluster) GetKubeconfig(refresh bool) (string, error) {
   133  	if refresh {
   134  		err := cluster.Refresh()
   135  		if err != nil {
   136  			return "", err
   137  		}
   138  	}
   139  
   140  	if cluster.State == "" {
   141  		return "", fmt.Errorf("cannot get a Kubeconfig of a Kubernetes cluster that does not have a state (expected 'provisioned')")
   142  	}
   143  
   144  	if cluster.State != "provisioned" {
   145  		return "", fmt.Errorf("cannot get a Kubeconfig of a Kubernetes cluster that is not in 'provisioned' state. It is '%s'", cluster.State)
   146  	}
   147  
   148  	rde, err := getRdeById(cluster.client, cluster.ID)
   149  	if err != nil {
   150  		return "", err
   151  	}
   152  	versions, err := getCseComponentsVersions(cluster.CseVersion)
   153  	if err != nil {
   154  		return "", err
   155  	}
   156  
   157  	// Auxiliary wrapper of the result, as the invocation returns the RDE and
   158  	// what we need is inside of it.
   159  	type invocationResult struct {
   160  		Capvcd types.Capvcd `json:"entity,omitempty"`
   161  	}
   162  	result := invocationResult{}
   163  
   164  	err = rde.InvokeBehaviorAndMarshal(fmt.Sprintf("urn:vcloud:behavior-interface:getFullEntity:cse:capvcd:%s", versions.CseInterfaceVersion), types.BehaviorInvocation{}, &result)
   165  	if err != nil {
   166  		return "", fmt.Errorf("could not retrieve the Kubeconfig, the Behavior invocation failed: %s", err)
   167  	}
   168  	if result.Capvcd.Status.Capvcd.Private == nil {
   169  		return "", fmt.Errorf("could not retrieve the Kubeconfig, the Behavior invocation succeeded but the Kubeconfig is nil")
   170  	}
   171  	if result.Capvcd.Status.Capvcd.Private.KubeConfig == "" {
   172  		return "", fmt.Errorf("could not retrieve the Kubeconfig, the Behavior invocation succeeded but the Kubeconfig is empty")
   173  	}
   174  	return result.Capvcd.Status.Capvcd.Private.KubeConfig, nil
   175  }
   176  
   177  // UpdateWorkerPools executes an update on the receiver cluster to change the existing Worker Pools.
   178  // The input is a map where the key is the Worker pool unique name, and the value is the update payload for that Worker Pool.
   179  // If refresh=true, it retrieves the latest state of the cluster from VCD before updating.
   180  // WARNING: At least one worker pool must have one or more nodes running, otherwise the cluster will be left in an unusable state.
   181  func (cluster *CseKubernetesCluster) UpdateWorkerPools(input map[string]CseWorkerPoolUpdateInput, refresh bool) error {
   182  	return cluster.Update(CseClusterUpdateInput{
   183  		WorkerPools: &input,
   184  	}, refresh)
   185  }
   186  
   187  // AddWorkerPools executes an update on the receiver cluster to add new Worker Pools.
   188  // If refresh=true, it retrieves the latest state of the cluster from VCD before updating.
   189  func (cluster *CseKubernetesCluster) AddWorkerPools(input []CseWorkerPoolSettings, refresh bool) error {
   190  	return cluster.Update(CseClusterUpdateInput{
   191  		NewWorkerPools: &input,
   192  	}, refresh)
   193  }
   194  
   195  // UpdateControlPlane executes an update on the receiver cluster to change the existing control plane.
   196  // If refresh=true, it retrieves the latest state of the cluster from VCD before updating.
   197  func (cluster *CseKubernetesCluster) UpdateControlPlane(input CseControlPlaneUpdateInput, refresh bool) error {
   198  	return cluster.Update(CseClusterUpdateInput{
   199  		ControlPlane: &input,
   200  	}, refresh)
   201  }
   202  
   203  // GetSupportedUpgrades queries all vApp Templates from VCD, one by one, and returns those that can be used for upgrading the cluster.
   204  // As retrieving all OVAs one by one from VCD is expensive, the first time this method is called the returned OVAs are
   205  // cached to avoid querying VCD again multiple times.
   206  // If refreshOvas=true, this cache is cleared out and this method will query VCD for every vApp Template again.
   207  // Therefore, the refreshOvas flag should be set to true only when VCD has new OVAs that need to be considered or after a cluster upgrade.
   208  // NOTE: Any refresh operation from other methods will cause the cache to be cleared.
   209  func (cluster *CseKubernetesCluster) GetSupportedUpgrades(refreshOvas bool) ([]*types.VAppTemplate, error) {
   210  	if refreshOvas {
   211  		cluster.supportedUpgrades = make([]*types.VAppTemplate, 0)
   212  	}
   213  	if cluster.State != "provisioned" {
   214  		cluster.supportedUpgrades = make([]*types.VAppTemplate, 0)
   215  		return cluster.supportedUpgrades, nil
   216  	}
   217  	if len(cluster.supportedUpgrades) > 0 {
   218  		return cluster.supportedUpgrades, nil
   219  	}
   220  
   221  	vAppTemplates, err := queryVappTemplateListWithFilter(cluster.client, nil)
   222  	if err != nil {
   223  		return nil, fmt.Errorf("could not get vApp Templates: %s", err)
   224  	}
   225  	for _, template := range vAppTemplates {
   226  		// We can only know if the vApp Template is a TKGm OVA by inspecting its internals, hence we need to retrieve every one
   227  		// of them one by one. This is an expensive operation, hence the cache.
   228  		vAppTemplate, err := getVAppTemplateById(cluster.client, fmt.Sprintf("urn:vcloud:vapptemplate:%s", extractUuid(template.HREF)))
   229  		if err != nil {
   230  			continue // This means we cannot retrieve it (maybe due to some rights missing), so we cannot use it. We skip it
   231  		}
   232  		targetVersions, err := getTkgVersionBundleFromVAppTemplate(vAppTemplate.VAppTemplate)
   233  		if err != nil {
   234  			continue // This means it's not a TKGm OVA, or it is not supported, so we skip it
   235  		}
   236  		// The OVA can be used if the TKG version is equal to the actual or higher, and the Kubernetes version is at most 1 minor higher.
   237  		if targetVersions.compareTkgVersion(cluster.TkgVersion.String()) >= 0 && targetVersions.kubernetesVersionIsUpgradeableFrom(cluster.KubernetesVersion.String()) {
   238  			cluster.supportedUpgrades = append(cluster.supportedUpgrades, vAppTemplate.VAppTemplate)
   239  		}
   240  	}
   241  	return cluster.supportedUpgrades, nil
   242  }
   243  
   244  // UpgradeCluster executes an update on the receiver cluster to upgrade the Kubernetes template of the cluster.
   245  // If the cluster is not upgradeable or the OVA is incorrect, this method will return an error.
   246  // If refresh=true, it retrieves the latest state of the cluster from VCD before updating.
   247  func (cluster *CseKubernetesCluster) UpgradeCluster(kubernetesTemplateOvaId string, refresh bool) error {
   248  	return cluster.Update(CseClusterUpdateInput{
   249  		KubernetesTemplateOvaId: &kubernetesTemplateOvaId,
   250  	}, refresh)
   251  }
   252  
   253  // SetNodeHealthCheck executes an update on the receiver cluster to enable or disable the machine health check capabilities.
   254  // If refresh=true, it retrieves the latest state of the cluster from VCD before updating.
   255  func (cluster *CseKubernetesCluster) SetNodeHealthCheck(healthCheckEnabled bool, refresh bool) error {
   256  	return cluster.Update(CseClusterUpdateInput{
   257  		NodeHealthCheck: &healthCheckEnabled,
   258  	}, refresh)
   259  }
   260  
   261  // SetAutoRepairOnErrors executes an update on the receiver cluster to change the flag that controls the auto-repair
   262  // capabilities of CSE. If refresh=true, it retrieves the latest state of the cluster from VCD before updating.
   263  // NOTE: This method can only be used in CSE versions < 4.1.1
   264  func (cluster *CseKubernetesCluster) SetAutoRepairOnErrors(autoRepairOnErrors bool, refresh bool) error {
   265  	return cluster.Update(CseClusterUpdateInput{
   266  		AutoRepairOnErrors: &autoRepairOnErrors,
   267  	}, refresh)
   268  }
   269  
   270  // Update executes an update on the receiver CSE Kubernetes Cluster on any of the allowed parameters defined in the input type.
   271  // If refresh=true, it retrieves the latest state of the cluster from VCD before updating.
   272  func (cluster *CseKubernetesCluster) Update(input CseClusterUpdateInput, refresh bool) error {
   273  	if refresh {
   274  		err := cluster.Refresh()
   275  		if err != nil {
   276  			return err
   277  		}
   278  	}
   279  
   280  	if cluster.State == "" {
   281  		return fmt.Errorf("can't update a Kubernetes cluster that does not have any state")
   282  	}
   283  	if cluster.State != "provisioned" {
   284  		return fmt.Errorf("can't update a Kubernetes cluster that is not in 'provisioned' state, as it is in '%s'", cluster.capvcdType.Status.VcdKe.State)
   285  	}
   286  
   287  	if input.AutoRepairOnErrors != nil && *input.AutoRepairOnErrors != cluster.AutoRepairOnErrors {
   288  		// Since CSE 4.1.1, the AutoRepairOnError toggle can't be modified and is turned off
   289  		// automatically by the CSE Server.
   290  
   291  		v411, err := semver.NewVersion("4.1.1")
   292  		if err != nil {
   293  			return err
   294  		}
   295  		if cluster.CseVersion.GreaterThanOrEqual(v411) {
   296  			return fmt.Errorf("the 'Auto Repair on Errors' flag can't be changed after the cluster is created since CSE 4.1.1")
   297  		}
   298  		cluster.capvcdType.Spec.VcdKe.AutoRepairOnErrors = *input.AutoRepairOnErrors
   299  	}
   300  
   301  	updatedCapiYaml, err := cluster.updateCapiYaml(input)
   302  	if err != nil {
   303  		return err
   304  	}
   305  	cluster.capvcdType.Spec.CapiYaml = updatedCapiYaml
   306  
   307  	marshaledPayload, err := json.Marshal(cluster.capvcdType)
   308  	if err != nil {
   309  		return err
   310  	}
   311  	entityContent := map[string]interface{}{}
   312  	err = json.Unmarshal(marshaledPayload, &entityContent)
   313  	if err != nil {
   314  		return err
   315  	}
   316  
   317  	// We do this loop to increase the chances that the Kubernetes cluster is successfully updated, as the update operation
   318  	// can clash with the CSE Server updates on the same RDE. If the CSE Server does an update just before we do, the ETag
   319  	// verification will fail, so we must retry.
   320  	retries := 0
   321  	maxRetries := 5
   322  	updated := false
   323  	for retries <= maxRetries {
   324  		rde, err := getRdeById(cluster.client, cluster.ID)
   325  		if err != nil {
   326  			return err
   327  		}
   328  
   329  		rde.DefinedEntity.Entity = entityContent
   330  		err = rde.Update(*rde.DefinedEntity)
   331  		if err == nil {
   332  			updated = true
   333  			break
   334  		}
   335  		if err != nil {
   336  			// If it's an ETag error, we just retry without waiting
   337  			if !strings.Contains(strings.ToLower(err.Error()), "etag") {
   338  				return err
   339  			}
   340  		}
   341  		retries++
   342  		util.Logger.Printf("[DEBUG] The request to update the Kubernetes cluster '%s' failed due to a ETag lock. Trying again", cluster.ID)
   343  	}
   344  
   345  	if !updated {
   346  		return fmt.Errorf("could not update the Kubernetes cluster '%s' after %d retries, due to an ETag lock blocking the operations", cluster.ID, maxRetries)
   347  	}
   348  
   349  	return cluster.Refresh()
   350  }
   351  
   352  // Delete deletes a CSE Kubernetes cluster, waiting the specified amount of time. If the timeout is reached, this method
   353  // returns an error, even if the cluster is already marked for deletion.
   354  func (cluster *CseKubernetesCluster) Delete(timeout time.Duration) error {
   355  	var elapsed time.Duration
   356  	start := time.Now()
   357  	markForDelete := false
   358  	forceDelete := false
   359  	for elapsed <= timeout || timeout == 0 { // If the user specifies timeout=0, we wait forever
   360  		rde, err := getRdeById(cluster.client, cluster.ID)
   361  		if err != nil {
   362  			if ContainsNotFound(err) {
   363  				return nil // The RDE is gone, so the process is completed and there's nothing more to do
   364  			}
   365  			return fmt.Errorf("could not retrieve the Kubernetes cluster with ID '%s': %s", cluster.ID, err)
   366  		}
   367  
   368  		markForDelete = traverseMapAndGet[bool](rde.DefinedEntity.Entity, "spec.vcdKe.markForDelete")
   369  		forceDelete = traverseMapAndGet[bool](rde.DefinedEntity.Entity, "spec.vcdKe.forceDelete")
   370  
   371  		if !markForDelete || !forceDelete {
   372  			// Mark the cluster for deletion
   373  			rde.DefinedEntity.Entity["spec"].(map[string]interface{})["vcdKe"].(map[string]interface{})["markForDelete"] = true
   374  			rde.DefinedEntity.Entity["spec"].(map[string]interface{})["vcdKe"].(map[string]interface{})["forceDelete"] = true
   375  			err = rde.Update(*rde.DefinedEntity)
   376  			if err != nil {
   377  				// We ignore any ETag error. This just means a clash with the CSE Server, we just try again
   378  				if !strings.Contains(strings.ToLower(err.Error()), "etag") {
   379  					return fmt.Errorf("could not mark the Kubernetes cluster with ID '%s' to be deleted: %s", cluster.ID, err)
   380  				}
   381  			}
   382  		}
   383  
   384  		util.Logger.Printf("[DEBUG] Cluster '%s' is still not deleted, will check again in 10 seconds", cluster.ID)
   385  		time.Sleep(10 * time.Second)
   386  		elapsed = time.Since(start)
   387  	}
   388  
   389  	// We give a hint to the user about the deletion process result
   390  	if markForDelete && forceDelete {
   391  		return fmt.Errorf("timeout of %s reached, the cluster was successfully marked for deletion but was not removed in time", timeout)
   392  	}
   393  	return fmt.Errorf("timeout of %s reached, the cluster was not marked for deletion, please try again", timeout)
   394  }