github.com/vmware/go-vcloud-director/v2@v2.24.0/govcd/cse.go (about) 1 package govcd 2 3 import ( 4 "encoding/json" 5 "fmt" 6 semver "github.com/hashicorp/go-version" 7 "github.com/vmware/go-vcloud-director/v2/types/v56" 8 "github.com/vmware/go-vcloud-director/v2/util" 9 "strings" 10 "time" 11 ) 12 13 // CseCreateKubernetesCluster creates a Kubernetes cluster with the data given as input (CseClusterSettings). If the given 14 // timeout is 0, it waits forever for the cluster creation. 15 // 16 // If the timeout is reached and the cluster is not available (in "provisioned" state), it will return a non-nil CseKubernetesCluster 17 // with only the cluster ID and an error. This means that the cluster will be left in VCD in any state, and it can be retrieved afterward 18 // with Org.CseGetKubernetesClusterById and the returned ID. 19 // 20 // If the cluster is created correctly, returns all the available data in CseKubernetesCluster or an error if some of the fields 21 // of the created cluster cannot be calculated or retrieved. 22 func (org *Org) CseCreateKubernetesCluster(clusterData CseClusterSettings, timeout time.Duration) (*CseKubernetesCluster, error) { 23 clusterId, err := org.CseCreateKubernetesClusterAsync(clusterData) 24 if err != nil { 25 return nil, err 26 } 27 28 err = waitUntilClusterIsProvisioned(org.client, clusterId, timeout) 29 if err != nil { 30 return &CseKubernetesCluster{ 31 client: org.client, 32 ID: clusterId, 33 }, err 34 } 35 36 return getCseKubernetesClusterById(org.client, clusterId) 37 } 38 39 // CseCreateKubernetesClusterAsync creates a Kubernetes cluster with the data given as input (CseClusterSettings), but does not 40 // wait for the creation process to finish, so it doesn't monitor for any errors during the process. It returns just the ID of 41 // the created cluster. One can manually check the status of the cluster with VCDClient.CseGetKubernetesClusterById and the result of this method. 42 func (org *Org) CseCreateKubernetesClusterAsync(clusterSettings CseClusterSettings) (string, error) { 43 if org == nil { 44 return "", fmt.Errorf("CseCreateKubernetesClusterAsync cannot be called on a nil Organization receiver") 45 } 46 47 tenantContext, err := org.getTenantContext() 48 if err != nil { 49 return "", fmt.Errorf("error creating the CSE Kubernetes cluster: %s", err) 50 } 51 52 cseSubcomponents, err := getCseComponentsVersions(clusterSettings.CseVersion) 53 if err != nil { 54 return "", err 55 } 56 57 internalSettings, err := clusterSettings.toCseClusterSettingsInternal(*org) 58 if err != nil { 59 return "", fmt.Errorf("error creating the CSE Kubernetes cluster: %s", err) 60 } 61 62 payload, err := internalSettings.getUnmarshalledRdePayload() 63 if err != nil { 64 return "", err 65 } 66 67 rde, err := createRdeAndGetFromTask(org.client, cseKubernetesClusterVendor, cseKubernetesClusterNamespace, cseSubcomponents.CapvcdRdeTypeVersion, 68 types.DefinedEntity{ 69 EntityType: internalSettings.RdeType.ID, 70 Name: internalSettings.Name, 71 Entity: payload, 72 }, tenantContext) 73 if err != nil { 74 return "", fmt.Errorf("error creating the CSE Kubernetes cluster: %s", err) 75 } 76 77 return rde.DefinedEntity.ID, nil 78 } 79 80 // CseGetKubernetesClusterById retrieves a CSE Kubernetes cluster from VCD by its unique ID 81 func (vcdClient *VCDClient) CseGetKubernetesClusterById(id string) (*CseKubernetesCluster, error) { 82 return getCseKubernetesClusterById(&vcdClient.Client, id) 83 } 84 85 // CseGetKubernetesClustersByName retrieves all the CSE Kubernetes clusters from VCD with the given name that belong to the receiver Organization. 86 // Note: The clusters retrieved won't have a valid ETag to perform operations on them. Use VCDClient.CseGetKubernetesClusterById for that instead. 87 func (org *Org) CseGetKubernetesClustersByName(cseVersion semver.Version, name string) ([]*CseKubernetesCluster, error) { 88 cseSubcomponents, err := getCseComponentsVersions(cseVersion) 89 if err != nil { 90 return nil, err 91 } 92 93 rdes, err := getRdesByName(org.client, cseKubernetesClusterVendor, cseKubernetesClusterNamespace, cseSubcomponents.CapvcdRdeTypeVersion, name) 94 if err != nil { 95 return nil, err 96 } 97 var clusters []*CseKubernetesCluster 98 for _, rde := range rdes { 99 if rde.DefinedEntity.Org != nil && rde.DefinedEntity.Org.ID == org.Org.ID { 100 cluster, err := cseConvertToCseKubernetesClusterType(rde) 101 if err != nil { 102 return nil, err 103 } 104 clusters = append(clusters, cluster) 105 } 106 } 107 return clusters, nil 108 } 109 110 // getCseKubernetesClusterById retrieves a CSE Kubernetes cluster from VCD by its unique ID 111 func getCseKubernetesClusterById(client *Client, clusterId string) (*CseKubernetesCluster, error) { 112 rde, err := getRdeById(client, clusterId) 113 if err != nil { 114 return nil, err 115 } 116 return cseConvertToCseKubernetesClusterType(rde) 117 } 118 119 // Refresh gets the latest information about the receiver CSE Kubernetes cluster and updates its properties. 120 // All cached fields such as the supported OVAs list (from CseKubernetesCluster.GetSupportedUpgrades) are also cleared. 121 func (cluster *CseKubernetesCluster) Refresh() error { 122 refreshed, err := getCseKubernetesClusterById(cluster.client, cluster.ID) 123 if err != nil { 124 return fmt.Errorf("failed refreshing the CSE Kubernetes Cluster: %s", err) 125 } 126 *cluster = *refreshed 127 return nil 128 } 129 130 // GetKubeconfig retrieves the Kubeconfig from an existing CSE Kubernetes cluster that is in provisioned state. 131 // If refresh=true, it retrieves the latest state of the cluster from VCD before requesting the Kubeconfig. 132 func (cluster *CseKubernetesCluster) GetKubeconfig(refresh bool) (string, error) { 133 if refresh { 134 err := cluster.Refresh() 135 if err != nil { 136 return "", err 137 } 138 } 139 140 if cluster.State == "" { 141 return "", fmt.Errorf("cannot get a Kubeconfig of a Kubernetes cluster that does not have a state (expected 'provisioned')") 142 } 143 144 if cluster.State != "provisioned" { 145 return "", fmt.Errorf("cannot get a Kubeconfig of a Kubernetes cluster that is not in 'provisioned' state. It is '%s'", cluster.State) 146 } 147 148 rde, err := getRdeById(cluster.client, cluster.ID) 149 if err != nil { 150 return "", err 151 } 152 versions, err := getCseComponentsVersions(cluster.CseVersion) 153 if err != nil { 154 return "", err 155 } 156 157 // Auxiliary wrapper of the result, as the invocation returns the RDE and 158 // what we need is inside of it. 159 type invocationResult struct { 160 Capvcd types.Capvcd `json:"entity,omitempty"` 161 } 162 result := invocationResult{} 163 164 err = rde.InvokeBehaviorAndMarshal(fmt.Sprintf("urn:vcloud:behavior-interface:getFullEntity:cse:capvcd:%s", versions.CseInterfaceVersion), types.BehaviorInvocation{}, &result) 165 if err != nil { 166 return "", fmt.Errorf("could not retrieve the Kubeconfig, the Behavior invocation failed: %s", err) 167 } 168 if result.Capvcd.Status.Capvcd.Private == nil { 169 return "", fmt.Errorf("could not retrieve the Kubeconfig, the Behavior invocation succeeded but the Kubeconfig is nil") 170 } 171 if result.Capvcd.Status.Capvcd.Private.KubeConfig == "" { 172 return "", fmt.Errorf("could not retrieve the Kubeconfig, the Behavior invocation succeeded but the Kubeconfig is empty") 173 } 174 return result.Capvcd.Status.Capvcd.Private.KubeConfig, nil 175 } 176 177 // UpdateWorkerPools executes an update on the receiver cluster to change the existing Worker Pools. 178 // The input is a map where the key is the Worker pool unique name, and the value is the update payload for that Worker Pool. 179 // If refresh=true, it retrieves the latest state of the cluster from VCD before updating. 180 // WARNING: At least one worker pool must have one or more nodes running, otherwise the cluster will be left in an unusable state. 181 func (cluster *CseKubernetesCluster) UpdateWorkerPools(input map[string]CseWorkerPoolUpdateInput, refresh bool) error { 182 return cluster.Update(CseClusterUpdateInput{ 183 WorkerPools: &input, 184 }, refresh) 185 } 186 187 // AddWorkerPools executes an update on the receiver cluster to add new Worker Pools. 188 // If refresh=true, it retrieves the latest state of the cluster from VCD before updating. 189 func (cluster *CseKubernetesCluster) AddWorkerPools(input []CseWorkerPoolSettings, refresh bool) error { 190 return cluster.Update(CseClusterUpdateInput{ 191 NewWorkerPools: &input, 192 }, refresh) 193 } 194 195 // UpdateControlPlane executes an update on the receiver cluster to change the existing control plane. 196 // If refresh=true, it retrieves the latest state of the cluster from VCD before updating. 197 func (cluster *CseKubernetesCluster) UpdateControlPlane(input CseControlPlaneUpdateInput, refresh bool) error { 198 return cluster.Update(CseClusterUpdateInput{ 199 ControlPlane: &input, 200 }, refresh) 201 } 202 203 // GetSupportedUpgrades queries all vApp Templates from VCD, one by one, and returns those that can be used for upgrading the cluster. 204 // As retrieving all OVAs one by one from VCD is expensive, the first time this method is called the returned OVAs are 205 // cached to avoid querying VCD again multiple times. 206 // If refreshOvas=true, this cache is cleared out and this method will query VCD for every vApp Template again. 207 // Therefore, the refreshOvas flag should be set to true only when VCD has new OVAs that need to be considered or after a cluster upgrade. 208 // NOTE: Any refresh operation from other methods will cause the cache to be cleared. 209 func (cluster *CseKubernetesCluster) GetSupportedUpgrades(refreshOvas bool) ([]*types.VAppTemplate, error) { 210 if refreshOvas { 211 cluster.supportedUpgrades = make([]*types.VAppTemplate, 0) 212 } 213 if cluster.State != "provisioned" { 214 cluster.supportedUpgrades = make([]*types.VAppTemplate, 0) 215 return cluster.supportedUpgrades, nil 216 } 217 if len(cluster.supportedUpgrades) > 0 { 218 return cluster.supportedUpgrades, nil 219 } 220 221 vAppTemplates, err := queryVappTemplateListWithFilter(cluster.client, nil) 222 if err != nil { 223 return nil, fmt.Errorf("could not get vApp Templates: %s", err) 224 } 225 for _, template := range vAppTemplates { 226 // We can only know if the vApp Template is a TKGm OVA by inspecting its internals, hence we need to retrieve every one 227 // of them one by one. This is an expensive operation, hence the cache. 228 vAppTemplate, err := getVAppTemplateById(cluster.client, fmt.Sprintf("urn:vcloud:vapptemplate:%s", extractUuid(template.HREF))) 229 if err != nil { 230 continue // This means we cannot retrieve it (maybe due to some rights missing), so we cannot use it. We skip it 231 } 232 targetVersions, err := getTkgVersionBundleFromVAppTemplate(vAppTemplate.VAppTemplate) 233 if err != nil { 234 continue // This means it's not a TKGm OVA, or it is not supported, so we skip it 235 } 236 // The OVA can be used if the TKG version is equal to the actual or higher, and the Kubernetes version is at most 1 minor higher. 237 if targetVersions.compareTkgVersion(cluster.TkgVersion.String()) >= 0 && targetVersions.kubernetesVersionIsUpgradeableFrom(cluster.KubernetesVersion.String()) { 238 cluster.supportedUpgrades = append(cluster.supportedUpgrades, vAppTemplate.VAppTemplate) 239 } 240 } 241 return cluster.supportedUpgrades, nil 242 } 243 244 // UpgradeCluster executes an update on the receiver cluster to upgrade the Kubernetes template of the cluster. 245 // If the cluster is not upgradeable or the OVA is incorrect, this method will return an error. 246 // If refresh=true, it retrieves the latest state of the cluster from VCD before updating. 247 func (cluster *CseKubernetesCluster) UpgradeCluster(kubernetesTemplateOvaId string, refresh bool) error { 248 return cluster.Update(CseClusterUpdateInput{ 249 KubernetesTemplateOvaId: &kubernetesTemplateOvaId, 250 }, refresh) 251 } 252 253 // SetNodeHealthCheck executes an update on the receiver cluster to enable or disable the machine health check capabilities. 254 // If refresh=true, it retrieves the latest state of the cluster from VCD before updating. 255 func (cluster *CseKubernetesCluster) SetNodeHealthCheck(healthCheckEnabled bool, refresh bool) error { 256 return cluster.Update(CseClusterUpdateInput{ 257 NodeHealthCheck: &healthCheckEnabled, 258 }, refresh) 259 } 260 261 // SetAutoRepairOnErrors executes an update on the receiver cluster to change the flag that controls the auto-repair 262 // capabilities of CSE. If refresh=true, it retrieves the latest state of the cluster from VCD before updating. 263 // NOTE: This method can only be used in CSE versions < 4.1.1 264 func (cluster *CseKubernetesCluster) SetAutoRepairOnErrors(autoRepairOnErrors bool, refresh bool) error { 265 return cluster.Update(CseClusterUpdateInput{ 266 AutoRepairOnErrors: &autoRepairOnErrors, 267 }, refresh) 268 } 269 270 // Update executes an update on the receiver CSE Kubernetes Cluster on any of the allowed parameters defined in the input type. 271 // If refresh=true, it retrieves the latest state of the cluster from VCD before updating. 272 func (cluster *CseKubernetesCluster) Update(input CseClusterUpdateInput, refresh bool) error { 273 if refresh { 274 err := cluster.Refresh() 275 if err != nil { 276 return err 277 } 278 } 279 280 if cluster.State == "" { 281 return fmt.Errorf("can't update a Kubernetes cluster that does not have any state") 282 } 283 if cluster.State != "provisioned" { 284 return fmt.Errorf("can't update a Kubernetes cluster that is not in 'provisioned' state, as it is in '%s'", cluster.capvcdType.Status.VcdKe.State) 285 } 286 287 if input.AutoRepairOnErrors != nil && *input.AutoRepairOnErrors != cluster.AutoRepairOnErrors { 288 // Since CSE 4.1.1, the AutoRepairOnError toggle can't be modified and is turned off 289 // automatically by the CSE Server. 290 291 v411, err := semver.NewVersion("4.1.1") 292 if err != nil { 293 return err 294 } 295 if cluster.CseVersion.GreaterThanOrEqual(v411) { 296 return fmt.Errorf("the 'Auto Repair on Errors' flag can't be changed after the cluster is created since CSE 4.1.1") 297 } 298 cluster.capvcdType.Spec.VcdKe.AutoRepairOnErrors = *input.AutoRepairOnErrors 299 } 300 301 updatedCapiYaml, err := cluster.updateCapiYaml(input) 302 if err != nil { 303 return err 304 } 305 cluster.capvcdType.Spec.CapiYaml = updatedCapiYaml 306 307 marshaledPayload, err := json.Marshal(cluster.capvcdType) 308 if err != nil { 309 return err 310 } 311 entityContent := map[string]interface{}{} 312 err = json.Unmarshal(marshaledPayload, &entityContent) 313 if err != nil { 314 return err 315 } 316 317 // We do this loop to increase the chances that the Kubernetes cluster is successfully updated, as the update operation 318 // can clash with the CSE Server updates on the same RDE. If the CSE Server does an update just before we do, the ETag 319 // verification will fail, so we must retry. 320 retries := 0 321 maxRetries := 5 322 updated := false 323 for retries <= maxRetries { 324 rde, err := getRdeById(cluster.client, cluster.ID) 325 if err != nil { 326 return err 327 } 328 329 rde.DefinedEntity.Entity = entityContent 330 err = rde.Update(*rde.DefinedEntity) 331 if err == nil { 332 updated = true 333 break 334 } 335 if err != nil { 336 // If it's an ETag error, we just retry without waiting 337 if !strings.Contains(strings.ToLower(err.Error()), "etag") { 338 return err 339 } 340 } 341 retries++ 342 util.Logger.Printf("[DEBUG] The request to update the Kubernetes cluster '%s' failed due to a ETag lock. Trying again", cluster.ID) 343 } 344 345 if !updated { 346 return fmt.Errorf("could not update the Kubernetes cluster '%s' after %d retries, due to an ETag lock blocking the operations", cluster.ID, maxRetries) 347 } 348 349 return cluster.Refresh() 350 } 351 352 // Delete deletes a CSE Kubernetes cluster, waiting the specified amount of time. If the timeout is reached, this method 353 // returns an error, even if the cluster is already marked for deletion. 354 func (cluster *CseKubernetesCluster) Delete(timeout time.Duration) error { 355 var elapsed time.Duration 356 start := time.Now() 357 markForDelete := false 358 forceDelete := false 359 for elapsed <= timeout || timeout == 0 { // If the user specifies timeout=0, we wait forever 360 rde, err := getRdeById(cluster.client, cluster.ID) 361 if err != nil { 362 if ContainsNotFound(err) { 363 return nil // The RDE is gone, so the process is completed and there's nothing more to do 364 } 365 return fmt.Errorf("could not retrieve the Kubernetes cluster with ID '%s': %s", cluster.ID, err) 366 } 367 368 markForDelete = traverseMapAndGet[bool](rde.DefinedEntity.Entity, "spec.vcdKe.markForDelete") 369 forceDelete = traverseMapAndGet[bool](rde.DefinedEntity.Entity, "spec.vcdKe.forceDelete") 370 371 if !markForDelete || !forceDelete { 372 // Mark the cluster for deletion 373 rde.DefinedEntity.Entity["spec"].(map[string]interface{})["vcdKe"].(map[string]interface{})["markForDelete"] = true 374 rde.DefinedEntity.Entity["spec"].(map[string]interface{})["vcdKe"].(map[string]interface{})["forceDelete"] = true 375 err = rde.Update(*rde.DefinedEntity) 376 if err != nil { 377 // We ignore any ETag error. This just means a clash with the CSE Server, we just try again 378 if !strings.Contains(strings.ToLower(err.Error()), "etag") { 379 return fmt.Errorf("could not mark the Kubernetes cluster with ID '%s' to be deleted: %s", cluster.ID, err) 380 } 381 } 382 } 383 384 util.Logger.Printf("[DEBUG] Cluster '%s' is still not deleted, will check again in 10 seconds", cluster.ID) 385 time.Sleep(10 * time.Second) 386 elapsed = time.Since(start) 387 } 388 389 // We give a hint to the user about the deletion process result 390 if markForDelete && forceDelete { 391 return fmt.Errorf("timeout of %s reached, the cluster was successfully marked for deletion but was not removed in time", timeout) 392 } 393 return fmt.Errorf("timeout of %s reached, the cluster was not marked for deletion, please try again", timeout) 394 }