github.com/caos/orbos@v1.5.14-0.20221103111702-e6cd0cea7ad4/internal/operator/orbiter/kinds/providers/gce/computesservice.go

package gce

import (
	"errors"
	"fmt"
	"strings"
	"sync"
	"time"

	uuid "github.com/satori/go.uuid"
	"google.golang.org/api/compute/v1"

	"github.com/caos/orbos/internal/operator/orbiter/kinds/clusters/core/infra"
	"github.com/caos/orbos/internal/operator/orbiter/kinds/providers/core"
	"github.com/caos/orbos/internal/operator/orbiter/kinds/providers/ssh"
	"github.com/caos/orbos/mntr"
)

var _ core.MachinesService = (*machinesService)(nil)

type creatingInstance struct {
	zone string
	id   string
}

type machinesService struct {
	context *context
	oneoff  bool
	key     *SSHKey
	cache   struct {
		instances         map[string][]*instance
		creatingInstances map[string][]*creatingInstance
		sync.Mutex
	}
	onCreate func(pool string, machine infra.Machine) error
}

func newMachinesService(context *context, oneoff bool) *machinesService {
	return &machinesService{
		context: context,
		oneoff:  oneoff,
	}
}

func (m *machinesService) DesiredMachines(poolName string, instances int) int {
	desired, ok := m.context.desired.Pools[poolName]
	if !ok {
		return 0
	}

	if len(desired.Multizonal) > 0 {
		return len(desired.Multizonal) * instances
	}
	return instances
}

func (m *machinesService) use(key *SSHKey) error {
	if key == nil || key.Private == nil || key.Public == nil || key.Private.Value == "" || key.Public.Value == "" {
		return mntr.ToUserError(errors.New("machines are not connectable. have you configured the orb by running orbctl configure?"))
	}
	m.key = key
	return nil
}

func (m *machinesService) restartPreemptibleMachines() error {
	pools, err := getAllInstances(m)
	if err != nil {
		return err
	}

	for _, pool := range pools {
		for _, instance := range pool {
			if instance.start {
				if err := operateFunc(
					func() { instance.Monitor.Debug("Restarting preemptible instance") },
					computeOpCall(m.context.client.Instances.Start(m.context.projectID, instance.zone, instance.ID()).RequestId(uuid.NewV1().String()).Do),
					func() error { instance.Monitor.Info("Preemptible instance restarted"); return nil },
				)(); err != nil {
					return err
				}
			}
		}
	}
	return nil
}

func getDesiredZones(defaultZone string, multizonal []string) []string {
	if len(multizonal) > 0 {
		return multizonal
	}
	return []string{defaultZone}
}

func (m *machinesService) Create(poolName string, desiredInstances int) (infra.Machines, error) {
	desired, ok := m.context.desired.Pools[poolName]
	if !ok {
		return nil, fmt.Errorf("pool %s is not configured", poolName)
	}
	usableZone := m.context.desired.Zone
	zones := getDesiredZones(m.context.desired.Zone, desired.Multizonal)
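	// Illustration (not part of the original source): with MinCPUCores=3 and
	// MinMemoryGB=1, the rules below round the cores up to 4 (an even vCPU count is
	// required above one vCPU) and grow the memory in 256 MB steps from 1024 MB to
	// 3840 MB, so that each vCPU gets at least 922 MB (here 960 MB per vCPU).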
	// Calculate minimum CPU and memory according to the GCE specs:
	// https://cloud.google.com/machine/docs/instances/creating-instance-with-custom-machine-type#specifications
	cores := desired.MinCPUCores
	if cores > 1 && cores%2 != 0 {
		cores++
	}
	memory := float64(desired.MinMemoryGB * 1024)
	memoryPerCore := memory / float64(cores)
	minMemPerCore := 922
	maxMemPerCore := 6656
	for memoryPerCore < float64(minMemPerCore) {
		// grow the total memory in 256 MB steps until the per-core minimum is met
		memory += 256
		memoryPerCore = memory / float64(cores)
	}

	for memoryPerCore > float64(maxMemPerCore) {
		// add vCPUs until the per-core maximum is no longer exceeded
		cores++
		memoryPerCore = memory / float64(cores)
	}

	infraMachines := make([]infra.Machine, 0)
	currentInfraMachines := make([]infra.Machine, 0)
	if len(zones) > 1 {
		currentInfraMachinesT, err := m.List(poolName)
		if err != nil {
			return nil, err
		}
		currentInfraMachines = currentInfraMachinesT
		usableZone = ""
		for zoneI := range zones {
			zone := zones[zoneI]
			zoneCovered := 0
			for _, currentInfraMachine := range currentInfraMachines {
				currentGCEMachine, ok := currentInfraMachine.(machine)
				replaceRequired, _, _ := currentInfraMachine.ReplacementRequired()
				if ok && zone == currentGCEMachine.Zone() && !replaceRequired {
					zoneCovered++
				}
			}
			m.cache.Lock()
			for _, currentCreating := range m.cache.creatingInstances[poolName] {
				if currentCreating.zone == zone {
					zoneCovered++
				}
			}
			m.cache.Unlock()
			if zoneCovered >= desiredInstances {
				continue
			}
			// find the first zone that still needs a machine, then leave the loop
			usableZone = zone
			break
		}

		if usableZone == "" {
			return nil, errors.New("error while creating machine: all zones are already covered")
		}
	}

	name := newName()
	m.cache.Lock()
	if m.cache.creatingInstances == nil {
		m.cache.creatingInstances = map[string][]*creatingInstance{}
	}
	if m.cache.creatingInstances[poolName] == nil {
		m.cache.creatingInstances[poolName] = make([]*creatingInstance, 0)
	}
	m.cache.creatingInstances[poolName] = append(m.cache.creatingInstances[poolName], &creatingInstance{
		zone: usableZone,
		id:   name,
	})
	m.cache.Unlock()

	monitor := m.context.monitor.WithFields(map[string]interface{}{
		"machine": name,
		"pool":    poolName,
	})
	infraMachine, err := m.getCreatableMachine(
		monitor,
		poolName,
		desired,
		name,
		usableZone,
		cores,
		memory,
	)
	if err != nil {
		return nil, err
	}

	if m.cache.instances != nil {
		m.cache.Lock()
		if _, ok := m.cache.instances[poolName]; !ok {
			m.cache.instances[poolName] = make([]*instance, 0)
		}
		m.cache.instances[poolName] = append(m.cache.instances[poolName], infraMachine)
		m.cache.Unlock()
	}

	if err := m.onCreate(poolName, infraMachine); err != nil {
		return nil, err
	}
	monitor.Info("Machine created")
	infraMachines = append(infraMachines, infraMachine)

	// drop the finished machine from the list of instances that are still being created
	m.cache.Lock()
	for i, instance := range m.cache.creatingInstances[poolName] {
		if instance.id == infraMachine.ID() {
			m.cache.creatingInstances[poolName] = append(m.cache.creatingInstances[poolName][:i], m.cache.creatingInstances[poolName][i+1:]...)
			break
		}
	}
	m.cache.Unlock()
	return infraMachines, nil
}
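// Illustration (not part of the original source): with Multizonal = ["europe-west1-b",
// "europe-west1-c"] and desiredInstances = 2, Create counts the healthy and currently
// creating machines of the pool per zone and places the new machine in the first zone
// that still has fewer than 2, returning an error once both zones are covered.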
func (m *machinesService) getCreatableMachine(monitor mntr.Monitor, poolName string, desired *Pool, name string, zone string, cores int, memory float64) (*instance, error) {
	disks := []*compute.AttachedDisk{{
		Type:       "PERSISTENT",
		AutoDelete: true,
		Boot:       true,
		InitializeParams: &compute.AttachedDiskInitializeParams{
			DiskSizeGb:  int64(desired.StorageGB),
			SourceImage: desired.OSImage,
			DiskType:    fmt.Sprintf("zones/%s/diskTypes/%s", zone, desired.StorageDiskType),
		}},
	}

	diskNames := make([]string, desired.LocalSSDs)
	for i := 0; i < int(desired.LocalSSDs); i++ {
		name := fmt.Sprintf("nvme0n%d", i+1)
		disks = append(disks, &compute.AttachedDisk{
			Type:       "SCRATCH",
			AutoDelete: true,
			Boot:       false,
			Interface:  "NVME",
			InitializeParams: &compute.AttachedDiskInitializeParams{
				DiskType: fmt.Sprintf("zones/%s/diskTypes/local-ssd", zone),
			},
			DeviceName: name,
		})
		diskNames[i] = name
	}

	nwTags := networkTags(m.context.orbID, m.context.providerID, poolName)
	sshKey := fmt.Sprintf("orbiter:%s", m.key.Public.Value)
	createInstance := &compute.Instance{
		Name:        name,
		MachineType: fmt.Sprintf("zones/%s/machineTypes/custom-%d-%d", zone, cores, int(memory)),
		Tags:        &compute.Tags{Items: nwTags},
		NetworkInterfaces: []*compute.NetworkInterface{{
			Network: m.context.networkURL,
		}},
		Labels:     map[string]string{"orb": m.context.orbID, "provider": m.context.providerID, "pool": poolName},
		Disks:      disks,
		Scheduling: &compute.Scheduling{Preemptible: desired.Preemptible},
		Metadata: &compute.Metadata{
			Items: []*compute.MetadataItems{{
				Key:   "ssh-keys",
				Value: &sshKey,
			}},
		},
		ServiceAccounts: []*compute.ServiceAccount{{
			Scopes: []string{"https://www.googleapis.com/auth/compute"},
		}},
	}

	if err := operateFunc(
		func() { monitor.Debug("Creating instance") },
		computeOpCall(m.context.client.Instances.Insert(m.context.projectID, zone, createInstance).RequestId(uuid.NewV1().String()).Do),
		func() error { monitor.Info("Instance created"); return nil },
	)(); err != nil {
		return nil, err
	}
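	// The freshly created instance is read back below to obtain its self link and
	// internal IP, which are needed to build the infra.Machine representation.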
	newInstance, err := m.context.client.Instances.Get(m.context.projectID, zone, createInstance.Name).
		Fields("selfLink,networkInterfaces(networkIP)").
		Do()
	if err != nil {
		return nil, err
	}

	var machine machine
	if m.oneoff {
		machine = newGCEMachine(m.context, monitor, createInstance.Name, zone)
	} else {
		sshMachine := ssh.NewMachine(monitor, "orbiter", newInstance.NetworkInterfaces[0].NetworkIP)
		if err := sshMachine.UseKey([]byte(m.key.Private.Value)); err != nil {
			return nil, err
		}
		machine = sshMachine
	}

	infraMachine := newMachine(
		m.context,
		monitor,
		createInstance.Name,
		newInstance.NetworkInterfaces[0].NetworkIP,
		newInstance.SelfLink,
		poolName,
		zone,
		m.removeMachineFunc(
			poolName,
			createInstance.Name,
			zone,
		),
		false,
		machine,
		false,
		func() {},
		func() {},
		false,
		func() {},
		func() {},
	)

	for idx, name := range diskNames {
		mountPoint := fmt.Sprintf("/mnt/disks/%s", name)
		if err := infra.Try(monitor, time.NewTimer(time.Minute), 10*time.Second, infraMachine, func(m infra.Machine) error {
			_, formatErr := m.Execute(nil,
				fmt.Sprintf("sudo mkfs.ext4 -F /dev/%s && sudo mkdir -p /mnt/disks/%s && sudo mount -o discard,defaults,nobarrier /dev/%s %s && sudo chmod a+w %s && echo UUID=`sudo blkid -s UUID -o value /dev/disk/by-id/google-local-nvme-ssd-%d` %s ext4 discard,defaults,nofail,nobarrier 0 2 | sudo tee -a /etc/fstab", name, name, name, mountPoint, mountPoint, idx, mountPoint),
			)
			return formatErr
		}); err != nil {
			// formatting failed: destroy the half-initialized machine before returning the error
			remove, cleanupErr := infraMachine.Destroy()
			if cleanupErr != nil {
				panic(cleanupErr)
			}
			if rmErr := remove(); rmErr != nil {
				panic(rmErr)
			}
			return nil, err
		}
		monitor.WithField("mountpoint", mountPoint).Info("Disk formatted")
	}

	return infraMachine, nil
}

func (m *machinesService) ListPools() ([]string, error) {
	pools, err := getAllInstances(m)
	if err != nil {
		return nil, err
	}

	var poolNames []string
	for poolName := range pools {
		poolNames = append(poolNames, poolName)
	}
	return poolNames, nil
}

func (m *machinesService) List(poolName string) (infra.Machines, error) {
	pools, err := getAllInstances(m)
	if err != nil {
		return nil, err
	}

	pool := pools[poolName]
	machines := make([]infra.Machine, len(pool))
	for idx, machine := range pool {
		copyInstance := *machine
		machines[idx] = &copyInstance
	}

	return machines, nil
}

func getAllInstances(m *machinesService) (map[string][]*instance, error) {
	m.cache.Lock()
	if m.cache.instances != nil {
		m.cache.Unlock()
		return m.cache.instances, nil
	}
	m.cache.instances = make(map[string][]*instance)
	m.cache.Unlock()

	region, err := m.context.client.Regions.Get(m.context.projectID, m.context.desired.Region).Do()
	if err != nil {
		return nil, err
	}
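	// Instances are listed zone by zone for all zones of the configured region and
	// filtered by the orb and provider labels, so only machines managed by this
	// provider end up in the cache.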
	for zoneURLI := range region.Zones {
		zoneURL := region.Zones[zoneURLI]
		zoneURLParts := strings.Split(zoneURL, "/")
		zone := zoneURLParts[len(zoneURLParts)-1]

		instances, err := m.context.client.Instances.
			List(m.context.projectID, zone).
			Filter(fmt.Sprintf(`labels.orb=%s AND labels.provider=%s`, m.context.orbID, m.context.providerID)).
			Fields("items(name,labels,selfLink,status,scheduling(preemptible),networkInterfaces(networkIP))").
			Do()
		if err != nil {
			return nil, err
		}

		for _, inst := range instances.Items {

			if inst.Labels["orb"] != m.context.orbID || inst.Labels["provider"] != m.context.providerID {
				continue
			}

			pool := inst.Labels["pool"]

			var machine machine
			if m.oneoff {
				machine = newGCEMachine(m.context, m.context.monitor.WithFields(toFields(inst.Labels)), inst.Name, zone)
			} else {
				sshMachine := ssh.NewMachine(m.context.monitor.WithFields(toFields(inst.Labels)), "orbiter", inst.NetworkInterfaces[0].NetworkIP)
				if err := sshMachine.UseKey([]byte(m.key.Private.Value)); err != nil {
					return nil, err
				}
				machine = sshMachine
			}

			rebootRequired := false
			unrequireReboot := func() {}
			for idx, req := range m.context.desired.RebootRequired {
				if req == inst.Name {
					rebootRequired = true
					// capture idx by value so the closure removes the correct entry later
					unrequireReboot = func(pos int) func() {
						return func() {
							copy(m.context.desired.RebootRequired[pos:], m.context.desired.RebootRequired[pos+1:])
							m.context.desired.RebootRequired[len(m.context.desired.RebootRequired)-1] = ""
							m.context.desired.RebootRequired = m.context.desired.RebootRequired[:len(m.context.desired.RebootRequired)-1]
						}
					}(idx)
					break
				}
			}

			replacementRequired := false
			unrequireReplacement := func() {}
			for idx, req := range m.context.desired.ReplacementRequired {
				if req == inst.Name {
					replacementRequired = true
					unrequireReplacement = func(pos int) func() {
						return func() {
							copy(m.context.desired.ReplacementRequired[pos:], m.context.desired.ReplacementRequired[pos+1:])
							m.context.desired.ReplacementRequired[len(m.context.desired.ReplacementRequired)-1] = ""
							m.context.desired.ReplacementRequired = m.context.desired.ReplacementRequired[:len(m.context.desired.ReplacementRequired)-1]
						}
					}(idx)
					break
				}
			}

			mach := newMachine(
				m.context,
				m.context.monitor.WithField("name", inst.Name).WithFields(toFields(inst.Labels)),
				inst.Name,
				inst.NetworkInterfaces[0].NetworkIP,
				inst.SelfLink,
				pool,
				zone,
				m.removeMachineFunc(pool, inst.Name, zone),
				inst.Status == "TERMINATED" && inst.Scheduling.Preemptible,
				machine,
				rebootRequired,
				func(id string) func() {
					return func() { m.context.desired.RebootRequired = append(m.context.desired.RebootRequired, id) }
				}(inst.Name),
				unrequireReboot,
				replacementRequired,
				func(id string) func() {
					return func() { m.context.desired.ReplacementRequired = append(m.context.desired.ReplacementRequired, id) }
				}(inst.Name),
				unrequireReplacement,
			)
			m.cache.Lock()
			m.cache.instances[pool] = append(m.cache.instances[pool], mach)
			m.cache.Unlock()
		}
	}

	return m.cache.instances, nil
}

func toFields(labels map[string]string) map[string]interface{} {
	fields := make(map[string]interface{})
	for key, label := range labels {
		fields[key] = label
	}
	return fields
}
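// Note (not part of the original source): the closure returned by removeMachineFunc
// below first drops the machine from the in-memory instance cache and only then issues
// the GCE delete call, presumably so that subsequent List calls no longer report a
// machine that is already being removed.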
func (m *machinesService) removeMachineFunc(pool, id, zone string) func() error {
	return func() error {
		m.cache.Lock()
		cleanMachines := make([]*instance, 0)
		for _, cachedMachine := range m.cache.instances[pool] {
			if cachedMachine.X_ID != id {
				cleanMachines = append(cleanMachines, cachedMachine)
			}
		}
		m.cache.instances[pool] = cleanMachines
		m.cache.Unlock()

		return removeResourceFunc(
			m.context.monitor.WithFields(map[string]interface{}{
				"pool":    pool,
				"machine": id,
			}),
			"instance",
			id,
			m.context.client.Instances.Delete(m.context.projectID, zone, id).RequestId(uuid.NewV1().String()).Do,
		)()
	}
}

func networkTags(orbID, providerID string, poolName ...string) []string {
	tags := []string{
		orbNetworkTag(orbID),
		fmt.Sprintf("provider-%s", providerID),
	}
	for _, pool := range poolName {
		tags = append(tags, fmt.Sprintf("pool-%s", pool))
	}
	return tags
}

func orbNetworkTag(orbID string) string {
	return fmt.Sprintf("orb-%s", orbID)
}
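// Illustration (not part of the original source): for orbID "my-orb", providerID
// "gce-prod" and pool name "workers", networkTags returns
// []string{"orb-my-orb", "provider-gce-prod", "pool-workers"}, and
// orbNetworkTag("my-orb") returns "orb-my-orb".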