github.com/caos/orbos@v1.5.14-0.20221103111702-e6cd0cea7ad4/internal/operator/orbiter/kinds/providers/gce/computesservice.go

package gce

import (
	"errors"
	"fmt"
	"strings"
	"sync"
	"time"

	uuid "github.com/satori/go.uuid"
	"google.golang.org/api/compute/v1"

	"github.com/caos/orbos/internal/operator/orbiter/kinds/clusters/core/infra"
	"github.com/caos/orbos/internal/operator/orbiter/kinds/providers/core"
	"github.com/caos/orbos/internal/operator/orbiter/kinds/providers/ssh"
	"github.com/caos/orbos/mntr"
)

var _ core.MachinesService = (*machinesService)(nil)

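// creatingInstance tracks a machine that is still being created, so that
// concurrent Create calls can take it into account when distributing
// machines across the desired zones.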
type creatingInstance struct {
	zone string
	id   string
}

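// machinesService implements core.MachinesService on top of the Google
// Compute Engine API. It caches listed instances and instances that are
// currently being created per pool, guarded by the embedded mutex.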
type machinesService struct {
	context *context
	oneoff  bool
	key     *SSHKey
	cache   struct {
		instances         map[string][]*instance
		creatingInstances map[string][]*creatingInstance
		sync.Mutex
	}
	onCreate func(pool string, machine infra.Machine) error
}

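// newMachinesService returns a machinesService without an SSH key and without
// an onCreate hook. The key is set later via use, and onCreate must be set
// elsewhere before Create is called, since Create invokes it unconditionally.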
func newMachinesService(context *context, oneoff bool) *machinesService {
	return &machinesService{
		context: context,
		oneoff:  oneoff,
	}
}

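// DesiredMachines returns the total number of machines the pool should have.
// For multizonal pools the given count applies per zone and is therefore
// multiplied by the number of configured zones. Unknown pools yield 0.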
func (m *machinesService) DesiredMachines(poolName string, instances int) int {
	desired, ok := m.context.desired.Pools[poolName]
	if !ok {
		return 0
	}

	if len(desired.Multizonal) > 0 {
		return len(desired.Multizonal) * instances
	}
	return instances
}

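// use sets the SSH key pair used to connect to the machines and returns a
// user error if the key pair is missing or incomplete.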
func (m *machinesService) use(key *SSHKey) error {
	if key == nil || key.Private == nil || key.Public == nil || key.Private.Value == "" || key.Public.Value == "" {
		return mntr.ToUserError(errors.New("machines are not connectable. have you configured the orb by running orbctl configure?"))
	}
	m.key = key
	return nil
}

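// restartPreemptibleMachines starts all instances that are flagged for a
// restart again, i.e. preemptible instances that GCE has terminated.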
func (m *machinesService) restartPreemptibleMachines() error {
	pools, err := getAllInstances(m)
	if err != nil {
		return err
	}

	for _, pool := range pools {
		for _, instance := range pool {
			if instance.start {
				if err := operateFunc(
					func() { instance.Monitor.Debug("Restarting preemptible instance") },
					computeOpCall(m.context.client.Instances.Start(m.context.projectID, instance.zone, instance.ID()).RequestId(uuid.NewV1().String()).Do),
					func() error { instance.Monitor.Info("Preemptible instance restarted"); return nil },
				)(); err != nil {
					return err
				}
			}
		}
	}
	return nil
}

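// getDesiredZones returns the configured multizonal zones, or the default
// zone if no multizonal zones are configured.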
func getDesiredZones(defaultZone string, multizonal []string) []string {
	if len(multizonal) > 0 {
		return multizonal
	}
	return []string{defaultZone}
}

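// Create provisions one new machine in the given pool. For multizonal pools
// it picks the first zone that has fewer than desiredInstances machines,
// counting machines that are still being created.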
func (m *machinesService) Create(poolName string, desiredInstances int) (infra.Machines, error) {
	desired, ok := m.context.desired.Pools[poolName]
	if !ok {
		return nil, fmt.Errorf("pool %s is not configured", poolName)
	}
	usableZone := m.context.desired.Zone
	zones := getDesiredZones(m.context.desired.Zone, desired.Multizonal)

	// Calculate the minimum CPU and memory according to the GCE specs for
	// custom machine types:
	// https://cloud.google.com/compute/docs/instances/creating-instance-with-custom-machine-type#specifications
	cores := desired.MinCPUCores
	if cores > 1 && cores%2 != 0 {
		// custom machine types require one vCPU or an even number of vCPUs
		cores++
	}
	memory := float64(desired.MinMemoryGB * 1024)
	memoryPerCore := memory / float64(cores)
	minMemPerCore := 922
	maxMemPerCore := 6656
	for memoryPerCore < float64(minMemPerCore) {
		memory += 256
		memoryPerCore = memory / float64(cores)
	}

	for memoryPerCore > float64(maxMemPerCore) {
		cores++
		if cores > 1 && cores%2 != 0 {
			// keep the vCPU count valid for custom machine types
			cores++
		}
		memoryPerCore = memory / float64(cores)
	}

	infraMachines := make([]infra.Machine, 0)
	currentInfraMachines := make([]infra.Machine, 0)
	if len(zones) > 1 {
		currentInfraMachinesT, err := m.List(poolName)
		if err != nil {
			return nil, err
		}
		currentInfraMachines = currentInfraMachinesT
		usableZone = ""
		for zoneI := range zones {
			zone := zones[zoneI]
			zoneCovered := 0
			for _, currentInfraMachine := range currentInfraMachines {
				currentGCEMachine, ok := currentInfraMachine.(machine)
				replaceRequired, _, _ := currentInfraMachine.ReplacementRequired()
				if ok && zone == currentGCEMachine.Zone() && !replaceRequired {
					zoneCovered++
				}
			}
			m.cache.Lock()
			for _, currentCreating := range m.cache.creatingInstances[poolName] {
				if currentCreating.zone == zone {
					zoneCovered++
				}
			}
			m.cache.Unlock()
			if zoneCovered >= desiredInstances {
				continue
			}
			// use the first zone that still needs machines, then leave the loop
			usableZone = zone
			break
		}

		if usableZone == "" {
			return nil, errors.New("cannot create machine: all desired zones already have the desired number of instances")
		}
	}

	name := newName()
	m.cache.Lock()
	if m.cache.creatingInstances == nil {
		m.cache.creatingInstances = map[string][]*creatingInstance{}
	}
	if m.cache.creatingInstances[poolName] == nil {
		m.cache.creatingInstances[poolName] = make([]*creatingInstance, 0)
	}
	m.cache.creatingInstances[poolName] = append(m.cache.creatingInstances[poolName], &creatingInstance{
		zone: usableZone,
		id:   name,
	})
	m.cache.Unlock()

	monitor := m.context.monitor.WithFields(map[string]interface{}{
		"machine": name,
		"pool":    poolName,
	})
	infraMachine, err := m.getCreatableMachine(
		monitor,
		poolName,
		desired,
		name,
		usableZone,
		cores,
		memory,
	)
	if err != nil {
		return nil, err
	}

	m.cache.Lock()
	if m.cache.instances != nil {
		if _, ok := m.cache.instances[poolName]; !ok {
			m.cache.instances[poolName] = make([]*instance, 0)
		}
		m.cache.instances[poolName] = append(m.cache.instances[poolName], infraMachine)
	}
	m.cache.Unlock()

	if err := m.onCreate(poolName, infraMachine); err != nil {
		return nil, err
	}
	monitor.Info("Machine created")
	infraMachines = append(infraMachines, infraMachine)
	m.cache.Lock()
	for i, creating := range m.cache.creatingInstances[poolName] {
		if creating.id == infraMachine.ID() {
			m.cache.creatingInstances[poolName] = append(m.cache.creatingInstances[poolName][:i], m.cache.creatingInstances[poolName][i+1:]...)
			break
		}
	}
	m.cache.Unlock()
	return infraMachines, nil
}

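// getCreatableMachine inserts the GCE instance, wraps it as an infra.Machine
// (connecting over SSH unless running in oneoff mode) and formats and mounts
// any requested local SSDs before returning it.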
func (m *machinesService) getCreatableMachine(monitor mntr.Monitor, poolName string, desired *Pool, name string, zone string, cores int, memory float64) (*instance, error) {
	disks := []*compute.AttachedDisk{{
		Type:       "PERSISTENT",
		AutoDelete: true,
		Boot:       true,
		InitializeParams: &compute.AttachedDiskInitializeParams{
			DiskSizeGb:  int64(desired.StorageGB),
			SourceImage: desired.OSImage,
			DiskType:    fmt.Sprintf("zones/%s/diskTypes/%s", zone, desired.StorageDiskType),
		}},
	}

	diskNames := make([]string, desired.LocalSSDs)
	for i := 0; i < int(desired.LocalSSDs); i++ {
		diskName := fmt.Sprintf("nvme0n%d", i+1)
		disks = append(disks, &compute.AttachedDisk{
			Type:       "SCRATCH",
			AutoDelete: true,
			Boot:       false,
			Interface:  "NVME",
			InitializeParams: &compute.AttachedDiskInitializeParams{
				DiskType: fmt.Sprintf("zones/%s/diskTypes/local-ssd", zone),
			},
			DeviceName: diskName,
		})
		diskNames[i] = diskName
	}

	nwTags := networkTags(m.context.orbID, m.context.providerID, poolName)
	sshKey := fmt.Sprintf("orbiter:%s", m.key.Public.Value)
	createInstance := &compute.Instance{
		Name:        name,
		MachineType: fmt.Sprintf("zones/%s/machineTypes/custom-%d-%d", zone, cores, int(memory)),
		Tags:        &compute.Tags{Items: nwTags},
		NetworkInterfaces: []*compute.NetworkInterface{{
			Network: m.context.networkURL,
		}},
		Labels:     map[string]string{"orb": m.context.orbID, "provider": m.context.providerID, "pool": poolName},
		Disks:      disks,
		Scheduling: &compute.Scheduling{Preemptible: desired.Preemptible},
		Metadata: &compute.Metadata{
			Items: []*compute.MetadataItems{{
				Key:   "ssh-keys",
				Value: &sshKey,
			}},
		},
		ServiceAccounts: []*compute.ServiceAccount{{
			Scopes: []string{"https://www.googleapis.com/auth/compute"},
		}},
	}

	if err := operateFunc(
		func() { monitor.Debug("Creating instance") },
		computeOpCall(m.context.client.Instances.Insert(m.context.projectID, zone, createInstance).RequestId(uuid.NewV1().String()).Do),
		func() error { monitor.Info("Instance created"); return nil },
	)(); err != nil {
		return nil, err
	}

	newInstance, err := m.context.client.Instances.Get(m.context.projectID, zone, createInstance.Name).
		Fields("selfLink,networkInterfaces(networkIP)").
		Do()
	if err != nil {
		return nil, err
	}

	var machine machine
	if m.oneoff {
		machine = newGCEMachine(m.context, monitor, createInstance.Name, zone)
	} else {
		sshMachine := ssh.NewMachine(monitor, "orbiter", newInstance.NetworkInterfaces[0].NetworkIP)
		if err := sshMachine.UseKey([]byte(m.key.Private.Value)); err != nil {
			return nil, err
		}
		machine = sshMachine
	}

	infraMachine := newMachine(
		m.context,
		monitor,
		createInstance.Name,
		newInstance.NetworkInterfaces[0].NetworkIP,
		newInstance.SelfLink,
		poolName,
		zone,
		m.removeMachineFunc(
			poolName,
			createInstance.Name,
			zone,
		),
		false,
		machine,
		false,
		func() {},
		func() {},
		false,
		func() {},
		func() {},
	)

	for idx, diskName := range diskNames {
		mountPoint := fmt.Sprintf("/mnt/disks/%s", diskName)
		if err := infra.Try(monitor, time.NewTimer(time.Minute), 10*time.Second, infraMachine, func(m infra.Machine) error {
			_, formatErr := m.Execute(nil,
				fmt.Sprintf("sudo mkfs.ext4 -F /dev/%s && sudo mkdir -p /mnt/disks/%s && sudo mount -o discard,defaults,nobarrier /dev/%s %s && sudo chmod a+w %s && echo UUID=`sudo blkid -s UUID -o value /dev/disk/by-id/google-local-nvme-ssd-%d` %s ext4 discard,defaults,nofail,nobarrier 0 2 | sudo tee -a /etc/fstab", diskName, diskName, diskName, mountPoint, mountPoint, idx, mountPoint),
			)
			return formatErr
		}); err != nil {
			remove, cleanupErr := infraMachine.Destroy()
			if cleanupErr != nil {
				panic(cleanupErr)
			}
			if rmErr := remove(); rmErr != nil {
				panic(rmErr)
			}
			return nil, err
		}
		monitor.WithField("mountpoint", mountPoint).Info("Disk formatted")
	}

	return infraMachine, nil
}

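// ListPools returns the names of all pools that currently have at least one
// instance.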
func (m *machinesService) ListPools() ([]string, error) {
	pools, err := getAllInstances(m)
	if err != nil {
		return nil, err
	}

	var poolNames []string
	for poolName := range pools {
		poolNames = append(poolNames, poolName)
	}
	return poolNames, nil
}

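// List returns copies of all instances in the given pool.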
func (m *machinesService) List(poolName string) (infra.Machines, error) {
	pools, err := getAllInstances(m)
	if err != nil {
		return nil, err
	}

	pool := pools[poolName]
	machines := make([]infra.Machine, len(pool))
	for idx, machine := range pool {
		copyInstance := *machine
		machines[idx] = &copyInstance
	}

	return machines, nil
}

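// getAllInstances populates the instance cache on first use by listing all
// instances labeled with this orb and provider across every zone of the
// configured region, and returns the cached map on subsequent calls.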
func getAllInstances(m *machinesService) (map[string][]*instance, error) {
	m.cache.Lock()
	if m.cache.instances != nil {
		m.cache.Unlock()
		return m.cache.instances, nil
	}
	m.cache.instances = make(map[string][]*instance)
	m.cache.Unlock()

	region, err := m.context.client.Regions.Get(m.context.projectID, m.context.desired.Region).Do()
	if err != nil {
		return nil, err
	}

	for _, zoneURL := range region.Zones {
		zoneURLParts := strings.Split(zoneURL, "/")
		zone := zoneURLParts[len(zoneURLParts)-1]

		instances, err := m.context.client.Instances.
			List(m.context.projectID, zone).
			Filter(fmt.Sprintf(`labels.orb=%s AND labels.provider=%s`, m.context.orbID, m.context.providerID)).
			Fields("items(name,labels,selfLink,status,scheduling(preemptible),networkInterfaces(networkIP))").
			Do()
		if err != nil {
			return nil, err
		}

		for _, inst := range instances.Items {

			if inst.Labels["orb"] != m.context.orbID || inst.Labels["provider"] != m.context.providerID {
				continue
			}

			pool := inst.Labels["pool"]

			var machine machine
			if m.oneoff {
				machine = newGCEMachine(m.context, m.context.monitor.WithFields(toFields(inst.Labels)), inst.Name, zone)
			} else {
				sshMachine := ssh.NewMachine(m.context.monitor.WithFields(toFields(inst.Labels)), "orbiter", inst.NetworkInterfaces[0].NetworkIP)
				if err := sshMachine.UseKey([]byte(m.key.Private.Value)); err != nil {
					return nil, err
				}
				machine = sshMachine
			}

			rebootRequired := false
			unrequireReboot := func() {}
			for idx, req := range m.context.desired.RebootRequired {
				if req == inst.Name {
					rebootRequired = true
					unrequireReboot = func(pos int) func() {
						return func() {
							copy(m.context.desired.RebootRequired[pos:], m.context.desired.RebootRequired[pos+1:])
							m.context.desired.RebootRequired[len(m.context.desired.RebootRequired)-1] = ""
							m.context.desired.RebootRequired = m.context.desired.RebootRequired[:len(m.context.desired.RebootRequired)-1]
						}
					}(idx)
					break
				}
			}

			replacementRequired := false
			unrequireReplacement := func() {}
			for idx, req := range m.context.desired.ReplacementRequired {
				if req == inst.Name {
					replacementRequired = true
					unrequireReplacement = func(pos int) func() {
						return func() {
							copy(m.context.desired.ReplacementRequired[pos:], m.context.desired.ReplacementRequired[pos+1:])
							m.context.desired.ReplacementRequired[len(m.context.desired.ReplacementRequired)-1] = ""
							m.context.desired.ReplacementRequired = m.context.desired.ReplacementRequired[:len(m.context.desired.ReplacementRequired)-1]
						}
					}(idx)
					break
				}
			}

			mach := newMachine(
				m.context,
				m.context.monitor.WithField("name", inst.Name).WithFields(toFields(inst.Labels)),
				inst.Name,
				inst.NetworkInterfaces[0].NetworkIP,
				inst.SelfLink,
				pool,
				zone,
				m.removeMachineFunc(pool, inst.Name, zone),
				inst.Status == "TERMINATED" && inst.Scheduling.Preemptible,
				machine,
				rebootRequired,
				func(id string) func() {
					return func() { m.context.desired.RebootRequired = append(m.context.desired.RebootRequired, id) }
				}(inst.Name),
				unrequireReboot,
				replacementRequired,
				func(id string) func() {
					return func() { m.context.desired.ReplacementRequired = append(m.context.desired.ReplacementRequired, id) }
				}(inst.Name),
				unrequireReplacement,
			)
			m.cache.Lock()
			m.cache.instances[pool] = append(m.cache.instances[pool], mach)
			m.cache.Unlock()
		}
	}

	return m.cache.instances, nil
}

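// toFields converts GCE instance labels to monitor fields.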
func toFields(labels map[string]string) map[string]interface{} {
	fields := make(map[string]interface{})
	for key, label := range labels {
		fields[key] = label
	}
	return fields
}

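// removeMachineFunc returns a function that removes the instance from the
// pool's cache and deletes the corresponding GCE instance.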
func (m *machinesService) removeMachineFunc(pool, id, zone string) func() error {
	return func() error {
		m.cache.Lock()
		cleanMachines := make([]*instance, 0)
		for _, cachedMachine := range m.cache.instances[pool] {
			if cachedMachine.X_ID != id {
				cleanMachines = append(cleanMachines, cachedMachine)
			}
		}
		m.cache.instances[pool] = cleanMachines
		m.cache.Unlock()

		return removeResourceFunc(
			m.context.monitor.WithFields(map[string]interface{}{
				"pool":    pool,
				"machine": id,
			}),
			"instance",
			id,
			m.context.client.Instances.Delete(m.context.projectID, zone, id).RequestId(uuid.NewV1().String()).Do,
		)()
	}
}

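// networkTags returns the network tags identifying the orb, the provider and
// optionally the given pools.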
func networkTags(orbID, providerID string, poolName ...string) []string {
	tags := []string{
		orbNetworkTag(orbID),
		fmt.Sprintf("provider-%s", providerID),
	}
	for _, pool := range poolName {
		tags = append(tags, fmt.Sprintf("pool-%s", pool))
	}
	return tags
}

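// orbNetworkTag returns the network tag identifying all machines of the given orb.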
func orbNetworkTag(orbID string) string {
	return fmt.Sprintf("orb-%s", orbID)
}