github.com/secure-build/gitlab-runner@v12.5.0+incompatible/executors/docker/machine/provider.go (about)

     1  package machine
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/prometheus/client_golang/prometheus"
    10  	"github.com/sirupsen/logrus"
    11  
    12  	"gitlab.com/gitlab-org/gitlab-runner/common"
    13  	docker_helpers "gitlab.com/gitlab-org/gitlab-runner/helpers/docker"
    14  )
    15  
    16  type machineProvider struct {
    17  	name        string
    18  	machine     docker_helpers.Machine
    19  	details     machinesDetails
    20  	lock        sync.RWMutex
    21  	acquireLock sync.Mutex
    22  	// provider stores a real executor that is used to start run the builds
    23  	provider common.ExecutorProvider
    24  
    25  	stuckRemoveLock sync.Mutex
    26  
    27  	// metrics
    28  	totalActions      *prometheus.CounterVec
    29  	currentStatesDesc *prometheus.Desc
    30  	creationHistogram prometheus.Histogram
    31  }
    32  
    33  func (m *machineProvider) machineDetails(name string, acquire bool) *machineDetails {
    34  	m.lock.Lock()
    35  	defer m.lock.Unlock()
    36  
    37  	details, ok := m.details[name]
    38  	if !ok {
    39  		details = &machineDetails{
    40  			Name:      name,
    41  			Created:   time.Now(),
    42  			Used:      time.Now(),
    43  			LastSeen:  time.Now(),
    44  			UsedCount: 1, // any machine that we find we mark as already used
    45  			State:     machineStateIdle,
    46  		}
    47  		m.details[name] = details
    48  	}
    49  
    50  	if acquire {
    51  		if details.isUsed() {
    52  			return nil
    53  		}
    54  		details.State = machineStateAcquired
    55  	}
    56  
    57  	return details
    58  }
    59  
    60  func (m *machineProvider) create(config *common.RunnerConfig, state machineState) (details *machineDetails, errCh chan error) {
    61  	name := newMachineName(config)
    62  	details = m.machineDetails(name, true)
    63  	details.State = machineStateCreating
    64  	details.UsedCount = 0
    65  	details.RetryCount = 0
    66  	details.LastSeen = time.Now()
    67  	errCh = make(chan error, 1)
    68  
    69  	// Create machine asynchronously
    70  	go func() {
    71  		started := time.Now()
    72  		err := m.machine.Create(config.Machine.MachineDriver, details.Name, config.Machine.MachineOptions...)
    73  		for i := 0; i < 3 && err != nil; i++ {
    74  			details.RetryCount++
    75  			logrus.WithField("name", details.Name).
    76  				WithError(err).
    77  				Warningln("Machine creation failed, trying to provision")
    78  			time.Sleep(provisionRetryInterval)
    79  			err = m.machine.Provision(details.Name)
    80  		}
    81  
    82  		if err != nil {
    83  			logrus.WithField("name", details.Name).
    84  				WithField("time", time.Since(started)).
    85  				WithError(err).
    86  				Errorln("Machine creation failed")
    87  			m.remove(details.Name, "Failed to create")
    88  		} else {
    89  			details.State = state
    90  			details.Used = time.Now()
    91  			creationTime := time.Since(started)
    92  			logrus.WithField("duration", creationTime).
    93  				WithField("name", details.Name).
    94  				WithField("now", time.Now()).
    95  				WithField("retries", details.RetryCount).
    96  				Infoln("Machine created")
    97  			m.totalActions.WithLabelValues("created").Inc()
    98  			m.creationHistogram.Observe(creationTime.Seconds())
    99  		}
   100  		errCh <- err
   101  	}()
   102  	return
   103  }
   104  
   105  func (m *machineProvider) findFreeMachine(skipCache bool, machines ...string) (details *machineDetails) {
   106  	// Enumerate all machines in reverse order, to always take the newest machines first
   107  	for idx := range machines {
   108  		name := machines[len(machines)-idx-1]
   109  		details := m.machineDetails(name, true)
   110  		if details == nil {
   111  			continue
   112  		}
   113  
   114  		// Check if node is running
   115  		canConnect := m.machine.CanConnect(name, skipCache)
   116  		if !canConnect {
   117  			m.remove(name, "machine is unavailable")
   118  			continue
   119  		}
   120  		return details
   121  	}
   122  
   123  	return nil
   124  }
   125  
   126  func (m *machineProvider) useMachine(config *common.RunnerConfig) (details *machineDetails, err error) {
   127  	machines, err := m.loadMachines(config)
   128  	if err != nil {
   129  		return
   130  	}
   131  	details = m.findFreeMachine(true, machines...)
   132  	if details == nil {
   133  		var errCh chan error
   134  		details, errCh = m.create(config, machineStateAcquired)
   135  		err = <-errCh
   136  	}
   137  	return
   138  }
   139  
   140  func (m *machineProvider) retryUseMachine(config *common.RunnerConfig) (details *machineDetails, err error) {
   141  	// Try to find a machine
   142  	for i := 0; i < 3; i++ {
   143  		details, err = m.useMachine(config)
   144  		if err == nil {
   145  			break
   146  		}
   147  		time.Sleep(provisionRetryInterval)
   148  	}
   149  	return
   150  }
   151  
   152  func (m *machineProvider) removeMachine(details *machineDetails) (err error) {
   153  	if !m.machine.Exist(details.Name) {
   154  		details.logger().
   155  			Warningln("Skipping machine removal, because it doesn't exist")
   156  		return nil
   157  	}
   158  
   159  	// This code limits amount of removal of stuck machines to one machine per interval
   160  	if details.isStuckOnRemove() {
   161  		m.stuckRemoveLock.Lock()
   162  		defer m.stuckRemoveLock.Unlock()
   163  	}
   164  
   165  	details.logger().
   166  		Warningln("Stopping machine")
   167  	err = m.machine.Stop(details.Name, machineStopCommandTimeout)
   168  	if err != nil {
   169  		details.logger().
   170  			WithError(err).
   171  			Warningln("Error while stopping machine")
   172  	}
   173  
   174  	details.logger().
   175  		Warningln("Removing machine")
   176  	err = m.machine.Remove(details.Name)
   177  	if err != nil {
   178  		details.RetryCount++
   179  		time.Sleep(removeRetryInterval)
   180  		return err
   181  	}
   182  
   183  	return nil
   184  }
   185  
   186  func (m *machineProvider) finalizeRemoval(details *machineDetails) {
   187  	for {
   188  		err := m.removeMachine(details)
   189  		if err == nil {
   190  			break
   191  		}
   192  	}
   193  
   194  	m.lock.Lock()
   195  	defer m.lock.Unlock()
   196  	delete(m.details, details.Name)
   197  
   198  	details.logger().
   199  		WithField("now", time.Now()).
   200  		WithField("retries", details.RetryCount).
   201  		Infoln("Machine removed")
   202  
   203  	m.totalActions.WithLabelValues("removed").Inc()
   204  }
   205  
   206  func (m *machineProvider) remove(machineName string, reason ...interface{}) error {
   207  	m.lock.Lock()
   208  	defer m.lock.Unlock()
   209  
   210  	details, _ := m.details[machineName]
   211  	if details == nil {
   212  		return errors.New("machine not found")
   213  	}
   214  
   215  	details.Reason = fmt.Sprint(reason...)
   216  	details.State = machineStateRemoving
   217  	details.RetryCount = 0
   218  
   219  	details.logger().
   220  		WithField("now", time.Now()).
   221  		Warningln("Requesting machine removal")
   222  
   223  	details.Used = time.Now()
   224  	details.writeDebugInformation()
   225  
   226  	go m.finalizeRemoval(details)
   227  	return nil
   228  }
   229  
   230  func (m *machineProvider) updateMachine(config *common.RunnerConfig, data *machinesData, details *machineDetails) error {
   231  	if details.State != machineStateIdle {
   232  		return nil
   233  	}
   234  
   235  	if config.Machine.MaxBuilds > 0 && details.UsedCount >= config.Machine.MaxBuilds {
   236  		// Limit number of builds
   237  		return errors.New("too many builds")
   238  	}
   239  
   240  	if data.Total() >= config.Limit && config.Limit > 0 {
   241  		// Limit maximum number of machines
   242  		return errors.New("too many machines")
   243  	}
   244  
   245  	if time.Since(details.Used) > time.Second*time.Duration(config.Machine.GetIdleTime()) {
   246  		if data.Idle >= config.Machine.GetIdleCount() {
   247  			// Remove machine that are way over the idle time
   248  			return errors.New("too many idle machines")
   249  		}
   250  	}
   251  	return nil
   252  }
   253  
   254  func (m *machineProvider) updateMachines(machines []string, config *common.RunnerConfig) (data machinesData, validMachines []string) {
   255  	data.Runner = config.ShortDescription()
   256  	validMachines = make([]string, 0, len(machines))
   257  
   258  	for _, name := range machines {
   259  		details := m.machineDetails(name, false)
   260  		details.LastSeen = time.Now()
   261  
   262  		err := m.updateMachine(config, &data, details)
   263  		if err == nil {
   264  			validMachines = append(validMachines, name)
   265  		} else {
   266  			m.remove(details.Name, err)
   267  		}
   268  
   269  		data.Add(details)
   270  	}
   271  	return
   272  }
   273  
   274  func (m *machineProvider) createMachines(config *common.RunnerConfig, data *machinesData) {
   275  	// Create a new machines and mark them as Idle
   276  	for {
   277  		if data.Available() >= config.Machine.GetIdleCount() {
   278  			// Limit maximum number of idle machines
   279  			break
   280  		}
   281  		if data.Total() >= config.Limit && config.Limit > 0 {
   282  			// Limit maximum number of machines
   283  			break
   284  		}
   285  		m.create(config, machineStateIdle)
   286  		data.Creating++
   287  	}
   288  }
   289  
   290  // intermediateMachineList returns a list of machines that might not yet be
   291  // persisted on disk, these machines are the ones between being virtually
   292  // created, and `docker-machine create` getting executed we populate this data
   293  // set to overcome the race conditions related to not-full set of machines
   294  // returned by `docker-machine ls -q`
   295  func (m *machineProvider) intermediateMachineList(excludedMachines []string) []string {
   296  	var excludedSet map[string]struct{}
   297  	var intermediateMachines []string
   298  
   299  	m.lock.Lock()
   300  	defer m.lock.Unlock()
   301  
   302  	for _, details := range m.details {
   303  		if details.isPersistedOnDisk() {
   304  			continue
   305  		}
   306  
   307  		// lazy init set, as most of times we don't create new machines
   308  		if excludedSet == nil {
   309  			excludedSet = make(map[string]struct{}, len(excludedMachines))
   310  			for _, excludedMachine := range excludedMachines {
   311  				excludedSet[excludedMachine] = struct{}{}
   312  			}
   313  		}
   314  
   315  		if _, ok := excludedSet[details.Name]; ok {
   316  			continue
   317  		}
   318  
   319  		intermediateMachines = append(intermediateMachines, details.Name)
   320  	}
   321  
   322  	return intermediateMachines
   323  }
   324  
   325  func (m *machineProvider) loadMachines(config *common.RunnerConfig) (machines []string, err error) {
   326  	machines, err = m.machine.List()
   327  	if err != nil {
   328  		return nil, err
   329  	}
   330  
   331  	machines = append(machines, m.intermediateMachineList(machines)...)
   332  	machines = filterMachineList(machines, machineFilter(config))
   333  	return
   334  }
   335  
   336  func (m *machineProvider) Acquire(config *common.RunnerConfig) (data common.ExecutorData, err error) {
   337  	if config.Machine == nil || config.Machine.MachineName == "" {
   338  		err = fmt.Errorf("missing Machine options")
   339  		return
   340  	}
   341  
   342  	// Lock updating machines, because two Acquires can be run at the same time
   343  	m.acquireLock.Lock()
   344  	defer m.acquireLock.Unlock()
   345  
   346  	machines, err := m.loadMachines(config)
   347  	if err != nil {
   348  		return
   349  	}
   350  
   351  	// Update a list of currently configured machines
   352  	machinesData, validMachines := m.updateMachines(machines, config)
   353  
   354  	// Pre-create machines
   355  	m.createMachines(config, &machinesData)
   356  
   357  	logrus.WithFields(machinesData.Fields()).
   358  		WithField("runner", config.ShortDescription()).
   359  		WithField("minIdleCount", config.Machine.GetIdleCount()).
   360  		WithField("maxMachines", config.Limit).
   361  		WithField("time", time.Now()).
   362  		Debugln("Docker Machine Details")
   363  	machinesData.writeDebugInformation()
   364  
   365  	// Try to find a free machine
   366  	details := m.findFreeMachine(false, validMachines...)
   367  	if details != nil {
   368  		data = details
   369  		return
   370  	}
   371  
   372  	// If we have a free machines we can process a build
   373  	if config.Machine.GetIdleCount() != 0 && machinesData.Idle == 0 {
   374  		err = errors.New("no free machines that can process builds")
   375  	}
   376  	return
   377  }
   378  
   379  func (m *machineProvider) Use(config *common.RunnerConfig, data common.ExecutorData) (newConfig common.RunnerConfig, newData common.ExecutorData, err error) {
   380  	// Find a new machine
   381  	details, _ := data.(*machineDetails)
   382  	if details == nil || !details.canBeUsed() || !m.machine.CanConnect(details.Name, true) {
   383  		details, err = m.retryUseMachine(config)
   384  		if err != nil {
   385  			return
   386  		}
   387  
   388  		// Return details only if this is a new instance
   389  		newData = details
   390  	}
   391  
   392  	// Get machine credentials
   393  	dc, err := m.machine.Credentials(details.Name)
   394  	if err != nil {
   395  		if newData != nil {
   396  			m.Release(config, newData)
   397  		}
   398  		newData = nil
   399  		return
   400  	}
   401  
   402  	// Create shallow copy of config and store in it docker credentials
   403  	newConfig = *config
   404  	newConfig.Docker = &common.DockerConfig{}
   405  	if config.Docker != nil {
   406  		*newConfig.Docker = *config.Docker
   407  	}
   408  	newConfig.Docker.DockerCredentials = dc
   409  
   410  	// Mark machine as used
   411  	details.State = machineStateUsed
   412  	details.Used = time.Now()
   413  	details.UsedCount++
   414  	m.totalActions.WithLabelValues("used").Inc()
   415  	return
   416  }
   417  
   418  func (m *machineProvider) Release(config *common.RunnerConfig, data common.ExecutorData) {
   419  	// Release machine
   420  	details, ok := data.(*machineDetails)
   421  	if ok {
   422  		// Mark last used time when is Used
   423  		if details.State == machineStateUsed {
   424  			details.Used = time.Now()
   425  		}
   426  
   427  		// Remove machine if we already used it
   428  		if config != nil && config.Machine != nil &&
   429  			config.Machine.MaxBuilds > 0 && details.UsedCount >= config.Machine.MaxBuilds {
   430  			err := m.remove(details.Name, "Too many builds")
   431  			if err == nil {
   432  				return
   433  			}
   434  		}
   435  		details.State = machineStateIdle
   436  	}
   437  }
   438  
   439  func (m *machineProvider) CanCreate() bool {
   440  	return m.provider.CanCreate()
   441  }
   442  
   443  func (m *machineProvider) GetFeatures(features *common.FeaturesInfo) error {
   444  	return m.provider.GetFeatures(features)
   445  }
   446  
   447  func (m *machineProvider) GetDefaultShell() string {
   448  	return m.provider.GetDefaultShell()
   449  }
   450  
   451  func (m *machineProvider) Create() common.Executor {
   452  	return &machineExecutor{
   453  		provider: m,
   454  	}
   455  }
   456  
   457  func newMachineProvider(name, executor string) *machineProvider {
   458  	provider := common.GetExecutor(executor)
   459  	if provider == nil {
   460  		logrus.Panicln("Missing", executor)
   461  	}
   462  
   463  	return &machineProvider{
   464  		name:     name,
   465  		details:  make(machinesDetails),
   466  		machine:  docker_helpers.NewMachineCommand(),
   467  		provider: provider,
   468  		totalActions: prometheus.NewCounterVec(
   469  			prometheus.CounterOpts{
   470  				Name: "gitlab_runner_autoscaling_actions_total",
   471  				Help: "The total number of actions executed by the provider.",
   472  				ConstLabels: prometheus.Labels{
   473  					"executor": name,
   474  				},
   475  			},
   476  			[]string{"action"},
   477  		),
   478  		currentStatesDesc: prometheus.NewDesc(
   479  			"gitlab_runner_autoscaling_machine_states",
   480  			"The current number of machines per state in this provider.",
   481  			[]string{"state"},
   482  			prometheus.Labels{
   483  				"executor": name,
   484  			},
   485  		),
   486  		creationHistogram: prometheus.NewHistogram(
   487  			prometheus.HistogramOpts{
   488  				Name:    "gitlab_runner_autoscaling_machine_creation_duration_seconds",
   489  				Help:    "Histogram of machine creation time.",
   490  				Buckets: prometheus.ExponentialBuckets(30, 1.25, 10),
   491  				ConstLabels: prometheus.Labels{
   492  					"executor": name,
   493  				},
   494  			},
   495  		),
   496  	}
   497  }