gitlab.com/jfprevost/gitlab-runner-notlscheck@v11.11.4+incompatible/executors/docker/machine/provider.go (about)

     1  package machine
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/prometheus/client_golang/prometheus"
    10  	"github.com/sirupsen/logrus"
    11  
    12  	"gitlab.com/gitlab-org/gitlab-runner/common"
    13  	"gitlab.com/gitlab-org/gitlab-runner/helpers/docker"
    14  )
    15  
    16  type machineProvider struct {
    17  	name        string
    18  	machine     docker_helpers.Machine
    19  	details     machinesDetails
    20  	lock        sync.RWMutex
    21  	acquireLock sync.Mutex
    22  	// provider stores a real executor that is used to start run the builds
    23  	provider common.ExecutorProvider
    24  
    25  	stuckRemoveLock sync.Mutex
    26  
    27  	// metrics
    28  	totalActions      *prometheus.CounterVec
    29  	currentStatesDesc *prometheus.Desc
    30  	creationHistogram prometheus.Histogram
    31  }
    32  
    33  func (m *machineProvider) machineDetails(name string, acquire bool) *machineDetails {
    34  	m.lock.Lock()
    35  	defer m.lock.Unlock()
    36  
    37  	details, ok := m.details[name]
    38  	if !ok {
    39  		details = &machineDetails{
    40  			Name:      name,
    41  			Created:   time.Now(),
    42  			Used:      time.Now(),
    43  			LastSeen:  time.Now(),
    44  			UsedCount: 1, // any machine that we find we mark as already used
    45  			State:     machineStateIdle,
    46  		}
    47  		m.details[name] = details
    48  	}
    49  
    50  	if acquire {
    51  		if details.isUsed() {
    52  			return nil
    53  		}
    54  		details.State = machineStateAcquired
    55  	}
    56  
    57  	return details
    58  }
    59  
    60  func (m *machineProvider) create(config *common.RunnerConfig, state machineState) (details *machineDetails, errCh chan error) {
    61  	name := newMachineName(config)
    62  	details = m.machineDetails(name, true)
    63  	details.State = machineStateCreating
    64  	details.UsedCount = 0
    65  	details.RetryCount = 0
    66  	details.LastSeen = time.Now()
    67  	errCh = make(chan error, 1)
    68  
    69  	// Create machine asynchronously
    70  	go func() {
    71  		started := time.Now()
    72  		err := m.machine.Create(config.Machine.MachineDriver, details.Name, config.Machine.MachineOptions...)
    73  		for i := 0; i < 3 && err != nil; i++ {
    74  			details.RetryCount++
    75  			logrus.WithField("name", details.Name).
    76  				WithError(err).
    77  				Warningln("Machine creation failed, trying to provision")
    78  			time.Sleep(provisionRetryInterval)
    79  			err = m.machine.Provision(details.Name)
    80  		}
    81  
    82  		if err != nil {
    83  			logrus.WithField("name", details.Name).
    84  				WithField("time", time.Since(started)).
    85  				WithError(err).
    86  				Errorln("Machine creation failed")
    87  			m.remove(details.Name, "Failed to create")
    88  		} else {
    89  			details.State = state
    90  			details.Used = time.Now()
    91  			creationTime := time.Since(started)
    92  			logrus.WithField("time", creationTime).
    93  				WithField("name", details.Name).
    94  				WithField("now", time.Now()).
    95  				WithField("retries", details.RetryCount).
    96  				Infoln("Machine created")
    97  			m.totalActions.WithLabelValues("created").Inc()
    98  			m.creationHistogram.Observe(creationTime.Seconds())
    99  		}
   100  		errCh <- err
   101  	}()
   102  	return
   103  }
   104  
   105  func (m *machineProvider) findFreeMachine(skipCache bool, machines ...string) (details *machineDetails) {
   106  	// Enumerate all machines in reverse order, to always take the newest machines first
   107  	for idx := range machines {
   108  		name := machines[len(machines)-idx-1]
   109  		details := m.machineDetails(name, true)
   110  		if details == nil {
   111  			continue
   112  		}
   113  
   114  		// Check if node is running
   115  		canConnect := m.machine.CanConnect(name, skipCache)
   116  		if !canConnect {
   117  			m.remove(name, "machine is unavailable")
   118  			continue
   119  		}
   120  		return details
   121  	}
   122  
   123  	return nil
   124  }
   125  
   126  func (m *machineProvider) useMachine(config *common.RunnerConfig) (details *machineDetails, err error) {
   127  	machines, err := m.loadMachines(config)
   128  	if err != nil {
   129  		return
   130  	}
   131  	details = m.findFreeMachine(true, machines...)
   132  	if details == nil {
   133  		var errCh chan error
   134  		details, errCh = m.create(config, machineStateAcquired)
   135  		err = <-errCh
   136  	}
   137  	return
   138  }
   139  
   140  func (m *machineProvider) retryUseMachine(config *common.RunnerConfig) (details *machineDetails, err error) {
   141  	// Try to find a machine
   142  	for i := 0; i < 3; i++ {
   143  		details, err = m.useMachine(config)
   144  		if err == nil {
   145  			break
   146  		}
   147  		time.Sleep(provisionRetryInterval)
   148  	}
   149  	return
   150  }
   151  
   152  func (m *machineProvider) removeMachine(details *machineDetails) (err error) {
   153  	if !m.machine.Exist(details.Name) {
   154  		details.logger().
   155  			Warningln("Skipping machine removal, because it doesn't exist")
   156  		return nil
   157  	}
   158  
   159  	// This code limits amount of removal of stuck machines to one machine per interval
   160  	if details.isStuckOnRemove() {
   161  		m.stuckRemoveLock.Lock()
   162  		defer m.stuckRemoveLock.Unlock()
   163  	}
   164  
   165  	details.logger().
   166  		Warningln("Stopping machine")
   167  	err = m.machine.Stop(details.Name, machineStopCommandTimeout)
   168  	if err != nil {
   169  		details.logger().
   170  			WithError(err).
   171  			Warningln("Error while stopping machine")
   172  	}
   173  
   174  	details.logger().
   175  		Warningln("Removing machine")
   176  	err = m.machine.Remove(details.Name)
   177  	if err != nil {
   178  		details.RetryCount++
   179  		time.Sleep(removeRetryInterval)
   180  		return err
   181  	}
   182  
   183  	return nil
   184  }
   185  
   186  func (m *machineProvider) finalizeRemoval(details *machineDetails) {
   187  	for {
   188  		err := m.removeMachine(details)
   189  		if err == nil {
   190  			break
   191  		}
   192  	}
   193  
   194  	m.lock.Lock()
   195  	defer m.lock.Unlock()
   196  	delete(m.details, details.Name)
   197  
   198  	details.logger().
   199  		WithField("now", time.Now()).
   200  		WithField("retries", details.RetryCount).
   201  		Infoln("Machine removed")
   202  
   203  	m.totalActions.WithLabelValues("removed").Inc()
   204  }
   205  
   206  func (m *machineProvider) remove(machineName string, reason ...interface{}) error {
   207  	m.lock.Lock()
   208  	defer m.lock.Unlock()
   209  
   210  	details, _ := m.details[machineName]
   211  	if details == nil {
   212  		return errors.New("Machine not found")
   213  	}
   214  
   215  	details.Reason = fmt.Sprint(reason...)
   216  	details.State = machineStateRemoving
   217  	details.RetryCount = 0
   218  
   219  	details.logger().
   220  		WithField("now", time.Now()).
   221  		Warningln("Requesting machine removal")
   222  
   223  	details.Used = time.Now()
   224  	details.writeDebugInformation()
   225  
   226  	go m.finalizeRemoval(details)
   227  	return nil
   228  }
   229  
   230  func (m *machineProvider) updateMachine(config *common.RunnerConfig, data *machinesData, details *machineDetails) error {
   231  	if details.State != machineStateIdle {
   232  		return nil
   233  	}
   234  
   235  	if config.Machine.MaxBuilds > 0 && details.UsedCount >= config.Machine.MaxBuilds {
   236  		// Limit number of builds
   237  		return errors.New("Too many builds")
   238  	}
   239  
   240  	if data.Total() >= config.Limit && config.Limit > 0 {
   241  		// Limit maximum number of machines
   242  		return errors.New("Too many machines")
   243  	}
   244  
   245  	if time.Since(details.Used) > time.Second*time.Duration(config.Machine.GetIdleTime()) {
   246  		if data.Idle >= config.Machine.GetIdleCount() {
   247  			// Remove machine that are way over the idle time
   248  			return errors.New("Too many idle machines")
   249  		}
   250  	}
   251  	return nil
   252  }
   253  
   254  func (m *machineProvider) updateMachines(machines []string, config *common.RunnerConfig) (data machinesData, validMachines []string) {
   255  	data.Runner = config.ShortDescription()
   256  	validMachines = make([]string, 0, len(machines))
   257  
   258  	for _, name := range machines {
   259  		details := m.machineDetails(name, false)
   260  		details.LastSeen = time.Now()
   261  
   262  		err := m.updateMachine(config, &data, details)
   263  		if err == nil {
   264  			validMachines = append(validMachines, name)
   265  		} else {
   266  			m.remove(details.Name, err)
   267  		}
   268  
   269  		data.Add(details)
   270  	}
   271  	return
   272  }
   273  
   274  func (m *machineProvider) createMachines(config *common.RunnerConfig, data *machinesData) {
   275  	// Create a new machines and mark them as Idle
   276  	for {
   277  		if data.Available() >= config.Machine.GetIdleCount() {
   278  			// Limit maximum number of idle machines
   279  			break
   280  		}
   281  		if data.Total() >= config.Limit && config.Limit > 0 {
   282  			// Limit maximum number of machines
   283  			break
   284  		}
   285  		m.create(config, machineStateIdle)
   286  		data.Creating++
   287  	}
   288  }
   289  
   290  func (m *machineProvider) loadMachines(config *common.RunnerConfig) (machines []string, err error) {
   291  	machines, err = m.machine.List()
   292  	if err != nil {
   293  		return nil, err
   294  	}
   295  
   296  	machines = filterMachineList(machines, machineFilter(config))
   297  	return
   298  }
   299  
   300  func (m *machineProvider) Acquire(config *common.RunnerConfig) (data common.ExecutorData, err error) {
   301  	if config.Machine == nil || config.Machine.MachineName == "" {
   302  		err = fmt.Errorf("Missing Machine options")
   303  		return
   304  	}
   305  
   306  	// Lock updating machines, because two Acquires can be run at the same time
   307  	m.acquireLock.Lock()
   308  	defer m.acquireLock.Unlock()
   309  
   310  	machines, err := m.loadMachines(config)
   311  	if err != nil {
   312  		return
   313  	}
   314  
   315  	// Update a list of currently configured machines
   316  	machinesData, validMachines := m.updateMachines(machines, config)
   317  
   318  	// Pre-create machines
   319  	m.createMachines(config, &machinesData)
   320  
   321  	logrus.WithFields(machinesData.Fields()).
   322  		WithField("runner", config.ShortDescription()).
   323  		WithField("minIdleCount", config.Machine.GetIdleCount()).
   324  		WithField("maxMachines", config.Limit).
   325  		WithField("time", time.Now()).
   326  		Debugln("Docker Machine Details")
   327  	machinesData.writeDebugInformation()
   328  
   329  	// Try to find a free machine
   330  	details := m.findFreeMachine(false, validMachines...)
   331  	if details != nil {
   332  		data = details
   333  		return
   334  	}
   335  
   336  	// If we have a free machines we can process a build
   337  	if config.Machine.GetIdleCount() != 0 && machinesData.Idle == 0 {
   338  		err = errors.New("No free machines that can process builds")
   339  	}
   340  	return
   341  }
   342  
   343  func (m *machineProvider) Use(config *common.RunnerConfig, data common.ExecutorData) (newConfig common.RunnerConfig, newData common.ExecutorData, err error) {
   344  	// Find a new machine
   345  	details, _ := data.(*machineDetails)
   346  	if details == nil || !details.canBeUsed() || !m.machine.CanConnect(details.Name, true) {
   347  		details, err = m.retryUseMachine(config)
   348  		if err != nil {
   349  			return
   350  		}
   351  
   352  		// Return details only if this is a new instance
   353  		newData = details
   354  	}
   355  
   356  	// Get machine credentials
   357  	dc, err := m.machine.Credentials(details.Name)
   358  	if err != nil {
   359  		if newData != nil {
   360  			m.Release(config, newData)
   361  		}
   362  		newData = nil
   363  		return
   364  	}
   365  
   366  	// Create shallow copy of config and store in it docker credentials
   367  	newConfig = *config
   368  	newConfig.Docker = &common.DockerConfig{}
   369  	if config.Docker != nil {
   370  		*newConfig.Docker = *config.Docker
   371  	}
   372  	newConfig.Docker.DockerCredentials = dc
   373  
   374  	// Mark machine as used
   375  	details.State = machineStateUsed
   376  	details.Used = time.Now()
   377  	details.UsedCount++
   378  	m.totalActions.WithLabelValues("used").Inc()
   379  	return
   380  }
   381  
   382  func (m *machineProvider) Release(config *common.RunnerConfig, data common.ExecutorData) {
   383  	// Release machine
   384  	details, ok := data.(*machineDetails)
   385  	if ok {
   386  		// Mark last used time when is Used
   387  		if details.State == machineStateUsed {
   388  			details.Used = time.Now()
   389  		}
   390  
   391  		// Remove machine if we already used it
   392  		if config != nil && config.Machine != nil &&
   393  			config.Machine.MaxBuilds > 0 && details.UsedCount >= config.Machine.MaxBuilds {
   394  			err := m.remove(details.Name, "Too many builds")
   395  			if err == nil {
   396  				return
   397  			}
   398  		}
   399  		details.State = machineStateIdle
   400  	}
   401  }
   402  
   403  func (m *machineProvider) CanCreate() bool {
   404  	return m.provider.CanCreate()
   405  }
   406  
   407  func (m *machineProvider) GetFeatures(features *common.FeaturesInfo) error {
   408  	return m.provider.GetFeatures(features)
   409  }
   410  
   411  func (m *machineProvider) GetDefaultShell() string {
   412  	return m.provider.GetDefaultShell()
   413  }
   414  
   415  func (m *machineProvider) Create() common.Executor {
   416  	return &machineExecutor{
   417  		provider: m,
   418  	}
   419  }
   420  
   421  func newMachineProvider(name, executor string) *machineProvider {
   422  	provider := common.GetExecutor(executor)
   423  	if provider == nil {
   424  		logrus.Panicln("Missing", executor)
   425  	}
   426  
   427  	return &machineProvider{
   428  		name:     name,
   429  		details:  make(machinesDetails),
   430  		machine:  docker_helpers.NewMachineCommand(),
   431  		provider: provider,
   432  		totalActions: prometheus.NewCounterVec(
   433  			prometheus.CounterOpts{
   434  				Name: "gitlab_runner_autoscaling_actions_total",
   435  				Help: "The total number of actions executed by the provider.",
   436  				ConstLabels: prometheus.Labels{
   437  					"executor": name,
   438  				},
   439  			},
   440  			[]string{"action"},
   441  		),
   442  		currentStatesDesc: prometheus.NewDesc(
   443  			"gitlab_runner_autoscaling_machine_states",
   444  			"The current number of machines per state in this provider.",
   445  			[]string{"state"},
   446  			prometheus.Labels{
   447  				"executor": name,
   448  			},
   449  		),
   450  		creationHistogram: prometheus.NewHistogram(
   451  			prometheus.HistogramOpts{
   452  				Name:    "gitlab_runner_autoscaling_machine_creation_duration_seconds",
   453  				Help:    "Histogram of machine creation time.",
   454  				Buckets: prometheus.ExponentialBuckets(30, 1.25, 10),
   455  				ConstLabels: prometheus.Labels{
   456  					"executor": name,
   457  				},
   458  			},
   459  		),
   460  	}
   461  }