gitlab.com/jfprevost/gitlab-runner-notlscheck@v11.11.4+incompatible/executors/docker/machine/provider.go (about) 1 package machine 2 3 import ( 4 "errors" 5 "fmt" 6 "sync" 7 "time" 8 9 "github.com/prometheus/client_golang/prometheus" 10 "github.com/sirupsen/logrus" 11 12 "gitlab.com/gitlab-org/gitlab-runner/common" 13 "gitlab.com/gitlab-org/gitlab-runner/helpers/docker" 14 ) 15 16 type machineProvider struct { 17 name string 18 machine docker_helpers.Machine 19 details machinesDetails 20 lock sync.RWMutex 21 acquireLock sync.Mutex 22 // provider stores a real executor that is used to start run the builds 23 provider common.ExecutorProvider 24 25 stuckRemoveLock sync.Mutex 26 27 // metrics 28 totalActions *prometheus.CounterVec 29 currentStatesDesc *prometheus.Desc 30 creationHistogram prometheus.Histogram 31 } 32 33 func (m *machineProvider) machineDetails(name string, acquire bool) *machineDetails { 34 m.lock.Lock() 35 defer m.lock.Unlock() 36 37 details, ok := m.details[name] 38 if !ok { 39 details = &machineDetails{ 40 Name: name, 41 Created: time.Now(), 42 Used: time.Now(), 43 LastSeen: time.Now(), 44 UsedCount: 1, // any machine that we find we mark as already used 45 State: machineStateIdle, 46 } 47 m.details[name] = details 48 } 49 50 if acquire { 51 if details.isUsed() { 52 return nil 53 } 54 details.State = machineStateAcquired 55 } 56 57 return details 58 } 59 60 func (m *machineProvider) create(config *common.RunnerConfig, state machineState) (details *machineDetails, errCh chan error) { 61 name := newMachineName(config) 62 details = m.machineDetails(name, true) 63 details.State = machineStateCreating 64 details.UsedCount = 0 65 details.RetryCount = 0 66 details.LastSeen = time.Now() 67 errCh = make(chan error, 1) 68 69 // Create machine asynchronously 70 go func() { 71 started := time.Now() 72 err := m.machine.Create(config.Machine.MachineDriver, details.Name, config.Machine.MachineOptions...) 73 for i := 0; i < 3 && err != nil; i++ { 74 details.RetryCount++ 75 logrus.WithField("name", details.Name). 76 WithError(err). 77 Warningln("Machine creation failed, trying to provision") 78 time.Sleep(provisionRetryInterval) 79 err = m.machine.Provision(details.Name) 80 } 81 82 if err != nil { 83 logrus.WithField("name", details.Name). 84 WithField("time", time.Since(started)). 85 WithError(err). 86 Errorln("Machine creation failed") 87 m.remove(details.Name, "Failed to create") 88 } else { 89 details.State = state 90 details.Used = time.Now() 91 creationTime := time.Since(started) 92 logrus.WithField("time", creationTime). 93 WithField("name", details.Name). 94 WithField("now", time.Now()). 95 WithField("retries", details.RetryCount). 96 Infoln("Machine created") 97 m.totalActions.WithLabelValues("created").Inc() 98 m.creationHistogram.Observe(creationTime.Seconds()) 99 } 100 errCh <- err 101 }() 102 return 103 } 104 105 func (m *machineProvider) findFreeMachine(skipCache bool, machines ...string) (details *machineDetails) { 106 // Enumerate all machines in reverse order, to always take the newest machines first 107 for idx := range machines { 108 name := machines[len(machines)-idx-1] 109 details := m.machineDetails(name, true) 110 if details == nil { 111 continue 112 } 113 114 // Check if node is running 115 canConnect := m.machine.CanConnect(name, skipCache) 116 if !canConnect { 117 m.remove(name, "machine is unavailable") 118 continue 119 } 120 return details 121 } 122 123 return nil 124 } 125 126 func (m *machineProvider) useMachine(config *common.RunnerConfig) (details *machineDetails, err error) { 127 machines, err := m.loadMachines(config) 128 if err != nil { 129 return 130 } 131 details = m.findFreeMachine(true, machines...) 132 if details == nil { 133 var errCh chan error 134 details, errCh = m.create(config, machineStateAcquired) 135 err = <-errCh 136 } 137 return 138 } 139 140 func (m *machineProvider) retryUseMachine(config *common.RunnerConfig) (details *machineDetails, err error) { 141 // Try to find a machine 142 for i := 0; i < 3; i++ { 143 details, err = m.useMachine(config) 144 if err == nil { 145 break 146 } 147 time.Sleep(provisionRetryInterval) 148 } 149 return 150 } 151 152 func (m *machineProvider) removeMachine(details *machineDetails) (err error) { 153 if !m.machine.Exist(details.Name) { 154 details.logger(). 155 Warningln("Skipping machine removal, because it doesn't exist") 156 return nil 157 } 158 159 // This code limits amount of removal of stuck machines to one machine per interval 160 if details.isStuckOnRemove() { 161 m.stuckRemoveLock.Lock() 162 defer m.stuckRemoveLock.Unlock() 163 } 164 165 details.logger(). 166 Warningln("Stopping machine") 167 err = m.machine.Stop(details.Name, machineStopCommandTimeout) 168 if err != nil { 169 details.logger(). 170 WithError(err). 171 Warningln("Error while stopping machine") 172 } 173 174 details.logger(). 175 Warningln("Removing machine") 176 err = m.machine.Remove(details.Name) 177 if err != nil { 178 details.RetryCount++ 179 time.Sleep(removeRetryInterval) 180 return err 181 } 182 183 return nil 184 } 185 186 func (m *machineProvider) finalizeRemoval(details *machineDetails) { 187 for { 188 err := m.removeMachine(details) 189 if err == nil { 190 break 191 } 192 } 193 194 m.lock.Lock() 195 defer m.lock.Unlock() 196 delete(m.details, details.Name) 197 198 details.logger(). 199 WithField("now", time.Now()). 200 WithField("retries", details.RetryCount). 201 Infoln("Machine removed") 202 203 m.totalActions.WithLabelValues("removed").Inc() 204 } 205 206 func (m *machineProvider) remove(machineName string, reason ...interface{}) error { 207 m.lock.Lock() 208 defer m.lock.Unlock() 209 210 details, _ := m.details[machineName] 211 if details == nil { 212 return errors.New("Machine not found") 213 } 214 215 details.Reason = fmt.Sprint(reason...) 216 details.State = machineStateRemoving 217 details.RetryCount = 0 218 219 details.logger(). 220 WithField("now", time.Now()). 221 Warningln("Requesting machine removal") 222 223 details.Used = time.Now() 224 details.writeDebugInformation() 225 226 go m.finalizeRemoval(details) 227 return nil 228 } 229 230 func (m *machineProvider) updateMachine(config *common.RunnerConfig, data *machinesData, details *machineDetails) error { 231 if details.State != machineStateIdle { 232 return nil 233 } 234 235 if config.Machine.MaxBuilds > 0 && details.UsedCount >= config.Machine.MaxBuilds { 236 // Limit number of builds 237 return errors.New("Too many builds") 238 } 239 240 if data.Total() >= config.Limit && config.Limit > 0 { 241 // Limit maximum number of machines 242 return errors.New("Too many machines") 243 } 244 245 if time.Since(details.Used) > time.Second*time.Duration(config.Machine.GetIdleTime()) { 246 if data.Idle >= config.Machine.GetIdleCount() { 247 // Remove machine that are way over the idle time 248 return errors.New("Too many idle machines") 249 } 250 } 251 return nil 252 } 253 254 func (m *machineProvider) updateMachines(machines []string, config *common.RunnerConfig) (data machinesData, validMachines []string) { 255 data.Runner = config.ShortDescription() 256 validMachines = make([]string, 0, len(machines)) 257 258 for _, name := range machines { 259 details := m.machineDetails(name, false) 260 details.LastSeen = time.Now() 261 262 err := m.updateMachine(config, &data, details) 263 if err == nil { 264 validMachines = append(validMachines, name) 265 } else { 266 m.remove(details.Name, err) 267 } 268 269 data.Add(details) 270 } 271 return 272 } 273 274 func (m *machineProvider) createMachines(config *common.RunnerConfig, data *machinesData) { 275 // Create a new machines and mark them as Idle 276 for { 277 if data.Available() >= config.Machine.GetIdleCount() { 278 // Limit maximum number of idle machines 279 break 280 } 281 if data.Total() >= config.Limit && config.Limit > 0 { 282 // Limit maximum number of machines 283 break 284 } 285 m.create(config, machineStateIdle) 286 data.Creating++ 287 } 288 } 289 290 func (m *machineProvider) loadMachines(config *common.RunnerConfig) (machines []string, err error) { 291 machines, err = m.machine.List() 292 if err != nil { 293 return nil, err 294 } 295 296 machines = filterMachineList(machines, machineFilter(config)) 297 return 298 } 299 300 func (m *machineProvider) Acquire(config *common.RunnerConfig) (data common.ExecutorData, err error) { 301 if config.Machine == nil || config.Machine.MachineName == "" { 302 err = fmt.Errorf("Missing Machine options") 303 return 304 } 305 306 // Lock updating machines, because two Acquires can be run at the same time 307 m.acquireLock.Lock() 308 defer m.acquireLock.Unlock() 309 310 machines, err := m.loadMachines(config) 311 if err != nil { 312 return 313 } 314 315 // Update a list of currently configured machines 316 machinesData, validMachines := m.updateMachines(machines, config) 317 318 // Pre-create machines 319 m.createMachines(config, &machinesData) 320 321 logrus.WithFields(machinesData.Fields()). 322 WithField("runner", config.ShortDescription()). 323 WithField("minIdleCount", config.Machine.GetIdleCount()). 324 WithField("maxMachines", config.Limit). 325 WithField("time", time.Now()). 326 Debugln("Docker Machine Details") 327 machinesData.writeDebugInformation() 328 329 // Try to find a free machine 330 details := m.findFreeMachine(false, validMachines...) 331 if details != nil { 332 data = details 333 return 334 } 335 336 // If we have a free machines we can process a build 337 if config.Machine.GetIdleCount() != 0 && machinesData.Idle == 0 { 338 err = errors.New("No free machines that can process builds") 339 } 340 return 341 } 342 343 func (m *machineProvider) Use(config *common.RunnerConfig, data common.ExecutorData) (newConfig common.RunnerConfig, newData common.ExecutorData, err error) { 344 // Find a new machine 345 details, _ := data.(*machineDetails) 346 if details == nil || !details.canBeUsed() || !m.machine.CanConnect(details.Name, true) { 347 details, err = m.retryUseMachine(config) 348 if err != nil { 349 return 350 } 351 352 // Return details only if this is a new instance 353 newData = details 354 } 355 356 // Get machine credentials 357 dc, err := m.machine.Credentials(details.Name) 358 if err != nil { 359 if newData != nil { 360 m.Release(config, newData) 361 } 362 newData = nil 363 return 364 } 365 366 // Create shallow copy of config and store in it docker credentials 367 newConfig = *config 368 newConfig.Docker = &common.DockerConfig{} 369 if config.Docker != nil { 370 *newConfig.Docker = *config.Docker 371 } 372 newConfig.Docker.DockerCredentials = dc 373 374 // Mark machine as used 375 details.State = machineStateUsed 376 details.Used = time.Now() 377 details.UsedCount++ 378 m.totalActions.WithLabelValues("used").Inc() 379 return 380 } 381 382 func (m *machineProvider) Release(config *common.RunnerConfig, data common.ExecutorData) { 383 // Release machine 384 details, ok := data.(*machineDetails) 385 if ok { 386 // Mark last used time when is Used 387 if details.State == machineStateUsed { 388 details.Used = time.Now() 389 } 390 391 // Remove machine if we already used it 392 if config != nil && config.Machine != nil && 393 config.Machine.MaxBuilds > 0 && details.UsedCount >= config.Machine.MaxBuilds { 394 err := m.remove(details.Name, "Too many builds") 395 if err == nil { 396 return 397 } 398 } 399 details.State = machineStateIdle 400 } 401 } 402 403 func (m *machineProvider) CanCreate() bool { 404 return m.provider.CanCreate() 405 } 406 407 func (m *machineProvider) GetFeatures(features *common.FeaturesInfo) error { 408 return m.provider.GetFeatures(features) 409 } 410 411 func (m *machineProvider) GetDefaultShell() string { 412 return m.provider.GetDefaultShell() 413 } 414 415 func (m *machineProvider) Create() common.Executor { 416 return &machineExecutor{ 417 provider: m, 418 } 419 } 420 421 func newMachineProvider(name, executor string) *machineProvider { 422 provider := common.GetExecutor(executor) 423 if provider == nil { 424 logrus.Panicln("Missing", executor) 425 } 426 427 return &machineProvider{ 428 name: name, 429 details: make(machinesDetails), 430 machine: docker_helpers.NewMachineCommand(), 431 provider: provider, 432 totalActions: prometheus.NewCounterVec( 433 prometheus.CounterOpts{ 434 Name: "gitlab_runner_autoscaling_actions_total", 435 Help: "The total number of actions executed by the provider.", 436 ConstLabels: prometheus.Labels{ 437 "executor": name, 438 }, 439 }, 440 []string{"action"}, 441 ), 442 currentStatesDesc: prometheus.NewDesc( 443 "gitlab_runner_autoscaling_machine_states", 444 "The current number of machines per state in this provider.", 445 []string{"state"}, 446 prometheus.Labels{ 447 "executor": name, 448 }, 449 ), 450 creationHistogram: prometheus.NewHistogram( 451 prometheus.HistogramOpts{ 452 Name: "gitlab_runner_autoscaling_machine_creation_duration_seconds", 453 Help: "Histogram of machine creation time.", 454 Buckets: prometheus.ExponentialBuckets(30, 1.25, 10), 455 ConstLabels: prometheus.Labels{ 456 "executor": name, 457 }, 458 }, 459 ), 460 } 461 }