github.com/secure-build/gitlab-runner@v12.5.0+incompatible/executors/docker/machine/provider.go (about) 1 package machine 2 3 import ( 4 "errors" 5 "fmt" 6 "sync" 7 "time" 8 9 "github.com/prometheus/client_golang/prometheus" 10 "github.com/sirupsen/logrus" 11 12 "gitlab.com/gitlab-org/gitlab-runner/common" 13 docker_helpers "gitlab.com/gitlab-org/gitlab-runner/helpers/docker" 14 ) 15 16 type machineProvider struct { 17 name string 18 machine docker_helpers.Machine 19 details machinesDetails 20 lock sync.RWMutex 21 acquireLock sync.Mutex 22 // provider stores a real executor that is used to start run the builds 23 provider common.ExecutorProvider 24 25 stuckRemoveLock sync.Mutex 26 27 // metrics 28 totalActions *prometheus.CounterVec 29 currentStatesDesc *prometheus.Desc 30 creationHistogram prometheus.Histogram 31 } 32 33 func (m *machineProvider) machineDetails(name string, acquire bool) *machineDetails { 34 m.lock.Lock() 35 defer m.lock.Unlock() 36 37 details, ok := m.details[name] 38 if !ok { 39 details = &machineDetails{ 40 Name: name, 41 Created: time.Now(), 42 Used: time.Now(), 43 LastSeen: time.Now(), 44 UsedCount: 1, // any machine that we find we mark as already used 45 State: machineStateIdle, 46 } 47 m.details[name] = details 48 } 49 50 if acquire { 51 if details.isUsed() { 52 return nil 53 } 54 details.State = machineStateAcquired 55 } 56 57 return details 58 } 59 60 func (m *machineProvider) create(config *common.RunnerConfig, state machineState) (details *machineDetails, errCh chan error) { 61 name := newMachineName(config) 62 details = m.machineDetails(name, true) 63 details.State = machineStateCreating 64 details.UsedCount = 0 65 details.RetryCount = 0 66 details.LastSeen = time.Now() 67 errCh = make(chan error, 1) 68 69 // Create machine asynchronously 70 go func() { 71 started := time.Now() 72 err := m.machine.Create(config.Machine.MachineDriver, details.Name, config.Machine.MachineOptions...) 73 for i := 0; i < 3 && err != nil; i++ { 74 details.RetryCount++ 75 logrus.WithField("name", details.Name). 76 WithError(err). 77 Warningln("Machine creation failed, trying to provision") 78 time.Sleep(provisionRetryInterval) 79 err = m.machine.Provision(details.Name) 80 } 81 82 if err != nil { 83 logrus.WithField("name", details.Name). 84 WithField("time", time.Since(started)). 85 WithError(err). 86 Errorln("Machine creation failed") 87 m.remove(details.Name, "Failed to create") 88 } else { 89 details.State = state 90 details.Used = time.Now() 91 creationTime := time.Since(started) 92 logrus.WithField("duration", creationTime). 93 WithField("name", details.Name). 94 WithField("now", time.Now()). 95 WithField("retries", details.RetryCount). 96 Infoln("Machine created") 97 m.totalActions.WithLabelValues("created").Inc() 98 m.creationHistogram.Observe(creationTime.Seconds()) 99 } 100 errCh <- err 101 }() 102 return 103 } 104 105 func (m *machineProvider) findFreeMachine(skipCache bool, machines ...string) (details *machineDetails) { 106 // Enumerate all machines in reverse order, to always take the newest machines first 107 for idx := range machines { 108 name := machines[len(machines)-idx-1] 109 details := m.machineDetails(name, true) 110 if details == nil { 111 continue 112 } 113 114 // Check if node is running 115 canConnect := m.machine.CanConnect(name, skipCache) 116 if !canConnect { 117 m.remove(name, "machine is unavailable") 118 continue 119 } 120 return details 121 } 122 123 return nil 124 } 125 126 func (m *machineProvider) useMachine(config *common.RunnerConfig) (details *machineDetails, err error) { 127 machines, err := m.loadMachines(config) 128 if err != nil { 129 return 130 } 131 details = m.findFreeMachine(true, machines...) 132 if details == nil { 133 var errCh chan error 134 details, errCh = m.create(config, machineStateAcquired) 135 err = <-errCh 136 } 137 return 138 } 139 140 func (m *machineProvider) retryUseMachine(config *common.RunnerConfig) (details *machineDetails, err error) { 141 // Try to find a machine 142 for i := 0; i < 3; i++ { 143 details, err = m.useMachine(config) 144 if err == nil { 145 break 146 } 147 time.Sleep(provisionRetryInterval) 148 } 149 return 150 } 151 152 func (m *machineProvider) removeMachine(details *machineDetails) (err error) { 153 if !m.machine.Exist(details.Name) { 154 details.logger(). 155 Warningln("Skipping machine removal, because it doesn't exist") 156 return nil 157 } 158 159 // This code limits amount of removal of stuck machines to one machine per interval 160 if details.isStuckOnRemove() { 161 m.stuckRemoveLock.Lock() 162 defer m.stuckRemoveLock.Unlock() 163 } 164 165 details.logger(). 166 Warningln("Stopping machine") 167 err = m.machine.Stop(details.Name, machineStopCommandTimeout) 168 if err != nil { 169 details.logger(). 170 WithError(err). 171 Warningln("Error while stopping machine") 172 } 173 174 details.logger(). 175 Warningln("Removing machine") 176 err = m.machine.Remove(details.Name) 177 if err != nil { 178 details.RetryCount++ 179 time.Sleep(removeRetryInterval) 180 return err 181 } 182 183 return nil 184 } 185 186 func (m *machineProvider) finalizeRemoval(details *machineDetails) { 187 for { 188 err := m.removeMachine(details) 189 if err == nil { 190 break 191 } 192 } 193 194 m.lock.Lock() 195 defer m.lock.Unlock() 196 delete(m.details, details.Name) 197 198 details.logger(). 199 WithField("now", time.Now()). 200 WithField("retries", details.RetryCount). 201 Infoln("Machine removed") 202 203 m.totalActions.WithLabelValues("removed").Inc() 204 } 205 206 func (m *machineProvider) remove(machineName string, reason ...interface{}) error { 207 m.lock.Lock() 208 defer m.lock.Unlock() 209 210 details, _ := m.details[machineName] 211 if details == nil { 212 return errors.New("machine not found") 213 } 214 215 details.Reason = fmt.Sprint(reason...) 216 details.State = machineStateRemoving 217 details.RetryCount = 0 218 219 details.logger(). 220 WithField("now", time.Now()). 221 Warningln("Requesting machine removal") 222 223 details.Used = time.Now() 224 details.writeDebugInformation() 225 226 go m.finalizeRemoval(details) 227 return nil 228 } 229 230 func (m *machineProvider) updateMachine(config *common.RunnerConfig, data *machinesData, details *machineDetails) error { 231 if details.State != machineStateIdle { 232 return nil 233 } 234 235 if config.Machine.MaxBuilds > 0 && details.UsedCount >= config.Machine.MaxBuilds { 236 // Limit number of builds 237 return errors.New("too many builds") 238 } 239 240 if data.Total() >= config.Limit && config.Limit > 0 { 241 // Limit maximum number of machines 242 return errors.New("too many machines") 243 } 244 245 if time.Since(details.Used) > time.Second*time.Duration(config.Machine.GetIdleTime()) { 246 if data.Idle >= config.Machine.GetIdleCount() { 247 // Remove machine that are way over the idle time 248 return errors.New("too many idle machines") 249 } 250 } 251 return nil 252 } 253 254 func (m *machineProvider) updateMachines(machines []string, config *common.RunnerConfig) (data machinesData, validMachines []string) { 255 data.Runner = config.ShortDescription() 256 validMachines = make([]string, 0, len(machines)) 257 258 for _, name := range machines { 259 details := m.machineDetails(name, false) 260 details.LastSeen = time.Now() 261 262 err := m.updateMachine(config, &data, details) 263 if err == nil { 264 validMachines = append(validMachines, name) 265 } else { 266 m.remove(details.Name, err) 267 } 268 269 data.Add(details) 270 } 271 return 272 } 273 274 func (m *machineProvider) createMachines(config *common.RunnerConfig, data *machinesData) { 275 // Create a new machines and mark them as Idle 276 for { 277 if data.Available() >= config.Machine.GetIdleCount() { 278 // Limit maximum number of idle machines 279 break 280 } 281 if data.Total() >= config.Limit && config.Limit > 0 { 282 // Limit maximum number of machines 283 break 284 } 285 m.create(config, machineStateIdle) 286 data.Creating++ 287 } 288 } 289 290 // intermediateMachineList returns a list of machines that might not yet be 291 // persisted on disk, these machines are the ones between being virtually 292 // created, and `docker-machine create` getting executed we populate this data 293 // set to overcome the race conditions related to not-full set of machines 294 // returned by `docker-machine ls -q` 295 func (m *machineProvider) intermediateMachineList(excludedMachines []string) []string { 296 var excludedSet map[string]struct{} 297 var intermediateMachines []string 298 299 m.lock.Lock() 300 defer m.lock.Unlock() 301 302 for _, details := range m.details { 303 if details.isPersistedOnDisk() { 304 continue 305 } 306 307 // lazy init set, as most of times we don't create new machines 308 if excludedSet == nil { 309 excludedSet = make(map[string]struct{}, len(excludedMachines)) 310 for _, excludedMachine := range excludedMachines { 311 excludedSet[excludedMachine] = struct{}{} 312 } 313 } 314 315 if _, ok := excludedSet[details.Name]; ok { 316 continue 317 } 318 319 intermediateMachines = append(intermediateMachines, details.Name) 320 } 321 322 return intermediateMachines 323 } 324 325 func (m *machineProvider) loadMachines(config *common.RunnerConfig) (machines []string, err error) { 326 machines, err = m.machine.List() 327 if err != nil { 328 return nil, err 329 } 330 331 machines = append(machines, m.intermediateMachineList(machines)...) 332 machines = filterMachineList(machines, machineFilter(config)) 333 return 334 } 335 336 func (m *machineProvider) Acquire(config *common.RunnerConfig) (data common.ExecutorData, err error) { 337 if config.Machine == nil || config.Machine.MachineName == "" { 338 err = fmt.Errorf("missing Machine options") 339 return 340 } 341 342 // Lock updating machines, because two Acquires can be run at the same time 343 m.acquireLock.Lock() 344 defer m.acquireLock.Unlock() 345 346 machines, err := m.loadMachines(config) 347 if err != nil { 348 return 349 } 350 351 // Update a list of currently configured machines 352 machinesData, validMachines := m.updateMachines(machines, config) 353 354 // Pre-create machines 355 m.createMachines(config, &machinesData) 356 357 logrus.WithFields(machinesData.Fields()). 358 WithField("runner", config.ShortDescription()). 359 WithField("minIdleCount", config.Machine.GetIdleCount()). 360 WithField("maxMachines", config.Limit). 361 WithField("time", time.Now()). 362 Debugln("Docker Machine Details") 363 machinesData.writeDebugInformation() 364 365 // Try to find a free machine 366 details := m.findFreeMachine(false, validMachines...) 367 if details != nil { 368 data = details 369 return 370 } 371 372 // If we have a free machines we can process a build 373 if config.Machine.GetIdleCount() != 0 && machinesData.Idle == 0 { 374 err = errors.New("no free machines that can process builds") 375 } 376 return 377 } 378 379 func (m *machineProvider) Use(config *common.RunnerConfig, data common.ExecutorData) (newConfig common.RunnerConfig, newData common.ExecutorData, err error) { 380 // Find a new machine 381 details, _ := data.(*machineDetails) 382 if details == nil || !details.canBeUsed() || !m.machine.CanConnect(details.Name, true) { 383 details, err = m.retryUseMachine(config) 384 if err != nil { 385 return 386 } 387 388 // Return details only if this is a new instance 389 newData = details 390 } 391 392 // Get machine credentials 393 dc, err := m.machine.Credentials(details.Name) 394 if err != nil { 395 if newData != nil { 396 m.Release(config, newData) 397 } 398 newData = nil 399 return 400 } 401 402 // Create shallow copy of config and store in it docker credentials 403 newConfig = *config 404 newConfig.Docker = &common.DockerConfig{} 405 if config.Docker != nil { 406 *newConfig.Docker = *config.Docker 407 } 408 newConfig.Docker.DockerCredentials = dc 409 410 // Mark machine as used 411 details.State = machineStateUsed 412 details.Used = time.Now() 413 details.UsedCount++ 414 m.totalActions.WithLabelValues("used").Inc() 415 return 416 } 417 418 func (m *machineProvider) Release(config *common.RunnerConfig, data common.ExecutorData) { 419 // Release machine 420 details, ok := data.(*machineDetails) 421 if ok { 422 // Mark last used time when is Used 423 if details.State == machineStateUsed { 424 details.Used = time.Now() 425 } 426 427 // Remove machine if we already used it 428 if config != nil && config.Machine != nil && 429 config.Machine.MaxBuilds > 0 && details.UsedCount >= config.Machine.MaxBuilds { 430 err := m.remove(details.Name, "Too many builds") 431 if err == nil { 432 return 433 } 434 } 435 details.State = machineStateIdle 436 } 437 } 438 439 func (m *machineProvider) CanCreate() bool { 440 return m.provider.CanCreate() 441 } 442 443 func (m *machineProvider) GetFeatures(features *common.FeaturesInfo) error { 444 return m.provider.GetFeatures(features) 445 } 446 447 func (m *machineProvider) GetDefaultShell() string { 448 return m.provider.GetDefaultShell() 449 } 450 451 func (m *machineProvider) Create() common.Executor { 452 return &machineExecutor{ 453 provider: m, 454 } 455 } 456 457 func newMachineProvider(name, executor string) *machineProvider { 458 provider := common.GetExecutor(executor) 459 if provider == nil { 460 logrus.Panicln("Missing", executor) 461 } 462 463 return &machineProvider{ 464 name: name, 465 details: make(machinesDetails), 466 machine: docker_helpers.NewMachineCommand(), 467 provider: provider, 468 totalActions: prometheus.NewCounterVec( 469 prometheus.CounterOpts{ 470 Name: "gitlab_runner_autoscaling_actions_total", 471 Help: "The total number of actions executed by the provider.", 472 ConstLabels: prometheus.Labels{ 473 "executor": name, 474 }, 475 }, 476 []string{"action"}, 477 ), 478 currentStatesDesc: prometheus.NewDesc( 479 "gitlab_runner_autoscaling_machine_states", 480 "The current number of machines per state in this provider.", 481 []string{"state"}, 482 prometheus.Labels{ 483 "executor": name, 484 }, 485 ), 486 creationHistogram: prometheus.NewHistogram( 487 prometheus.HistogramOpts{ 488 Name: "gitlab_runner_autoscaling_machine_creation_duration_seconds", 489 Help: "Histogram of machine creation time.", 490 Buckets: prometheus.ExponentialBuckets(30, 1.25, 10), 491 ConstLabels: prometheus.Labels{ 492 "executor": name, 493 }, 494 }, 495 ), 496 } 497 }