github.com/nilium/gitlab-runner@v12.5.0+incompatible/commands/multi.go (about) 1 package commands 2 3 import ( 4 "errors" 5 "fmt" 6 "net" 7 "net/http" 8 "net/http/pprof" 9 "os" 10 "os/signal" 11 "runtime" 12 "syscall" 13 "time" 14 15 service "github.com/ayufan/golang-kardianos-service" 16 "github.com/prometheus/client_golang/prometheus" 17 "github.com/prometheus/client_golang/prometheus/promhttp" 18 "github.com/sirupsen/logrus" 19 "github.com/urfave/cli" 20 21 "gitlab.com/gitlab-org/gitlab-runner/common" 22 "gitlab.com/gitlab-org/gitlab-runner/helpers" 23 "gitlab.com/gitlab-org/gitlab-runner/helpers/certificate" 24 prometheus_helper "gitlab.com/gitlab-org/gitlab-runner/helpers/prometheus" 25 "gitlab.com/gitlab-org/gitlab-runner/helpers/sentry" 26 service_helpers "gitlab.com/gitlab-org/gitlab-runner/helpers/service" 27 "gitlab.com/gitlab-org/gitlab-runner/log" 28 "gitlab.com/gitlab-org/gitlab-runner/network" 29 "gitlab.com/gitlab-org/gitlab-runner/session" 30 ) 31 32 var ( 33 concurrentDesc = prometheus.NewDesc( 34 "gitlab_runner_concurrent", 35 "The current value of concurrent setting", 36 nil, 37 nil, 38 ) 39 40 limitDesc = prometheus.NewDesc( 41 "gitlab_runner_limit", 42 "The current value of concurrent setting", 43 []string{"runner"}, 44 nil, 45 ) 46 ) 47 48 type RunCommand struct { 49 configOptionsWithListenAddress 50 network common.Network 51 healthHelper 52 53 buildsHelper buildsHelper 54 55 ServiceName string `short:"n" long:"service" description:"Use different names for different services"` 56 WorkingDirectory string `short:"d" long:"working-directory" description:"Specify custom working directory"` 57 User string `short:"u" long:"user" description:"Use specific user to execute shell scripts"` 58 Syslog bool `long:"syslog" description:"Log to system service logger" env:"LOG_SYSLOG"` 59 60 sentryLogHook sentry.LogHook 61 prometheusLogHook prometheus_helper.LogHook 62 63 failuresCollector *prometheus_helper.FailuresCollector 64 networkRequestStatusesCollector prometheus.Collector 65 66 sessionServer *session.Server 67 68 // abortBuilds is used to abort running builds 69 abortBuilds chan os.Signal 70 71 // runSignal is used to abort current operation (scaling workers, waiting for config) 72 runSignal chan os.Signal 73 74 // reloadSignal is used to trigger forceful config reload 75 reloadSignal chan os.Signal 76 77 // stopSignals is to catch a signals notified to process: SIGTERM, SIGQUIT, Interrupt, Kill 78 stopSignals chan os.Signal 79 80 // stopSignal is used to preserve the signal that was used to stop the 81 // process In case this is SIGQUIT it makes to finish all builds and session 82 // server. 83 stopSignal os.Signal 84 85 // runFinished is used to notify that Run() did finish 86 runFinished chan bool 87 88 currentWorkers int 89 } 90 91 func (mr *RunCommand) log() *logrus.Entry { 92 return logrus.WithField("builds", mr.buildsHelper.buildsCount()) 93 } 94 95 func (mr *RunCommand) feedRunner(runner *common.RunnerConfig, runners chan *common.RunnerConfig) { 96 if !mr.isHealthy(runner.UniqueID()) { 97 return 98 } 99 100 runners <- runner 101 } 102 103 func (mr *RunCommand) feedRunners(runners chan *common.RunnerConfig) { 104 for mr.stopSignal == nil { 105 mr.log().Debugln("Feeding runners to channel") 106 config := mr.config 107 108 // If no runners wait full interval to test again 109 if len(config.Runners) == 0 { 110 time.Sleep(config.GetCheckInterval()) 111 continue 112 } 113 114 interval := config.GetCheckInterval() / time.Duration(len(config.Runners)) 115 116 // Feed runner with waiting exact amount of time 117 for _, runner := range config.Runners { 118 mr.feedRunner(runner, runners) 119 time.Sleep(interval) 120 } 121 } 122 } 123 124 func (mr *RunCommand) requeueRunner(runner *common.RunnerConfig, runners chan *common.RunnerConfig) { 125 select { 126 case runners <- runner: 127 mr.log().WithField("runner", runner.ShortDescription()).Debugln("Requeued the runner") 128 129 default: 130 mr.log().WithField("runner", runner.ShortDescription()).Debugln("Failed to requeue the runner: ") 131 } 132 } 133 134 // requestJob will check if the runner can send another concurrent request to 135 // GitLab, if not the return value is nil. 136 func (mr *RunCommand) requestJob(runner *common.RunnerConfig, sessionInfo *common.SessionInfo) (common.JobTrace, *common.JobResponse, error) { 137 if !mr.buildsHelper.acquireRequest(runner) { 138 mr.log().WithField("runner", runner.ShortDescription()). 139 Debugln("Failed to request job: runner requestConcurrency meet") 140 return nil, nil, nil 141 } 142 defer mr.buildsHelper.releaseRequest(runner) 143 144 jobData, healthy := mr.network.RequestJob(*runner, sessionInfo) 145 mr.makeHealthy(runner.UniqueID(), healthy) 146 147 if jobData == nil { 148 return nil, nil, nil 149 } 150 151 // Make sure to always close output 152 jobCredentials := &common.JobCredentials{ 153 ID: jobData.ID, 154 Token: jobData.Token, 155 } 156 157 trace, err := mr.network.ProcessJob(*runner, jobCredentials) 158 if err != nil { 159 jobInfo := common.UpdateJobInfo{ 160 ID: jobCredentials.ID, 161 State: common.Failed, 162 FailureReason: common.RunnerSystemFailure, 163 } 164 165 // send failure once 166 mr.network.UpdateJob(*runner, jobCredentials, jobInfo) 167 return nil, nil, err 168 } 169 170 trace.SetFailuresCollector(mr.failuresCollector) 171 return trace, jobData, nil 172 } 173 174 func (mr *RunCommand) processRunner(id int, runner *common.RunnerConfig, runners chan *common.RunnerConfig) (err error) { 175 provider := common.GetExecutor(runner.Executor) 176 if provider == nil { 177 return 178 } 179 180 executorData, err := provider.Acquire(runner) 181 if err != nil { 182 return fmt.Errorf("failed to update executor: %v", err) 183 } 184 defer provider.Release(runner, executorData) 185 186 if !mr.buildsHelper.acquireBuild(runner) { 187 logrus.WithField("runner", runner.ShortDescription()). 188 Debug("Failed to request job, runner limit met") 189 return 190 } 191 defer mr.buildsHelper.releaseBuild(runner) 192 193 buildSession, sessionInfo, err := mr.createSession(provider) 194 if err != nil { 195 return 196 } 197 198 // Receive a new build 199 trace, jobData, err := mr.requestJob(runner, sessionInfo) 200 if err != nil || jobData == nil { 201 return 202 } 203 defer func() { 204 if err != nil { 205 fmt.Fprintln(trace, err.Error()) 206 trace.Fail(err, common.RunnerSystemFailure) 207 } else { 208 trace.Fail(nil, common.NoneFailure) 209 } 210 }() 211 212 // Create a new build 213 build, err := common.NewBuild(*jobData, runner, mr.abortBuilds, executorData) 214 if err != nil { 215 return 216 } 217 build.Session = buildSession 218 219 // Add build to list of builds to assign numbers 220 mr.buildsHelper.addBuild(build) 221 defer mr.buildsHelper.removeBuild(build) 222 223 // Process the same runner by different worker again 224 // to speed up taking the builds 225 mr.requeueRunner(runner, runners) 226 227 // Process a build 228 return build.Run(mr.config, trace) 229 } 230 231 func (mr *RunCommand) createSession(provider common.ExecutorProvider) (*session.Session, *common.SessionInfo, error) { 232 var features common.FeaturesInfo 233 234 if err := provider.GetFeatures(&features); err != nil { 235 return nil, nil, err 236 } 237 238 if mr.sessionServer == nil || !features.Session { 239 return nil, nil, nil 240 } 241 242 sess, err := session.NewSession(mr.log()) 243 if err != nil { 244 return nil, nil, err 245 } 246 247 sessionInfo := &common.SessionInfo{ 248 URL: mr.sessionServer.AdvertiseAddress + sess.Endpoint, 249 Certificate: string(mr.sessionServer.CertificatePublicKey), 250 Authorization: sess.Token, 251 } 252 253 return sess, sessionInfo, err 254 } 255 256 func (mr *RunCommand) processRunners(id int, stopWorker chan bool, runners chan *common.RunnerConfig) { 257 mr.log().WithField("worker", id).Debugln("Starting worker") 258 for mr.stopSignal == nil { 259 select { 260 case runner := <-runners: 261 err := mr.processRunner(id, runner, runners) 262 if err != nil { 263 mr.log().WithFields(logrus.Fields{ 264 "runner": runner.ShortDescription(), 265 "executor": runner.Executor, 266 }).WithError(err). 267 Warn("Failed to process runner") 268 } 269 270 // force GC cycle after processing build 271 runtime.GC() 272 273 case <-stopWorker: 274 mr.log().WithField("worker", id).Debugln("Stopping worker") 275 return 276 } 277 } 278 <-stopWorker 279 } 280 281 func (mr *RunCommand) startWorkers(startWorker chan int, stopWorker chan bool, runners chan *common.RunnerConfig) { 282 for mr.stopSignal == nil { 283 id := <-startWorker 284 go mr.processRunners(id, stopWorker, runners) 285 } 286 } 287 288 func (mr *RunCommand) loadConfig() error { 289 err := mr.configOptions.loadConfig() 290 if err != nil { 291 return err 292 } 293 294 // Set log level 295 err = mr.updateLoggingConfiguration() 296 if err != nil { 297 return err 298 } 299 300 // pass user to execute scripts as specific user 301 if mr.User != "" { 302 mr.config.User = mr.User 303 } 304 305 mr.healthy = nil 306 mr.log().Println("Configuration loaded") 307 mr.log().Debugln(helpers.ToYAML(mr.config)) 308 309 // initialize sentry 310 if mr.config.SentryDSN != nil { 311 var err error 312 mr.sentryLogHook, err = sentry.NewLogHook(*mr.config.SentryDSN) 313 if err != nil { 314 mr.log().WithError(err).Errorln("Sentry failure") 315 } 316 } else { 317 mr.sentryLogHook = sentry.LogHook{} 318 } 319 320 return nil 321 } 322 323 func (mr *RunCommand) updateLoggingConfiguration() error { 324 reloadNeeded := false 325 326 if mr.config.LogLevel != nil && !log.Configuration().IsLevelSetWithCli() { 327 err := log.Configuration().SetLevel(*mr.config.LogLevel) 328 if err != nil { 329 return err 330 } 331 332 reloadNeeded = true 333 } 334 335 if mr.config.LogFormat != nil && !log.Configuration().IsFormatSetWithCli() { 336 err := log.Configuration().SetFormat(*mr.config.LogFormat) 337 if err != nil { 338 return err 339 } 340 341 reloadNeeded = true 342 } 343 344 if reloadNeeded { 345 log.Configuration().ReloadConfiguration() 346 } 347 348 return nil 349 } 350 351 func (mr *RunCommand) checkConfig() (err error) { 352 info, err := os.Stat(mr.ConfigFile) 353 if err != nil { 354 return err 355 } 356 357 if !mr.config.ModTime.Before(info.ModTime()) { 358 return nil 359 } 360 361 err = mr.loadConfig() 362 if err != nil { 363 mr.log().Errorln("Failed to load config", err) 364 // don't reload the same file 365 mr.config.ModTime = info.ModTime() 366 return 367 } 368 return nil 369 } 370 371 func (mr *RunCommand) Start(s service.Service) error { 372 mr.abortBuilds = make(chan os.Signal) 373 mr.runSignal = make(chan os.Signal, 1) 374 mr.reloadSignal = make(chan os.Signal, 1) 375 mr.runFinished = make(chan bool, 1) 376 mr.stopSignals = make(chan os.Signal) 377 mr.log().Println("Starting multi-runner from", mr.ConfigFile, "...") 378 379 userModeWarning(false) 380 381 if len(mr.WorkingDirectory) > 0 { 382 err := os.Chdir(mr.WorkingDirectory) 383 if err != nil { 384 return err 385 } 386 } 387 388 err := mr.loadConfig() 389 if err != nil { 390 return err 391 } 392 393 // Start should not block. Do the actual work async. 394 go mr.RunWithLock() 395 396 return nil 397 } 398 399 func (mr *RunCommand) updateWorkers(workerIndex *int, startWorker chan int, stopWorker chan bool) os.Signal { 400 buildLimit := mr.config.Concurrent 401 402 if buildLimit < 1 { 403 mr.log().Fatalln("Concurrent is less than 1 - no jobs will be processed") 404 } 405 406 for mr.currentWorkers > buildLimit { 407 select { 408 case stopWorker <- true: 409 case signaled := <-mr.runSignal: 410 return signaled 411 } 412 mr.currentWorkers-- 413 } 414 415 for mr.currentWorkers < buildLimit { 416 select { 417 case startWorker <- *workerIndex: 418 case signaled := <-mr.runSignal: 419 return signaled 420 } 421 mr.currentWorkers++ 422 *workerIndex++ 423 } 424 425 return nil 426 } 427 428 func (mr *RunCommand) updateConfig() os.Signal { 429 select { 430 case <-time.After(common.ReloadConfigInterval * time.Second): 431 err := mr.checkConfig() 432 if err != nil { 433 mr.log().Errorln("Failed to load config", err) 434 } 435 436 case <-mr.reloadSignal: 437 err := mr.loadConfig() 438 if err != nil { 439 mr.log().Errorln("Failed to load config", err) 440 } 441 442 case signaled := <-mr.runSignal: 443 return signaled 444 } 445 return nil 446 } 447 448 func (mr *RunCommand) runWait() { 449 mr.log().Debugln("Waiting for stop signal") 450 451 // Save the stop signal and exit to execute Stop() 452 mr.stopSignal = <-mr.stopSignals 453 } 454 455 func (mr *RunCommand) serveMetrics(mux *http.ServeMux) { 456 registry := prometheus.NewRegistry() 457 // Metrics about the runner's business logic. 458 registry.MustRegister(&mr.buildsHelper) 459 registry.MustRegister(mr) 460 // Metrics about API connections 461 registry.MustRegister(mr.networkRequestStatusesCollector) 462 // Metrics about jobs failures 463 registry.MustRegister(mr.failuresCollector) 464 // Metrics about catched errors 465 registry.MustRegister(&mr.prometheusLogHook) 466 // Metrics about the program's build version. 467 registry.MustRegister(common.AppVersion.NewMetricsCollector()) 468 // Go-specific metrics about the process (GC stats, goroutines, etc.). 469 registry.MustRegister(prometheus.NewGoCollector()) 470 // Go-unrelated process metrics (memory usage, file descriptors, etc.). 471 registry.MustRegister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{})) 472 473 // Register all executor provider collectors 474 for _, provider := range common.GetExecutorProviders() { 475 if collector, ok := provider.(prometheus.Collector); ok && collector != nil { 476 registry.MustRegister(collector) 477 } 478 } 479 480 mux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{})) 481 } 482 483 func (mr *RunCommand) serveDebugData(mux *http.ServeMux) { 484 mux.HandleFunc("/debug/jobs/list", mr.buildsHelper.ListJobsHandler) 485 } 486 487 func (mr *RunCommand) servePprof(mux *http.ServeMux) { 488 mux.HandleFunc("/debug/pprof/", pprof.Index) 489 mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) 490 mux.HandleFunc("/debug/pprof/profile", pprof.Profile) 491 mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol) 492 mux.HandleFunc("/debug/pprof/trace", pprof.Trace) 493 } 494 495 func (mr *RunCommand) setupMetricsAndDebugServer() { 496 listenAddress, err := mr.listenAddress() 497 498 if err != nil { 499 mr.log().Errorf("invalid listen address: %s", err.Error()) 500 return 501 } 502 503 if listenAddress == "" { 504 mr.log().Info("listen_address not defined, metrics & debug endpoints disabled") 505 return 506 } 507 508 // We separate out the listener creation here so that we can return an error if 509 // the provided address is invalid or there is some other listener error. 510 listener, err := net.Listen("tcp", listenAddress) 511 if err != nil { 512 mr.log().WithError(err).Fatal("Failed to create listener for metrics server") 513 } 514 515 mux := http.NewServeMux() 516 517 go func() { 518 err := http.Serve(listener, mux) 519 if err != nil { 520 mr.log().WithError(err).Fatal("Metrics server terminated") 521 } 522 }() 523 524 mr.serveMetrics(mux) 525 mr.serveDebugData(mux) 526 mr.servePprof(mux) 527 528 mr.log(). 529 WithField("address", listenAddress). 530 Info("Metrics server listening") 531 } 532 533 func (mr *RunCommand) setupSessionServer() { 534 if mr.config.SessionServer.ListenAddress == "" { 535 mr.log().Info("[session_server].listen_address not defined, session endpoints disabled") 536 return 537 } 538 539 var err error 540 mr.sessionServer, err = session.NewServer( 541 session.ServerConfig{ 542 AdvertiseAddress: mr.config.SessionServer.AdvertiseAddress, 543 ListenAddress: mr.config.SessionServer.ListenAddress, 544 ShutdownTimeout: common.ShutdownTimeout * time.Second, 545 }, 546 mr.log(), 547 certificate.X509Generator{}, 548 mr.buildsHelper.findSessionByURL, 549 ) 550 if err != nil { 551 mr.log().WithError(err).Fatal("Failed to create session server") 552 } 553 554 go func() { 555 err := mr.sessionServer.Start() 556 if err != nil { 557 mr.log().WithError(err).Fatal("Session server terminated") 558 } 559 }() 560 561 mr.log(). 562 WithField("address", mr.config.SessionServer.ListenAddress). 563 Info("Session server listening") 564 } 565 566 func (mr *RunCommand) RunWithLock() { 567 log := mr.log().WithFields(logrus.Fields{ 568 "file": mr.ConfigFile, 569 "pid": os.Getpid(), 570 }) 571 log.Info("Locking configuration file") 572 573 err := mr.inLock(mr.Run) 574 if err != nil { 575 log.WithError(err).Fatal("Could not handle configuration file locking") 576 } 577 } 578 579 func (mr *RunCommand) Run() { 580 mr.setupMetricsAndDebugServer() 581 mr.setupSessionServer() 582 583 runners := make(chan *common.RunnerConfig) 584 go mr.feedRunners(runners) 585 586 signal.Notify(mr.stopSignals, syscall.SIGQUIT, syscall.SIGTERM, os.Interrupt, os.Kill) 587 signal.Notify(mr.reloadSignal, syscall.SIGHUP) 588 589 startWorker := make(chan int) 590 stopWorker := make(chan bool) 591 go mr.startWorkers(startWorker, stopWorker, runners) 592 593 workerIndex := 0 594 595 for mr.stopSignal == nil { 596 signaled := mr.updateWorkers(&workerIndex, startWorker, stopWorker) 597 if signaled != nil { 598 break 599 } 600 601 signaled = mr.updateConfig() 602 if signaled != nil { 603 break 604 } 605 } 606 607 // Wait for workers to shutdown 608 for mr.currentWorkers > 0 { 609 stopWorker <- true 610 mr.currentWorkers-- 611 } 612 mr.log().Println("All workers stopped. Can exit now") 613 close(mr.runFinished) 614 } 615 616 func (mr *RunCommand) interruptRun() { 617 // Pump interrupt signal 618 for { 619 mr.runSignal <- mr.stopSignal 620 } 621 } 622 623 func (mr *RunCommand) abortAllBuilds() { 624 // Pump signal to abort all current builds 625 for { 626 mr.abortBuilds <- mr.stopSignal 627 } 628 } 629 630 func (mr *RunCommand) handleGracefulShutdown() error { 631 // We wait till we have a SIGQUIT 632 for mr.stopSignal == syscall.SIGQUIT { 633 mr.log().Warningln("Requested quit, waiting for builds to finish") 634 635 // Wait for other signals to finish builds 636 select { 637 case mr.stopSignal = <-mr.stopSignals: 638 // We received a new signal 639 640 case <-mr.runFinished: 641 // Everything finished we can exit now 642 return nil 643 } 644 } 645 646 return fmt.Errorf("received: %v", mr.stopSignal) 647 } 648 649 func (mr *RunCommand) handleShutdown() error { 650 mr.log().Warningln("Requested service stop:", mr.stopSignal) 651 652 go mr.abortAllBuilds() 653 654 if mr.sessionServer != nil { 655 mr.sessionServer.Close() 656 } 657 658 // Wait for graceful shutdown or abort after timeout 659 for { 660 select { 661 case mr.stopSignal = <-mr.stopSignals: 662 return fmt.Errorf("forced exit: %v", mr.stopSignal) 663 664 case <-time.After(common.ShutdownTimeout * time.Second): 665 return errors.New("shutdown timed out") 666 667 case <-mr.runFinished: 668 // Everything finished we can exit now 669 return nil 670 } 671 } 672 } 673 674 func (mr *RunCommand) Stop(s service.Service) (err error) { 675 go mr.interruptRun() 676 err = mr.handleGracefulShutdown() 677 if err == nil { 678 return 679 } 680 err = mr.handleShutdown() 681 return 682 } 683 684 // Describe implements prometheus.Collector. 685 func (mr *RunCommand) Describe(ch chan<- *prometheus.Desc) { 686 ch <- concurrentDesc 687 ch <- limitDesc 688 } 689 690 // Collect implements prometheus.Collector. 691 func (mr *RunCommand) Collect(ch chan<- prometheus.Metric) { 692 config := mr.config 693 694 ch <- prometheus.MustNewConstMetric( 695 concurrentDesc, 696 prometheus.GaugeValue, 697 float64(config.Concurrent), 698 ) 699 700 for _, runner := range config.Runners { 701 ch <- prometheus.MustNewConstMetric( 702 limitDesc, 703 prometheus.GaugeValue, 704 float64(runner.Limit), 705 runner.ShortDescription(), 706 ) 707 } 708 } 709 710 func (mr *RunCommand) Execute(context *cli.Context) { 711 svcConfig := &service.Config{ 712 Name: mr.ServiceName, 713 DisplayName: mr.ServiceName, 714 Description: defaultDescription, 715 Arguments: []string{"run"}, 716 Option: service.KeyValue{ 717 "RunWait": mr.runWait, 718 }, 719 } 720 721 svc, err := service_helpers.New(mr, svcConfig) 722 if err != nil { 723 logrus.Fatalln(err) 724 } 725 726 if mr.Syslog { 727 log.SetSystemLogger(logrus.StandardLogger(), svc) 728 } 729 730 logrus.AddHook(&mr.sentryLogHook) 731 logrus.AddHook(&mr.prometheusLogHook) 732 733 err = svc.Run() 734 if err != nil { 735 logrus.Fatalln(err) 736 } 737 } 738 739 func init() { 740 requestStatusesCollector := network.NewAPIRequestStatusesMap() 741 742 common.RegisterCommand2("run", "run multi runner service", &RunCommand{ 743 ServiceName: defaultServiceName, 744 network: network.NewGitLabClientWithRequestStatusesMap(requestStatusesCollector), 745 networkRequestStatusesCollector: requestStatusesCollector, 746 prometheusLogHook: prometheus_helper.NewLogHook(), 747 failuresCollector: prometheus_helper.NewFailuresCollector(), 748 buildsHelper: newBuildsHelper(), 749 }) 750 }