gitlab.com/jfprevost/gitlab-runner-notlscheck@v11.11.4+incompatible/commands/multi.go (about) 1 package commands 2 3 import ( 4 "errors" 5 "fmt" 6 "net" 7 "net/http" 8 "net/http/pprof" 9 "os" 10 "os/signal" 11 "runtime" 12 "syscall" 13 "time" 14 15 "github.com/ayufan/golang-kardianos-service" 16 "github.com/prometheus/client_golang/prometheus" 17 "github.com/prometheus/client_golang/prometheus/promhttp" 18 "github.com/sirupsen/logrus" 19 "github.com/urfave/cli" 20 21 "gitlab.com/gitlab-org/gitlab-runner/common" 22 "gitlab.com/gitlab-org/gitlab-runner/helpers" 23 "gitlab.com/gitlab-org/gitlab-runner/helpers/certificate" 24 prometheus_helper "gitlab.com/gitlab-org/gitlab-runner/helpers/prometheus" 25 "gitlab.com/gitlab-org/gitlab-runner/helpers/sentry" 26 "gitlab.com/gitlab-org/gitlab-runner/helpers/service" 27 "gitlab.com/gitlab-org/gitlab-runner/log" 28 "gitlab.com/gitlab-org/gitlab-runner/network" 29 "gitlab.com/gitlab-org/gitlab-runner/session" 30 ) 31 32 var ( 33 concurrentDesc = prometheus.NewDesc( 34 "gitlab_runner_concurrent", 35 "The current value of concurrent setting", 36 nil, 37 nil, 38 ) 39 40 limitDesc = prometheus.NewDesc( 41 "gitlab_runner_limit", 42 "The current value of concurrent setting", 43 []string{"runner"}, 44 nil, 45 ) 46 ) 47 48 type RunCommand struct { 49 configOptionsWithListenAddress 50 network common.Network 51 healthHelper 52 53 buildsHelper buildsHelper 54 55 ServiceName string `short:"n" long:"service" description:"Use different names for different services"` 56 WorkingDirectory string `short:"d" long:"working-directory" description:"Specify custom working directory"` 57 User string `short:"u" long:"user" description:"Use specific user to execute shell scripts"` 58 Syslog bool `long:"syslog" description:"Log to system service logger" env:"LOG_SYSLOG"` 59 60 sentryLogHook sentry.LogHook 61 prometheusLogHook prometheus_helper.LogHook 62 63 failuresCollector *prometheus_helper.FailuresCollector 64 networkRequestStatusesCollector prometheus.Collector 65 66 sessionServer *session.Server 67 68 // abortBuilds is used to abort running builds 69 abortBuilds chan os.Signal 70 71 // runSignal is used to abort current operation (scaling workers, waiting for config) 72 runSignal chan os.Signal 73 74 // reloadSignal is used to trigger forceful config reload 75 reloadSignal chan os.Signal 76 77 // stopSignals is to catch a signals notified to process: SIGTERM, SIGQUIT, Interrupt, Kill 78 stopSignals chan os.Signal 79 80 // stopSignal is used to preserve the signal that was used to stop the 81 // process In case this is SIGQUIT it makes to finish all builds and session 82 // server. 83 stopSignal os.Signal 84 85 // runFinished is used to notify that Run() did finish 86 runFinished chan bool 87 88 currentWorkers int 89 } 90 91 func (mr *RunCommand) log() *logrus.Entry { 92 return logrus.WithField("builds", mr.buildsHelper.buildsCount()) 93 } 94 95 func (mr *RunCommand) feedRunner(runner *common.RunnerConfig, runners chan *common.RunnerConfig) { 96 if !mr.isHealthy(runner.UniqueID()) { 97 return 98 } 99 100 runners <- runner 101 } 102 103 func (mr *RunCommand) feedRunners(runners chan *common.RunnerConfig) { 104 for mr.stopSignal == nil { 105 mr.log().Debugln("Feeding runners to channel") 106 config := mr.config 107 108 // If no runners wait full interval to test again 109 if len(config.Runners) == 0 { 110 time.Sleep(config.GetCheckInterval()) 111 continue 112 } 113 114 interval := config.GetCheckInterval() / time.Duration(len(config.Runners)) 115 116 // Feed runner with waiting exact amount of time 117 for _, runner := range config.Runners { 118 mr.feedRunner(runner, runners) 119 time.Sleep(interval) 120 } 121 } 122 } 123 124 // requestJob will check if the runner can send another concurrent request to 125 // GitLab, if not the return value is nil. 126 func (mr *RunCommand) requestJob(runner *common.RunnerConfig, sessionInfo *common.SessionInfo) *common.JobResponse { 127 if !mr.buildsHelper.acquireRequest(runner) { 128 mr.log().WithField("runner", runner.ShortDescription()). 129 Debugln("Failed to request job: runner requestConcurrency meet") 130 131 return nil 132 } 133 defer mr.buildsHelper.releaseRequest(runner) 134 135 jobData, healthy := mr.network.RequestJob(*runner, sessionInfo) 136 mr.makeHealthy(runner.UniqueID(), healthy) 137 138 return jobData 139 } 140 141 func (mr *RunCommand) processRunner(id int, runner *common.RunnerConfig, runners chan *common.RunnerConfig) (err error) { 142 provider := common.GetExecutor(runner.Executor) 143 if provider == nil { 144 return 145 } 146 147 executorData, releaseFn, err := mr.acquireRunnerResources(provider, runner) 148 if err != nil { 149 return 150 } 151 defer releaseFn() 152 153 var features common.FeaturesInfo 154 provider.GetFeatures(&features) 155 buildSession, sessionInfo, err := mr.createSession(features) 156 if err != nil { 157 return 158 } 159 160 // Receive a new build 161 jobData := mr.requestJob(runner, sessionInfo) 162 if jobData == nil { 163 return 164 } 165 166 // Make sure to always close output 167 jobCredentials := &common.JobCredentials{ 168 ID: jobData.ID, 169 Token: jobData.Token, 170 } 171 trace := mr.network.ProcessJob(*runner, jobCredentials) 172 defer func() { 173 if err != nil { 174 fmt.Fprintln(trace, err.Error()) 175 trace.Fail(err, common.RunnerSystemFailure) 176 } else { 177 trace.Fail(nil, common.NoneFailure) 178 } 179 }() 180 181 trace.SetFailuresCollector(mr.failuresCollector) 182 183 // Create a new build 184 build, err := common.NewBuild(*jobData, runner, mr.abortBuilds, executorData) 185 if err != nil { 186 return 187 } 188 build.Session = buildSession 189 190 // Add build to list of builds to assign numbers 191 mr.buildsHelper.addBuild(build) 192 defer mr.buildsHelper.removeBuild(build) 193 194 // Process the same runner by different worker again 195 // to speed up taking the builds 196 select { 197 case runners <- runner: 198 mr.log().WithField("runner", runner.ShortDescription()).Debugln("Requeued the runner") 199 200 default: 201 mr.log().WithField("runner", runner.ShortDescription()).Debugln("Failed to requeue the runner: ") 202 } 203 204 // Process a build 205 return build.Run(mr.config, trace) 206 } 207 208 func (mr *RunCommand) acquireRunnerResources(provider common.ExecutorProvider, runner *common.RunnerConfig) (common.ExecutorData, func(), error) { 209 executorData, err := provider.Acquire(runner) 210 if err != nil { 211 return nil, func() {}, fmt.Errorf("failed to update executor: %v", err) 212 } 213 214 if !mr.buildsHelper.acquireBuild(runner) { 215 provider.Release(runner, executorData) 216 return nil, nil, errors.New("failed to request job, runner limit met") 217 } 218 219 releaseFn := func() { 220 mr.buildsHelper.releaseBuild(runner) 221 provider.Release(runner, executorData) 222 } 223 224 return executorData, releaseFn, nil 225 } 226 227 func (mr *RunCommand) createSession(features common.FeaturesInfo) (*session.Session, *common.SessionInfo, error) { 228 if mr.sessionServer == nil || !features.Session { 229 return nil, nil, nil 230 } 231 232 sess, err := session.NewSession(mr.log()) 233 if err != nil { 234 return nil, nil, err 235 } 236 237 sessionInfo := &common.SessionInfo{ 238 URL: mr.sessionServer.AdvertiseAddress + sess.Endpoint, 239 Certificate: string(mr.sessionServer.CertificatePublicKey), 240 Authorization: sess.Token, 241 } 242 243 return sess, sessionInfo, err 244 } 245 246 func (mr *RunCommand) processRunners(id int, stopWorker chan bool, runners chan *common.RunnerConfig) { 247 mr.log().WithField("worker", id).Debugln("Starting worker") 248 for mr.stopSignal == nil { 249 select { 250 case runner := <-runners: 251 err := mr.processRunner(id, runner, runners) 252 if err != nil { 253 mr.log().WithFields(logrus.Fields{ 254 "runner": runner.ShortDescription(), 255 "executor": runner.Executor, 256 }).WithError(err). 257 Error("Failed to process runner") 258 } 259 260 // force GC cycle after processing build 261 runtime.GC() 262 263 case <-stopWorker: 264 mr.log().WithField("worker", id).Debugln("Stopping worker") 265 return 266 } 267 } 268 <-stopWorker 269 } 270 271 func (mr *RunCommand) startWorkers(startWorker chan int, stopWorker chan bool, runners chan *common.RunnerConfig) { 272 for mr.stopSignal == nil { 273 id := <-startWorker 274 go mr.processRunners(id, stopWorker, runners) 275 } 276 } 277 278 func (mr *RunCommand) loadConfig() error { 279 err := mr.configOptions.loadConfig() 280 if err != nil { 281 return err 282 } 283 284 // Set log level 285 err = mr.updateLoggingConfiguration() 286 if err != nil { 287 return err 288 } 289 290 // pass user to execute scripts as specific user 291 if mr.User != "" { 292 mr.config.User = mr.User 293 } 294 295 mr.healthy = nil 296 mr.log().Println("Configuration loaded") 297 mr.log().Debugln(helpers.ToYAML(mr.config)) 298 299 // initialize sentry 300 if mr.config.SentryDSN != nil { 301 var err error 302 mr.sentryLogHook, err = sentry.NewLogHook(*mr.config.SentryDSN) 303 if err != nil { 304 mr.log().WithError(err).Errorln("Sentry failure") 305 } 306 } else { 307 mr.sentryLogHook = sentry.LogHook{} 308 } 309 310 return nil 311 } 312 313 func (mr *RunCommand) updateLoggingConfiguration() error { 314 reloadNeeded := false 315 316 if mr.config.LogLevel != nil && !log.Configuration().IsLevelSetWithCli() { 317 err := log.Configuration().SetLevel(*mr.config.LogLevel) 318 if err != nil { 319 return err 320 } 321 322 reloadNeeded = true 323 } 324 325 if mr.config.LogFormat != nil && !log.Configuration().IsFormatSetWithCli() { 326 err := log.Configuration().SetFormat(*mr.config.LogFormat) 327 if err != nil { 328 return err 329 } 330 331 reloadNeeded = true 332 } 333 334 if reloadNeeded { 335 log.Configuration().ReloadConfiguration() 336 } 337 338 return nil 339 } 340 341 func (mr *RunCommand) checkConfig() (err error) { 342 info, err := os.Stat(mr.ConfigFile) 343 if err != nil { 344 return err 345 } 346 347 if !mr.config.ModTime.Before(info.ModTime()) { 348 return nil 349 } 350 351 err = mr.loadConfig() 352 if err != nil { 353 mr.log().Errorln("Failed to load config", err) 354 // don't reload the same file 355 mr.config.ModTime = info.ModTime() 356 return 357 } 358 return nil 359 } 360 361 func (mr *RunCommand) Start(s service.Service) error { 362 mr.abortBuilds = make(chan os.Signal) 363 mr.runSignal = make(chan os.Signal, 1) 364 mr.reloadSignal = make(chan os.Signal, 1) 365 mr.runFinished = make(chan bool, 1) 366 mr.stopSignals = make(chan os.Signal) 367 mr.log().Println("Starting multi-runner from", mr.ConfigFile, "...") 368 369 userModeWarning(false) 370 371 if len(mr.WorkingDirectory) > 0 { 372 err := os.Chdir(mr.WorkingDirectory) 373 if err != nil { 374 return err 375 } 376 } 377 378 err := mr.loadConfig() 379 if err != nil { 380 return err 381 } 382 383 // Start should not block. Do the actual work async. 384 go mr.Run() 385 386 return nil 387 } 388 389 func (mr *RunCommand) updateWorkers(workerIndex *int, startWorker chan int, stopWorker chan bool) os.Signal { 390 buildLimit := mr.config.Concurrent 391 392 if buildLimit < 1 { 393 mr.log().Fatalln("Concurrent is less than 1 - no jobs will be processed") 394 } 395 396 for mr.currentWorkers > buildLimit { 397 select { 398 case stopWorker <- true: 399 case signaled := <-mr.runSignal: 400 return signaled 401 } 402 mr.currentWorkers-- 403 } 404 405 for mr.currentWorkers < buildLimit { 406 select { 407 case startWorker <- *workerIndex: 408 case signaled := <-mr.runSignal: 409 return signaled 410 } 411 mr.currentWorkers++ 412 *workerIndex++ 413 } 414 415 return nil 416 } 417 418 func (mr *RunCommand) updateConfig() os.Signal { 419 select { 420 case <-time.After(common.ReloadConfigInterval * time.Second): 421 err := mr.checkConfig() 422 if err != nil { 423 mr.log().Errorln("Failed to load config", err) 424 } 425 426 case <-mr.reloadSignal: 427 err := mr.loadConfig() 428 if err != nil { 429 mr.log().Errorln("Failed to load config", err) 430 } 431 432 case signaled := <-mr.runSignal: 433 return signaled 434 } 435 return nil 436 } 437 438 func (mr *RunCommand) runWait() { 439 mr.log().Debugln("Waiting for stop signal") 440 441 // Save the stop signal and exit to execute Stop() 442 mr.stopSignal = <-mr.stopSignals 443 } 444 445 func (mr *RunCommand) serveMetrics(mux *http.ServeMux) { 446 registry := prometheus.NewRegistry() 447 // Metrics about the runner's business logic. 448 registry.MustRegister(&mr.buildsHelper) 449 registry.MustRegister(mr) 450 // Metrics about API connections 451 registry.MustRegister(mr.networkRequestStatusesCollector) 452 // Metrics about jobs failures 453 registry.MustRegister(mr.failuresCollector) 454 // Metrics about catched errors 455 registry.MustRegister(&mr.prometheusLogHook) 456 // Metrics about the program's build version. 457 registry.MustRegister(common.AppVersion.NewMetricsCollector()) 458 // Go-specific metrics about the process (GC stats, goroutines, etc.). 459 registry.MustRegister(prometheus.NewGoCollector()) 460 // Go-unrelated process metrics (memory usage, file descriptors, etc.). 461 registry.MustRegister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{})) 462 463 // Register all executor provider collectors 464 for _, provider := range common.GetExecutorProviders() { 465 if collector, ok := provider.(prometheus.Collector); ok && collector != nil { 466 registry.MustRegister(collector) 467 } 468 } 469 470 mux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{})) 471 } 472 473 func (mr *RunCommand) serveDebugData(mux *http.ServeMux) { 474 mux.HandleFunc("/debug/jobs/list", mr.buildsHelper.ListJobsHandler) 475 } 476 477 func (mr *RunCommand) servePprof(mux *http.ServeMux) { 478 mux.HandleFunc("/debug/pprof/", pprof.Index) 479 mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) 480 mux.HandleFunc("/debug/pprof/profile", pprof.Profile) 481 mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol) 482 mux.HandleFunc("/debug/pprof/trace", pprof.Trace) 483 } 484 485 func (mr *RunCommand) setupMetricsAndDebugServer() { 486 listenAddress, err := mr.listenAddress() 487 488 if err != nil { 489 mr.log().Errorf("invalid listen address: %s", err.Error()) 490 return 491 } 492 493 if listenAddress == "" { 494 mr.log().Info("listen_address not defined, metrics & debug endpoints disabled") 495 return 496 } 497 498 // We separate out the listener creation here so that we can return an error if 499 // the provided address is invalid or there is some other listener error. 500 listener, err := net.Listen("tcp", listenAddress) 501 if err != nil { 502 mr.log().WithError(err).Fatal("Failed to create listener for metrics server") 503 } 504 505 mux := http.NewServeMux() 506 507 go func() { 508 err := http.Serve(listener, mux) 509 if err != nil { 510 mr.log().WithError(err).Fatal("Metrics server terminated") 511 } 512 }() 513 514 mr.serveMetrics(mux) 515 mr.serveDebugData(mux) 516 mr.servePprof(mux) 517 518 mr.log(). 519 WithField("address", listenAddress). 520 Info("Metrics server listening") 521 } 522 523 func (mr *RunCommand) setupSessionServer() { 524 if mr.config.SessionServer.ListenAddress == "" { 525 mr.log().Info("[session_server].listen_address not defined, session endpoints disabled") 526 return 527 } 528 529 var err error 530 mr.sessionServer, err = session.NewServer( 531 session.ServerConfig{ 532 AdvertiseAddress: mr.config.SessionServer.AdvertiseAddress, 533 ListenAddress: mr.config.SessionServer.ListenAddress, 534 ShutdownTimeout: common.ShutdownTimeout * time.Second, 535 }, 536 mr.log(), 537 certificate.X509Generator{}, 538 mr.buildsHelper.findSessionByURL, 539 ) 540 if err != nil { 541 mr.log().WithError(err).Fatal("Failed to create session server") 542 } 543 544 go func() { 545 err := mr.sessionServer.Start() 546 if err != nil { 547 mr.log().WithError(err).Fatal("Session server terminated") 548 } 549 }() 550 551 mr.log(). 552 WithField("address", mr.config.SessionServer.ListenAddress). 553 Info("Session server listening") 554 } 555 556 func (mr *RunCommand) Run() { 557 mr.setupMetricsAndDebugServer() 558 mr.setupSessionServer() 559 560 runners := make(chan *common.RunnerConfig) 561 go mr.feedRunners(runners) 562 563 signal.Notify(mr.stopSignals, syscall.SIGQUIT, syscall.SIGTERM, os.Interrupt, os.Kill) 564 signal.Notify(mr.reloadSignal, syscall.SIGHUP) 565 566 startWorker := make(chan int) 567 stopWorker := make(chan bool) 568 go mr.startWorkers(startWorker, stopWorker, runners) 569 570 workerIndex := 0 571 572 for mr.stopSignal == nil { 573 signaled := mr.updateWorkers(&workerIndex, startWorker, stopWorker) 574 if signaled != nil { 575 break 576 } 577 578 signaled = mr.updateConfig() 579 if signaled != nil { 580 break 581 } 582 } 583 584 // Wait for workers to shutdown 585 for mr.currentWorkers > 0 { 586 stopWorker <- true 587 mr.currentWorkers-- 588 } 589 mr.log().Println("All workers stopped. Can exit now") 590 mr.runFinished <- true 591 } 592 593 func (mr *RunCommand) interruptRun() { 594 // Pump interrupt signal 595 for { 596 mr.runSignal <- mr.stopSignal 597 } 598 } 599 600 func (mr *RunCommand) abortAllBuilds() { 601 // Pump signal to abort all current builds 602 for { 603 mr.abortBuilds <- mr.stopSignal 604 } 605 } 606 607 func (mr *RunCommand) handleGracefulShutdown() error { 608 // We wait till we have a SIGQUIT 609 for mr.stopSignal == syscall.SIGQUIT { 610 mr.log().Warningln("Requested quit, waiting for builds to finish") 611 612 // Wait for other signals to finish builds 613 select { 614 case mr.stopSignal = <-mr.stopSignals: 615 // We received a new signal 616 617 case <-mr.runFinished: 618 // Everything finished we can exit now 619 return nil 620 } 621 } 622 623 return fmt.Errorf("received: %v", mr.stopSignal) 624 } 625 626 func (mr *RunCommand) handleShutdown() error { 627 mr.log().Warningln("Requested service stop:", mr.stopSignal) 628 629 go mr.abortAllBuilds() 630 631 if mr.sessionServer != nil { 632 mr.sessionServer.Close() 633 } 634 635 // Wait for graceful shutdown or abort after timeout 636 for { 637 select { 638 case mr.stopSignal = <-mr.stopSignals: 639 return fmt.Errorf("forced exit: %v", mr.stopSignal) 640 641 case <-time.After(common.ShutdownTimeout * time.Second): 642 return errors.New("shutdown timed out") 643 644 case <-mr.runFinished: 645 // Everything finished we can exit now 646 return nil 647 } 648 } 649 } 650 651 func (mr *RunCommand) Stop(s service.Service) (err error) { 652 go mr.interruptRun() 653 err = mr.handleGracefulShutdown() 654 if err == nil { 655 return 656 } 657 err = mr.handleShutdown() 658 return 659 } 660 661 // Describe implements prometheus.Collector. 662 func (mr *RunCommand) Describe(ch chan<- *prometheus.Desc) { 663 ch <- concurrentDesc 664 ch <- limitDesc 665 } 666 667 // Collect implements prometheus.Collector. 668 func (mr *RunCommand) Collect(ch chan<- prometheus.Metric) { 669 config := mr.config 670 671 ch <- prometheus.MustNewConstMetric( 672 concurrentDesc, 673 prometheus.GaugeValue, 674 float64(config.Concurrent), 675 ) 676 677 for _, runner := range config.Runners { 678 ch <- prometheus.MustNewConstMetric( 679 limitDesc, 680 prometheus.GaugeValue, 681 float64(runner.Limit), 682 runner.ShortDescription(), 683 ) 684 } 685 } 686 687 func (mr *RunCommand) Execute(context *cli.Context) { 688 svcConfig := &service.Config{ 689 Name: mr.ServiceName, 690 DisplayName: mr.ServiceName, 691 Description: defaultDescription, 692 Arguments: []string{"run"}, 693 Option: service.KeyValue{ 694 "RunWait": mr.runWait, 695 }, 696 } 697 698 svc, err := service_helpers.New(mr, svcConfig) 699 if err != nil { 700 logrus.Fatalln(err) 701 } 702 703 if mr.Syslog { 704 log.SetSystemLogger(logrus.StandardLogger(), svc) 705 } 706 707 logrus.AddHook(&mr.sentryLogHook) 708 logrus.AddHook(&mr.prometheusLogHook) 709 710 err = svc.Run() 711 if err != nil { 712 logrus.Fatalln(err) 713 } 714 } 715 716 func init() { 717 requestStatusesCollector := network.NewAPIRequestStatusesMap() 718 719 common.RegisterCommand2("run", "run multi runner service", &RunCommand{ 720 ServiceName: defaultServiceName, 721 network: network.NewGitLabClientWithRequestStatusesMap(requestStatusesCollector), 722 networkRequestStatusesCollector: requestStatusesCollector, 723 prometheusLogHook: prometheus_helper.NewLogHook(), 724 failuresCollector: prometheus_helper.NewFailuresCollector(), 725 buildsHelper: newBuildsHelper(), 726 }) 727 }