github.com/yrj2011/jx-test-infra@v0.0.0-20190529031832-7a2065ee98eb/prow/plank/controller.go (about) 1 /* 2 Copyright 2017 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package plank 18 19 import ( 20 "bytes" 21 "fmt" 22 "strings" 23 "sync" 24 "time" 25 26 "github.com/bwmarrin/snowflake" 27 "github.com/sirupsen/logrus" 28 "k8s.io/api/core/v1" 29 30 "k8s.io/test-infra/prow/config" 31 "k8s.io/test-infra/prow/github" 32 "k8s.io/test-infra/prow/kube" 33 "k8s.io/test-infra/prow/pjutil" 34 "k8s.io/test-infra/prow/pod-utils/decorate" 35 reportlib "k8s.io/test-infra/prow/report" 36 ) 37 38 const ( 39 testInfra = "https://github.com/kubernetes/test-infra/issues" 40 ) 41 42 type kubeClient interface { 43 CreateProwJob(kube.ProwJob) (kube.ProwJob, error) 44 ListProwJobs(string) ([]kube.ProwJob, error) 45 ReplaceProwJob(string, kube.ProwJob) (kube.ProwJob, error) 46 47 CreatePod(v1.Pod) (kube.Pod, error) 48 ListPods(string) ([]kube.Pod, error) 49 DeletePod(string) error 50 } 51 52 // GitHubClient contains the methods used by plank on k8s.io/test-infra/prow/github.Client 53 // Plank's unit tests implement a fake of this. 54 type GitHubClient interface { 55 BotName() (string, error) 56 CreateStatus(org, repo, ref string, s github.Status) error 57 ListIssueComments(org, repo string, number int) ([]github.IssueComment, error) 58 CreateComment(org, repo string, number int, comment string) error 59 DeleteComment(org, repo string, ID int) error 60 EditComment(org, repo string, ID int, comment string) error 61 GetPullRequestChanges(org, repo string, number int) ([]github.PullRequestChange, error) 62 } 63 64 type configAgent interface { 65 Config() *config.Config 66 } 67 68 // TODO: Dry this out 69 type syncFn func(pj kube.ProwJob, pm map[string]kube.Pod, reports chan<- kube.ProwJob) error 70 71 // Controller manages ProwJobs. 72 type Controller struct { 73 kc kubeClient 74 pkcs map[string]kubeClient 75 ghc GitHubClient 76 log *logrus.Entry 77 ca configAgent 78 node *snowflake.Node 79 totURL string 80 // selector that will be applied on prowjobs and pods. 81 selector string 82 83 lock sync.RWMutex 84 // pendingJobs is a short-lived cache that helps in limiting 85 // the maximum concurrency of jobs. 86 pendingJobs map[string]int 87 88 pjLock sync.RWMutex 89 // shared across the controller and a goroutine that gathers metrics. 90 pjs []kube.ProwJob 91 } 92 93 // NewController creates a new Controller from the provided clients. 94 func NewController(kc *kube.Client, pkcs map[string]*kube.Client, ghc GitHubClient, logger *logrus.Entry, ca *config.Agent, totURL, selector string) (*Controller, error) { 95 n, err := snowflake.NewNode(1) 96 if err != nil { 97 return nil, err 98 } 99 if logger == nil { 100 logger = logrus.NewEntry(logrus.StandardLogger()) 101 } 102 buildClusters := map[string]kubeClient{} 103 for alias, client := range pkcs { 104 buildClusters[alias] = kubeClient(client) 105 } 106 return &Controller{ 107 kc: kc, 108 pkcs: buildClusters, 109 ghc: ghc, 110 log: logger, 111 ca: ca, 112 node: n, 113 pendingJobs: make(map[string]int), 114 totURL: totURL, 115 selector: selector, 116 }, nil 117 } 118 119 // canExecuteConcurrently checks whether the provided ProwJob can 120 // be executed concurrently. 121 func (c *Controller) canExecuteConcurrently(pj *kube.ProwJob) bool { 122 c.lock.Lock() 123 defer c.lock.Unlock() 124 125 if max := c.ca.Config().Plank.MaxConcurrency; max > 0 { 126 var running int 127 for _, num := range c.pendingJobs { 128 running += num 129 } 130 if running >= max { 131 c.log.WithFields(pjutil.ProwJobFields(pj)).Debugf("Not starting another job, already %d running.", running) 132 return false 133 } 134 } 135 136 if pj.Spec.MaxConcurrency == 0 { 137 c.pendingJobs[pj.Spec.Job]++ 138 return true 139 } 140 141 numPending := c.pendingJobs[pj.Spec.Job] 142 if numPending >= pj.Spec.MaxConcurrency { 143 c.log.WithFields(pjutil.ProwJobFields(pj)).Debugf("Not starting another instance of %s, already %d running.", pj.Spec.Job, numPending) 144 return false 145 } 146 c.pendingJobs[pj.Spec.Job]++ 147 return true 148 } 149 150 // incrementNumPendingJobs increments the amount of 151 // pending ProwJobs for the given job identifier 152 func (c *Controller) incrementNumPendingJobs(job string) { 153 c.lock.Lock() 154 defer c.lock.Unlock() 155 c.pendingJobs[job]++ 156 } 157 158 // Sync does one sync iteration. 159 func (c *Controller) Sync() error { 160 pjs, err := c.kc.ListProwJobs(c.selector) 161 if err != nil { 162 return fmt.Errorf("error listing prow jobs: %v", err) 163 } 164 selector := fmt.Sprintf("%s=true", kube.CreatedByProw) 165 if len(c.selector) > 0 { 166 selector = strings.Join([]string{c.selector, selector}, ",") 167 } 168 pm := map[string]kube.Pod{} 169 for alias, client := range c.pkcs { 170 // hack: get all pods as knativie build pods dont have CreatedByProw=true label 171 pods, err := client.ListPods("") 172 if err != nil { 173 return fmt.Errorf("error listing pods in cluster %q: %v", alias, err) 174 } 175 for _, pod := range pods { 176 pm[pod.ObjectMeta.Name] = pod 177 } 178 } 179 // TODO: Replace the following filtering with a field selector once CRDs support field selectors. 180 // https://github.com/kubernetes/kubernetes/issues/53459 181 var k8sJobs []kube.ProwJob 182 for _, pj := range pjs { 183 if pj.Spec.Agent == kube.KubernetesAgent || pj.Spec.Agent == kube.BuildAgent { 184 k8sJobs = append(k8sJobs, pj) 185 } 186 } 187 pjs = k8sJobs 188 189 var syncErrs []error 190 if err := c.terminateDupes(pjs, pm); err != nil { 191 syncErrs = append(syncErrs, err) 192 } 193 194 // Share what we have for gathering metrics. 195 c.pjLock.Lock() 196 c.pjs = pjs 197 c.pjLock.Unlock() 198 199 pendingCh, triggeredCh := pjutil.PartitionActive(pjs) 200 errCh := make(chan error, len(pjs)) 201 reportCh := make(chan kube.ProwJob, len(pjs)) 202 203 // Reinstantiate on every resync of the controller instead of trying 204 // to keep this in sync with the state of the world. 205 c.pendingJobs = make(map[string]int) 206 // Sync pending jobs first so we can determine what is the maximum 207 // number of new jobs we can trigger when syncing the non-pendings. 208 maxSyncRoutines := c.ca.Config().Plank.MaxGoroutines 209 c.log.Debugf("Handling %d pending prowjobs", len(pendingCh)) 210 syncProwJobs(c.log, c.syncPendingJob, maxSyncRoutines, pendingCh, reportCh, errCh, pm) 211 c.log.Debugf("Handling %d triggered prowjobs", len(triggeredCh)) 212 syncProwJobs(c.log, c.syncTriggeredJob, maxSyncRoutines, triggeredCh, reportCh, errCh, pm) 213 214 close(errCh) 215 close(reportCh) 216 217 for err := range errCh { 218 syncErrs = append(syncErrs, err) 219 } 220 221 var reportErrs []error 222 if c.ghc != nil { 223 reportTemplate := c.ca.Config().Plank.ReportTemplate 224 for report := range reportCh { 225 if err := reportlib.Report(c.ghc, reportTemplate, report); err != nil { 226 reportErrs = append(reportErrs, err) 227 c.log.WithFields(pjutil.ProwJobFields(&report)).WithError(err).Warn("Failed to report ProwJob status") 228 } 229 } 230 } 231 232 if len(syncErrs) == 0 && len(reportErrs) == 0 { 233 return nil 234 } 235 return fmt.Errorf("errors syncing: %v, errors reporting: %v", syncErrs, reportErrs) 236 } 237 238 // SyncMetrics records metrics for the cached prowjobs. 239 func (c *Controller) SyncMetrics() { 240 c.pjLock.RLock() 241 defer c.pjLock.RUnlock() 242 kube.GatherProwJobMetrics(c.pjs) 243 } 244 245 // terminateDupes aborts presubmits that have a newer version. It modifies pjs 246 // in-place when it aborts. 247 // TODO: Dry this out - need to ensure we can abstract children cancellation first. 248 func (c *Controller) terminateDupes(pjs []kube.ProwJob, pm map[string]kube.Pod) error { 249 // "job org/repo#number" -> newest job 250 dupes := make(map[string]int) 251 for i, pj := range pjs { 252 if pj.Complete() || pj.Spec.Type != kube.PresubmitJob { 253 continue 254 } 255 n := fmt.Sprintf("%s %s/%s#%d", pj.Spec.Job, pj.Spec.Refs.Org, pj.Spec.Refs.Repo, pj.Spec.Refs.Pulls[0].Number) 256 prev, ok := dupes[n] 257 if !ok { 258 dupes[n] = i 259 continue 260 } 261 cancelIndex := i 262 if (&pjs[prev].Status.StartTime).Before(&pj.Status.StartTime) { 263 cancelIndex = prev 264 dupes[n] = i 265 } 266 toCancel := pjs[cancelIndex] 267 // Allow aborting presubmit jobs for commits that have been superseded by 268 // newer commits in Github pull requests. 269 if c.ca.Config().Plank.AllowCancellations { 270 if pod, exists := pm[toCancel.ObjectMeta.Name]; exists { 271 if client, ok := c.pkcs[toCancel.ClusterAlias()]; !ok { 272 c.log.WithFields(pjutil.ProwJobFields(&toCancel)).Errorf("Unknown cluster alias %q.", toCancel.ClusterAlias()) 273 } else if err := client.DeletePod(pod.ObjectMeta.Name); err != nil { 274 c.log.WithError(err).WithFields(pjutil.ProwJobFields(&toCancel)).Warn("Cannot delete pod") 275 } 276 } 277 } 278 toCancel.SetComplete() 279 prevState := toCancel.Status.State 280 toCancel.Status.State = kube.AbortedState 281 c.log.WithFields(pjutil.ProwJobFields(&toCancel)). 282 WithField("from", prevState). 283 WithField("to", toCancel.Status.State).Info("Transitioning states.") 284 npj, err := c.kc.ReplaceProwJob(toCancel.ObjectMeta.Name, toCancel) 285 if err != nil { 286 return err 287 } 288 pjs[cancelIndex] = npj 289 } 290 return nil 291 } 292 293 // TODO: Dry this out 294 func syncProwJobs( 295 l *logrus.Entry, 296 syncFn syncFn, 297 maxSyncRoutines int, 298 jobs <-chan kube.ProwJob, 299 reports chan<- kube.ProwJob, 300 syncErrors chan<- error, 301 pm map[string]kube.Pod, 302 ) { 303 goroutines := maxSyncRoutines 304 if goroutines > len(jobs) { 305 goroutines = len(jobs) 306 } 307 wg := &sync.WaitGroup{} 308 wg.Add(goroutines) 309 l.Debugf("Firing up %d goroutines", goroutines) 310 for i := 0; i < goroutines; i++ { 311 go func() { 312 defer wg.Done() 313 for pj := range jobs { 314 if err := syncFn(pj, pm, reports); err != nil { 315 syncErrors <- err 316 } 317 } 318 }() 319 } 320 wg.Wait() 321 } 322 323 func (c *Controller) syncPendingJob(pj kube.ProwJob, pm map[string]kube.Pod, reports chan<- kube.ProwJob) error { 324 // Record last known state so we can log state transitions. 325 prevState := pj.Status.State 326 podExists := false 327 var pod kube.Pod 328 if pj.Spec.Agent == kube.BuildAgent { 329 for _, p := range pm { 330 bn, exist := p.Labels["build-name"] 331 if exist && bn == pj.Name { 332 podExists = true 333 pod = p 334 break 335 } 336 } 337 } else { 338 pod, podExists = pm[pj.ObjectMeta.Name] 339 } 340 if !podExists { 341 c.incrementNumPendingJobs(pj.Spec.Job) 342 // Pod is missing. This can happen in case the previous pod was deleted manually or by 343 // a rescheduler. Start a new pod. 344 id, pn, err := c.startPod(pj) 345 if err != nil { 346 _, isUnprocessable := err.(kube.UnprocessableEntityError) 347 if !isUnprocessable { 348 return fmt.Errorf("error starting pod: %v", err) 349 } 350 pj.Status.State = kube.ErrorState 351 pj.SetComplete() 352 pj.Status.Description = "Job cannot be processed." 353 c.log.WithFields(pjutil.ProwJobFields(&pj)).WithError(err).Warning("Unprocessable pod.") 354 } else { 355 pj.Status.BuildID = id 356 pj.Status.PodName = pn 357 c.log.WithFields(pjutil.ProwJobFields(&pj)).Info("Pod is missing, starting a new pod") 358 } 359 } else { 360 switch pod.Status.Phase { 361 case kube.PodUnknown: 362 c.incrementNumPendingJobs(pj.Spec.Job) 363 // Pod is in Unknown state. This can happen if there is a problem with 364 // the node. Delete the old pod, we'll start a new one next loop. 365 c.log.WithFields(pjutil.ProwJobFields(&pj)).Info("Pod is in unknown state, deleting & restarting pod") 366 client, ok := c.pkcs[pj.ClusterAlias()] 367 if !ok { 368 return fmt.Errorf("Unknown cluster alias %q.", pj.ClusterAlias()) 369 } 370 return client.DeletePod(pj.ObjectMeta.Name) 371 372 case kube.PodSucceeded: 373 // Pod succeeded. Update ProwJob, talk to GitHub, and start next jobs. 374 pj.SetComplete() 375 pj.Status.State = kube.SuccessState 376 pj.Status.Description = "Job succeeded." 377 for _, nj := range pj.Spec.RunAfterSuccess { 378 child := pjutil.NewProwJob(nj, pj.ObjectMeta.Labels) 379 if c.ghc != nil && !c.RunAfterSuccessCanRun(&pj, &child, c.ca, c.ghc) { 380 continue 381 } 382 if _, err := c.kc.CreateProwJob(pjutil.NewProwJob(nj, pj.ObjectMeta.Labels)); err != nil { 383 return fmt.Errorf("error starting next prowjob: %v", err) 384 } 385 } 386 387 case kube.PodFailed: 388 if pod.Status.Reason == kube.Evicted { 389 c.incrementNumPendingJobs(pj.Spec.Job) 390 // Pod was evicted. We will recreate it in the next resync. 391 client, ok := c.pkcs[pj.ClusterAlias()] 392 if !ok { 393 return fmt.Errorf("Unknown cluster alias %q.", pj.ClusterAlias()) 394 } 395 return client.DeletePod(pj.ObjectMeta.Name) 396 } 397 // Pod failed. Update ProwJob, talk to GitHub. 398 pj.SetComplete() 399 pj.Status.State = kube.FailureState 400 pj.Status.Description = "Job failed." 401 402 case kube.PodPending: 403 maxPodPending := c.ca.Config().Plank.PodPendingTimeout 404 405 if pod.Status.StartTime.IsZero() || time.Since(pod.Status.StartTime.Time) < maxPodPending { 406 if pj.Status.State != kube.PendingState { 407 c.incrementNumPendingJobs(pj.Spec.Job) 408 pj.Status.State = kube.PendingState 409 pj.Status.Description = "Job pending." 410 } else { 411 logrus.Info("original code") 412 // Pod is running. Do nothing. 413 c.incrementNumPendingJobs(pj.Spec.Job) 414 // dont return as knative builds may not have updated the prowjob yet 415 } 416 417 } else { 418 // Pod is stuck in pending state longer than maxPodPending 419 // abort the job, and talk to Github 420 pj.SetComplete() 421 pj.Status.State = kube.AbortedState 422 pj.Status.Description = "Job aborted." 423 } 424 425 default: 426 // Pod is running. Do nothing. 427 c.incrementNumPendingJobs(pj.Spec.Job) 428 return nil 429 } 430 } 431 432 var b bytes.Buffer 433 if err := c.ca.Config().Plank.JobURLTemplate.Execute(&b, &pj); err != nil { 434 c.log.WithFields(pjutil.ProwJobFields(&pj)).Errorf("error executing URL template: %v", err) 435 } else { 436 pj.Status.URL = b.String() 437 } 438 reports <- pj 439 if prevState != pj.Status.State { 440 c.log.WithFields(pjutil.ProwJobFields(&pj)). 441 WithField("from", prevState). 442 WithField("to", pj.Status.State).Info("Transitioning states.") 443 } 444 _, err := c.kc.ReplaceProwJob(pj.ObjectMeta.Name, pj) 445 return err 446 } 447 448 func (c *Controller) syncTriggeredJob(pj kube.ProwJob, pm map[string]kube.Pod, reports chan<- kube.ProwJob) error { 449 // Record last known state so we can log state transitions. 450 prevState := pj.Status.State 451 452 var id, pn string 453 pod, podExists := pm[pj.ObjectMeta.Name] 454 // We may end up in a state where the pod exists but the prowjob is not 455 // updated to pending if we successfully create a new pod in a previous 456 // sync but the prowjob update fails. Simply ignore creating a new pod 457 // and rerun the prowjob update. 458 if !podExists { 459 // Do not start more jobs than specified. 460 if !c.canExecuteConcurrently(&pj) { 461 return nil 462 } 463 // We haven't started the pod yet. Do so. 464 var err error 465 id, pn, err = c.startPod(pj) 466 if err != nil { 467 _, isUnprocessable := err.(kube.UnprocessableEntityError) 468 if !isUnprocessable { 469 return fmt.Errorf("error starting pod: %v", err) 470 } 471 pj.Status.State = kube.ErrorState 472 pj.SetComplete() 473 pj.Status.Description = "Job cannot be processed." 474 logrus.WithField("job", pj.Spec.Job).WithError(err).Warning("Unprocessable pod.") 475 } 476 } else { 477 id = getPodBuildID(&pod) 478 pn = pod.ObjectMeta.Name 479 } 480 481 if pj.Status.State == kube.TriggeredState { 482 // BuildID needs to be set before we execute the job url template. 483 pj.Status.BuildID = id 484 pj.Status.State = kube.PendingState 485 pj.Status.PodName = pn 486 pj.Status.Description = "Job triggered." 487 var b bytes.Buffer 488 if err := c.ca.Config().Plank.JobURLTemplate.Execute(&b, &pj); err != nil { 489 c.log.WithFields(pjutil.ProwJobFields(&pj)).Errorf("error executing URL template: %v", err) 490 } else { 491 pj.Status.URL = b.String() 492 } 493 } 494 reports <- pj 495 if prevState != pj.Status.State { 496 c.log.WithFields(pjutil.ProwJobFields(&pj)). 497 WithField("from", prevState). 498 WithField("to", pj.Status.State).Info("Transitioning states.") 499 } 500 _, err := c.kc.ReplaceProwJob(pj.ObjectMeta.Name, pj) 501 return err 502 } 503 504 // TODO: No need to return the pod name since we already have the 505 // prowjob in the call site. 506 func (c *Controller) startPod(pj kube.ProwJob) (string, string, error) { 507 buildID, err := c.getBuildID(pj.Spec.Job) 508 if err != nil { 509 return "", "", fmt.Errorf("error getting build ID: %v", err) 510 } 511 512 pod, err := decorate.ProwJobToPod(pj, buildID) 513 if err != nil { 514 return "", "", err 515 } 516 517 client, ok := c.pkcs[pj.ClusterAlias()] 518 if !ok { 519 return "", "", fmt.Errorf("Unknown cluster alias %q.", pj.ClusterAlias()) 520 } 521 actual, err := client.CreatePod(*pod) 522 if err != nil { 523 return "", "", err 524 } 525 return buildID, actual.ObjectMeta.Name, nil 526 } 527 528 func (c *Controller) getBuildID(name string) (string, error) { 529 if c.totURL == "" { 530 return c.node.Generate().String(), nil 531 } 532 return pjutil.GetBuildID(name, c.totURL) 533 } 534 535 func getPodBuildID(pod *kube.Pod) string { 536 for _, env := range pod.Spec.Containers[0].Env { 537 if env.Name == "BUILD_NUMBER" { 538 return env.Value 539 } 540 } 541 logrus.Warningf("BUILD_NUMBER was not found in pod %q: streaming logs from deck will not work", pod.ObjectMeta.Name) 542 return "" 543 } 544 545 // RunAfterSuccessCanRun returns whether a child job (specified as run_after_success in the 546 // prow config) can run once its parent job succeeds. The only case we will not run a child job 547 // is when it is a presubmit job and has a run_if_changed regular expression specified which does 548 // not match the changed filenames in the pull request the job was meant to run for. 549 // TODO: Collapse with Jenkins, impossible to reuse as is due to the interfaces. 550 func (c *Controller) RunAfterSuccessCanRun(parent, child *kube.ProwJob, ca configAgent, ghc GitHubClient) bool { 551 if parent.Spec.Type != kube.PresubmitJob { 552 return true 553 } 554 555 // TODO: Make sure that parent and child have always the same org/repo. 556 org := parent.Spec.Refs.Org 557 repo := parent.Spec.Refs.Repo 558 prNum := parent.Spec.Refs.Pulls[0].Number 559 560 ps := ca.Config().GetPresubmit(org+"/"+repo, child.Spec.Job) 561 if ps == nil { 562 // The config has changed ever since we started the parent. 563 // Not sure what is more correct here. Run the child for now. 564 return true 565 } 566 if ps.RunIfChanged == "" { 567 return true 568 } 569 changesFull, err := ghc.GetPullRequestChanges(org, repo, prNum) 570 if err != nil { 571 c.log.WithError(err).WithFields(pjutil.ProwJobFields(parent)).Warnf("Cannot get PR changes for #%d", prNum) 572 return true 573 } 574 // We only care about the filenames here 575 var changes []string 576 for _, change := range changesFull { 577 changes = append(changes, change.Filename) 578 } 579 return ps.RunsAgainstChanges(changes) 580 }