github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/prow/plank/controller.go (about) 1 /* 2 Copyright 2017 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package plank 18 19 import ( 20 "bytes" 21 "fmt" 22 "net/url" 23 "path" 24 "strings" 25 "sync" 26 "time" 27 28 "github.com/sirupsen/logrus" 29 "k8s.io/api/core/v1" 30 "k8s.io/test-infra/prow/config" 31 "k8s.io/test-infra/prow/gcsupload" 32 "k8s.io/test-infra/prow/github" 33 "k8s.io/test-infra/prow/kube" 34 "k8s.io/test-infra/prow/pjutil" 35 "k8s.io/test-infra/prow/pod-utils/decorate" 36 "k8s.io/test-infra/prow/pod-utils/downwardapi" 37 reportlib "k8s.io/test-infra/prow/report" 38 ) 39 40 const ( 41 testInfra = "https://github.com/kubernetes/test-infra/issues" 42 ) 43 44 type kubeClient interface { 45 CreateProwJob(kube.ProwJob) (kube.ProwJob, error) 46 ListProwJobs(string) ([]kube.ProwJob, error) 47 ReplaceProwJob(string, kube.ProwJob) (kube.ProwJob, error) 48 49 CreatePod(v1.Pod) (kube.Pod, error) 50 ListPods(string) ([]kube.Pod, error) 51 DeletePod(string) error 52 } 53 54 // GitHubClient contains the methods used by plank on k8s.io/test-infra/prow/github.Client 55 // Plank's unit tests implement a fake of this. 56 type GitHubClient interface { 57 BotName() (string, error) 58 CreateStatus(org, repo, ref string, s github.Status) error 59 ListIssueComments(org, repo string, number int) ([]github.IssueComment, error) 60 CreateComment(org, repo string, number int, comment string) error 61 DeleteComment(org, repo string, ID int) error 62 EditComment(org, repo string, ID int, comment string) error 63 GetPullRequestChanges(org, repo string, number int) ([]github.PullRequestChange, error) 64 } 65 66 type configAgent interface { 67 Config() *config.Config 68 } 69 70 // TODO: Dry this out 71 type syncFn func(pj kube.ProwJob, pm map[string]kube.Pod, reports chan<- kube.ProwJob) error 72 73 // Controller manages ProwJobs. 74 type Controller struct { 75 kc kubeClient 76 pkcs map[string]kubeClient 77 ghc GitHubClient 78 log *logrus.Entry 79 ca configAgent 80 totURL string 81 // selector that will be applied on prowjobs and pods. 82 selector string 83 84 lock sync.RWMutex 85 // pendingJobs is a short-lived cache that helps in limiting 86 // the maximum concurrency of jobs. 87 pendingJobs map[string]int 88 89 pjLock sync.RWMutex 90 // shared across the controller and a goroutine that gathers metrics. 91 pjs []kube.ProwJob 92 93 // if skip report job results to github 94 skipReport bool 95 } 96 97 // NewController creates a new Controller from the provided clients. 98 func NewController(kc *kube.Client, pkcs map[string]*kube.Client, ghc GitHubClient, logger *logrus.Entry, ca *config.Agent, totURL, selector string, skipReport bool) (*Controller, error) { 99 if logger == nil { 100 logger = logrus.NewEntry(logrus.StandardLogger()) 101 } 102 buildClusters := map[string]kubeClient{} 103 for alias, client := range pkcs { 104 buildClusters[alias] = kubeClient(client) 105 } 106 return &Controller{ 107 kc: kc, 108 pkcs: buildClusters, 109 ghc: ghc, 110 log: logger, 111 ca: ca, 112 pendingJobs: make(map[string]int), 113 totURL: totURL, 114 selector: selector, 115 skipReport: skipReport, 116 }, nil 117 } 118 119 // canExecuteConcurrently checks whether the provided ProwJob can 120 // be executed concurrently. 121 func (c *Controller) canExecuteConcurrently(pj *kube.ProwJob) bool { 122 c.lock.Lock() 123 defer c.lock.Unlock() 124 125 if max := c.ca.Config().Plank.MaxConcurrency; max > 0 { 126 var running int 127 for _, num := range c.pendingJobs { 128 running += num 129 } 130 if running >= max { 131 c.log.WithFields(pjutil.ProwJobFields(pj)).Debugf("Not starting another job, already %d running.", running) 132 return false 133 } 134 } 135 136 if pj.Spec.MaxConcurrency == 0 { 137 c.pendingJobs[pj.Spec.Job]++ 138 return true 139 } 140 141 numPending := c.pendingJobs[pj.Spec.Job] 142 if numPending >= pj.Spec.MaxConcurrency { 143 c.log.WithFields(pjutil.ProwJobFields(pj)).Debugf("Not starting another instance of %s, already %d running.", pj.Spec.Job, numPending) 144 return false 145 } 146 c.pendingJobs[pj.Spec.Job]++ 147 return true 148 } 149 150 // incrementNumPendingJobs increments the amount of 151 // pending ProwJobs for the given job identifier 152 func (c *Controller) incrementNumPendingJobs(job string) { 153 c.lock.Lock() 154 defer c.lock.Unlock() 155 c.pendingJobs[job]++ 156 } 157 158 // Sync does one sync iteration. 159 func (c *Controller) Sync() error { 160 pjs, err := c.kc.ListProwJobs(c.selector) 161 if err != nil { 162 return fmt.Errorf("error listing prow jobs: %v", err) 163 } 164 selector := fmt.Sprintf("%s=true", kube.CreatedByProw) 165 if len(c.selector) > 0 { 166 selector = strings.Join([]string{c.selector, selector}, ",") 167 } 168 169 pm := map[string]kube.Pod{} 170 for alias, client := range c.pkcs { 171 pods, err := client.ListPods(selector) 172 if err != nil { 173 return fmt.Errorf("error listing pods in cluster %q: %v", alias, err) 174 } 175 for _, pod := range pods { 176 pm[pod.ObjectMeta.Name] = pod 177 } 178 } 179 // TODO: Replace the following filtering with a field selector once CRDs support field selectors. 180 // https://github.com/kubernetes/kubernetes/issues/53459 181 var k8sJobs []kube.ProwJob 182 for _, pj := range pjs { 183 if pj.Spec.Agent == kube.KubernetesAgent { 184 k8sJobs = append(k8sJobs, pj) 185 } 186 } 187 pjs = k8sJobs 188 189 var syncErrs []error 190 if err := c.terminateDupes(pjs, pm); err != nil { 191 syncErrs = append(syncErrs, err) 192 } 193 194 // Share what we have for gathering metrics. 195 c.pjLock.Lock() 196 c.pjs = pjs 197 c.pjLock.Unlock() 198 199 pendingCh, triggeredCh := pjutil.PartitionActive(pjs) 200 errCh := make(chan error, len(pjs)) 201 reportCh := make(chan kube.ProwJob, len(pjs)) 202 203 // Reinstantiate on every resync of the controller instead of trying 204 // to keep this in sync with the state of the world. 205 c.pendingJobs = make(map[string]int) 206 // Sync pending jobs first so we can determine what is the maximum 207 // number of new jobs we can trigger when syncing the non-pendings. 208 maxSyncRoutines := c.ca.Config().Plank.MaxGoroutines 209 c.log.Debugf("Handling %d pending prowjobs", len(pendingCh)) 210 syncProwJobs(c.log, c.syncPendingJob, maxSyncRoutines, pendingCh, reportCh, errCh, pm) 211 c.log.Debugf("Handling %d triggered prowjobs", len(triggeredCh)) 212 syncProwJobs(c.log, c.syncTriggeredJob, maxSyncRoutines, triggeredCh, reportCh, errCh, pm) 213 214 close(errCh) 215 close(reportCh) 216 217 for err := range errCh { 218 syncErrs = append(syncErrs, err) 219 } 220 221 var reportErrs []error 222 if !c.skipReport { 223 reportTemplate := c.ca.Config().Plank.ReportTemplate 224 for report := range reportCh { 225 if err := reportlib.Report(c.ghc, reportTemplate, report); err != nil { 226 reportErrs = append(reportErrs, err) 227 c.log.WithFields(pjutil.ProwJobFields(&report)).WithError(err).Warn("Failed to report ProwJob status") 228 } 229 } 230 } 231 232 if len(syncErrs) == 0 && len(reportErrs) == 0 { 233 return nil 234 } 235 return fmt.Errorf("errors syncing: %v, errors reporting: %v", syncErrs, reportErrs) 236 } 237 238 // SyncMetrics records metrics for the cached prowjobs. 239 func (c *Controller) SyncMetrics() { 240 c.pjLock.RLock() 241 defer c.pjLock.RUnlock() 242 kube.GatherProwJobMetrics(c.pjs) 243 } 244 245 // terminateDupes aborts presubmits that have a newer version. It modifies pjs 246 // in-place when it aborts. 247 // TODO: Dry this out - need to ensure we can abstract children cancellation first. 248 func (c *Controller) terminateDupes(pjs []kube.ProwJob, pm map[string]kube.Pod) error { 249 // "job org/repo#number" -> newest job 250 dupes := make(map[string]int) 251 for i, pj := range pjs { 252 if pj.Complete() || pj.Spec.Type != kube.PresubmitJob { 253 continue 254 } 255 n := fmt.Sprintf("%s %s/%s#%d", pj.Spec.Job, pj.Spec.Refs.Org, pj.Spec.Refs.Repo, pj.Spec.Refs.Pulls[0].Number) 256 prev, ok := dupes[n] 257 if !ok { 258 dupes[n] = i 259 continue 260 } 261 cancelIndex := i 262 if (&pjs[prev].Status.StartTime).Before(&pj.Status.StartTime) { 263 cancelIndex = prev 264 dupes[n] = i 265 } 266 toCancel := pjs[cancelIndex] 267 // Allow aborting presubmit jobs for commits that have been superseded by 268 // newer commits in Github pull requests. 269 if c.ca.Config().Plank.AllowCancellations { 270 if pod, exists := pm[toCancel.ObjectMeta.Name]; exists { 271 if client, ok := c.pkcs[toCancel.ClusterAlias()]; !ok { 272 c.log.WithFields(pjutil.ProwJobFields(&toCancel)).Errorf("Unknown cluster alias %q.", toCancel.ClusterAlias()) 273 } else if err := client.DeletePod(pod.ObjectMeta.Name); err != nil { 274 c.log.WithError(err).WithFields(pjutil.ProwJobFields(&toCancel)).Warn("Cannot delete pod") 275 } 276 } 277 } 278 toCancel.SetComplete() 279 prevState := toCancel.Status.State 280 toCancel.Status.State = kube.AbortedState 281 c.log.WithFields(pjutil.ProwJobFields(&toCancel)). 282 WithField("from", prevState). 283 WithField("to", toCancel.Status.State).Info("Transitioning states.") 284 npj, err := c.kc.ReplaceProwJob(toCancel.ObjectMeta.Name, toCancel) 285 if err != nil { 286 return err 287 } 288 pjs[cancelIndex] = npj 289 } 290 return nil 291 } 292 293 // TODO: Dry this out 294 func syncProwJobs( 295 l *logrus.Entry, 296 syncFn syncFn, 297 maxSyncRoutines int, 298 jobs <-chan kube.ProwJob, 299 reports chan<- kube.ProwJob, 300 syncErrors chan<- error, 301 pm map[string]kube.Pod, 302 ) { 303 goroutines := maxSyncRoutines 304 if goroutines > len(jobs) { 305 goroutines = len(jobs) 306 } 307 wg := &sync.WaitGroup{} 308 wg.Add(goroutines) 309 l.Debugf("Firing up %d goroutines", goroutines) 310 for i := 0; i < goroutines; i++ { 311 go func() { 312 defer wg.Done() 313 for pj := range jobs { 314 if err := syncFn(pj, pm, reports); err != nil { 315 syncErrors <- err 316 } 317 } 318 }() 319 } 320 wg.Wait() 321 } 322 323 func (c *Controller) syncPendingJob(pj kube.ProwJob, pm map[string]kube.Pod, reports chan<- kube.ProwJob) error { 324 // Record last known state so we can log state transitions. 325 prevState := pj.Status.State 326 327 pod, podExists := pm[pj.ObjectMeta.Name] 328 if !podExists { 329 c.incrementNumPendingJobs(pj.Spec.Job) 330 // Pod is missing. This can happen in case the previous pod was deleted manually or by 331 // a rescheduler. Start a new pod. 332 id, pn, err := c.startPod(pj) 333 if err != nil { 334 _, isUnprocessable := err.(kube.UnprocessableEntityError) 335 if !isUnprocessable { 336 return fmt.Errorf("error starting pod: %v", err) 337 } 338 pj.Status.State = kube.ErrorState 339 pj.SetComplete() 340 pj.Status.Description = "Job cannot be processed." 341 c.log.WithFields(pjutil.ProwJobFields(&pj)).WithError(err).Warning("Unprocessable pod.") 342 } else { 343 pj.Status.BuildID = id 344 pj.Status.PodName = pn 345 c.log.WithFields(pjutil.ProwJobFields(&pj)).Info("Pod is missing, starting a new pod") 346 } 347 } else { 348 switch pod.Status.Phase { 349 case kube.PodUnknown: 350 c.incrementNumPendingJobs(pj.Spec.Job) 351 // Pod is in Unknown state. This can happen if there is a problem with 352 // the node. Delete the old pod, we'll start a new one next loop. 353 c.log.WithFields(pjutil.ProwJobFields(&pj)).Info("Pod is in unknown state, deleting & restarting pod") 354 client, ok := c.pkcs[pj.ClusterAlias()] 355 if !ok { 356 return fmt.Errorf("unknown cluster alias %q", pj.ClusterAlias()) 357 } 358 return client.DeletePod(pj.ObjectMeta.Name) 359 360 case kube.PodSucceeded: 361 // Pod succeeded. Update ProwJob, talk to GitHub, and start next jobs. 362 pj.SetComplete() 363 pj.Status.State = kube.SuccessState 364 pj.Status.Description = "Job succeeded." 365 for _, nj := range pj.Spec.RunAfterSuccess { 366 child := pjutil.NewProwJob(nj, pj.ObjectMeta.Labels) 367 if c.ghc != nil && !c.RunAfterSuccessCanRun(&pj, &child, c.ca, c.ghc) { 368 continue 369 } 370 if _, err := c.kc.CreateProwJob(pjutil.NewProwJob(nj, pj.ObjectMeta.Labels)); err != nil { 371 return fmt.Errorf("error starting next prowjob: %v", err) 372 } 373 } 374 375 case kube.PodFailed: 376 if pod.Status.Reason == kube.Evicted { 377 // Pod was evicted. 378 if pj.Spec.ErrorOnEviction { 379 // ErrorOnEviction is enabled, complete the PJ and mark it as errored. 380 pj.SetComplete() 381 pj.Status.State = kube.ErrorState 382 pj.Status.Description = "Job pod was evicted by the cluster." 383 break 384 } 385 // ErrorOnEviction is disabled. Delete the pod now and recreate it in 386 // the next resync. 387 c.incrementNumPendingJobs(pj.Spec.Job) 388 client, ok := c.pkcs[pj.ClusterAlias()] 389 if !ok { 390 return fmt.Errorf("unknown cluster alias %q", pj.ClusterAlias()) 391 } 392 return client.DeletePod(pj.ObjectMeta.Name) 393 } 394 // Pod failed. Update ProwJob, talk to GitHub. 395 pj.SetComplete() 396 pj.Status.State = kube.FailureState 397 pj.Status.Description = "Job failed." 398 399 case kube.PodPending: 400 maxPodPending := c.ca.Config().Plank.PodPendingTimeout 401 if pod.Status.StartTime.IsZero() || time.Since(pod.Status.StartTime.Time) < maxPodPending { 402 // Pod is running. Do nothing. 403 c.incrementNumPendingJobs(pj.Spec.Job) 404 return nil 405 } 406 407 // Pod is stuck in pending state longer than maxPodPending 408 // abort the job, and talk to Github 409 pj.SetComplete() 410 pj.Status.State = kube.AbortedState 411 pj.Status.Description = "Job aborted." 412 413 default: 414 // Pod is running. Do nothing. 415 c.incrementNumPendingJobs(pj.Spec.Job) 416 return nil 417 } 418 } 419 420 pj.Status.URL = jobURL(c.ca.Config().Plank, pj, c.log) 421 422 reports <- pj 423 424 if prevState != pj.Status.State { 425 c.log.WithFields(pjutil.ProwJobFields(&pj)). 426 WithField("from", prevState). 427 WithField("to", pj.Status.State).Info("Transitioning states.") 428 } 429 _, err := c.kc.ReplaceProwJob(pj.ObjectMeta.Name, pj) 430 return err 431 } 432 433 func (c *Controller) syncTriggeredJob(pj kube.ProwJob, pm map[string]kube.Pod, reports chan<- kube.ProwJob) error { 434 // Record last known state so we can log state transitions. 435 prevState := pj.Status.State 436 437 var id, pn string 438 pod, podExists := pm[pj.ObjectMeta.Name] 439 // We may end up in a state where the pod exists but the prowjob is not 440 // updated to pending if we successfully create a new pod in a previous 441 // sync but the prowjob update fails. Simply ignore creating a new pod 442 // and rerun the prowjob update. 443 if !podExists { 444 // Do not start more jobs than specified. 445 if !c.canExecuteConcurrently(&pj) { 446 return nil 447 } 448 // We haven't started the pod yet. Do so. 449 var err error 450 id, pn, err = c.startPod(pj) 451 if err != nil { 452 _, isUnprocessable := err.(kube.UnprocessableEntityError) 453 if !isUnprocessable { 454 return fmt.Errorf("error starting pod: %v", err) 455 } 456 pj.Status.State = kube.ErrorState 457 pj.SetComplete() 458 pj.Status.Description = "Job cannot be processed." 459 logrus.WithField("job", pj.Spec.Job).WithError(err).Warning("Unprocessable pod.") 460 } 461 } else { 462 id = getPodBuildID(&pod) 463 pn = pod.ObjectMeta.Name 464 } 465 466 if pj.Status.State == kube.TriggeredState { 467 // BuildID needs to be set before we execute the job url template. 468 pj.Status.BuildID = id 469 pj.Status.State = kube.PendingState 470 pj.Status.PodName = pn 471 pj.Status.Description = "Job triggered." 472 pj.Status.URL = jobURL(c.ca.Config().Plank, pj, c.log) 473 } 474 reports <- pj 475 if prevState != pj.Status.State { 476 c.log.WithFields(pjutil.ProwJobFields(&pj)). 477 WithField("from", prevState). 478 WithField("to", pj.Status.State).Info("Transitioning states.") 479 } 480 _, err := c.kc.ReplaceProwJob(pj.ObjectMeta.Name, pj) 481 return err 482 } 483 484 // TODO: No need to return the pod name since we already have the 485 // prowjob in the call site. 486 func (c *Controller) startPod(pj kube.ProwJob) (string, string, error) { 487 buildID, err := c.getBuildID(pj.Spec.Job) 488 if err != nil { 489 return "", "", fmt.Errorf("error getting build ID: %v", err) 490 } 491 492 pod, err := decorate.ProwJobToPod(pj, buildID) 493 if err != nil { 494 return "", "", err 495 } 496 497 client, ok := c.pkcs[pj.ClusterAlias()] 498 if !ok { 499 return "", "", fmt.Errorf("unknown cluster alias %q", pj.ClusterAlias()) 500 } 501 actual, err := client.CreatePod(*pod) 502 if err != nil { 503 return "", "", err 504 } 505 return buildID, actual.ObjectMeta.Name, nil 506 } 507 508 func (c *Controller) getBuildID(name string) (string, error) { 509 return pjutil.GetBuildID(name, c.totURL) 510 } 511 512 func getPodBuildID(pod *kube.Pod) string { 513 for _, env := range pod.Spec.Containers[0].Env { 514 if env.Name == "BUILD_ID" { 515 return env.Value 516 } 517 } 518 logrus.Warningf("BUILD_ID was not found in pod %q: streaming logs from deck will not work", pod.ObjectMeta.Name) 519 return "" 520 } 521 522 // RunAfterSuccessCanRun returns whether a child job (specified as run_after_success in the 523 // prow config) can run once its parent job succeeds. The only case we will not run a child job 524 // is when it is a presubmit job and has a run_if_changed regular expression specified which does 525 // not match the changed filenames in the pull request the job was meant to run for. 526 // TODO: Collapse with Jenkins, impossible to reuse as is due to the interfaces. 527 func (c *Controller) RunAfterSuccessCanRun(parent, child *kube.ProwJob, ca configAgent, ghc GitHubClient) bool { 528 if parent.Spec.Type != kube.PresubmitJob { 529 return true 530 } 531 532 // TODO: Make sure that parent and child have always the same org/repo. 533 org := parent.Spec.Refs.Org 534 repo := parent.Spec.Refs.Repo 535 prNum := parent.Spec.Refs.Pulls[0].Number 536 537 ps := ca.Config().GetPresubmit(org+"/"+repo, child.Spec.Job) 538 if ps == nil { 539 // The config has changed ever since we started the parent. 540 // Not sure what is more correct here. Run the child for now. 541 return true 542 } 543 if ps.RunIfChanged == "" { 544 return true 545 } 546 changesFull, err := ghc.GetPullRequestChanges(org, repo, prNum) 547 if err != nil { 548 c.log.WithError(err).WithFields(pjutil.ProwJobFields(parent)).Warnf("Cannot get PR changes for #%d", prNum) 549 return true 550 } 551 // We only care about the filenames here 552 var changes []string 553 for _, change := range changesFull { 554 changes = append(changes, change.Filename) 555 } 556 return ps.RunsAgainstChanges(changes) 557 } 558 559 func jobURL(plank config.Plank, pj kube.ProwJob, log *logrus.Entry) string { 560 if pj.Spec.DecorationConfig != nil && plank.JobURLPrefix != "" { 561 spec := downwardapi.NewJobSpec(pj.Spec, pj.Status.BuildID, pj.Name) 562 gcsConfig := pj.Spec.DecorationConfig.GCSConfiguration 563 _, gcsPath, _ := gcsupload.PathsForJob(gcsConfig, &spec, "") 564 565 prefix, _ := url.Parse(plank.JobURLPrefix) 566 prefix.Path = path.Join(prefix.Path, gcsConfig.Bucket, gcsPath) 567 return prefix.String() 568 } 569 var b bytes.Buffer 570 if err := plank.JobURLTemplate.Execute(&b, &pj); err != nil { 571 log.WithFields(pjutil.ProwJobFields(&pj)).Errorf("error executing URL template: %v", err) 572 } else { 573 return b.String() 574 } 575 return "" 576 }