github.com/shashidharatd/test-infra@v0.0.0-20171006011030-71304e1ca560/prow/plank/controller.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package plank

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
	"sync"
	"time"

	"github.com/bwmarrin/snowflake"
	"github.com/sirupsen/logrus"

	"k8s.io/test-infra/prow/config"
	"k8s.io/test-infra/prow/github"
	"k8s.io/test-infra/prow/kube"
	"k8s.io/test-infra/prow/pjutil"
	reportlib "k8s.io/test-infra/prow/report"
)

const (
	testInfra = "https://github.com/kubernetes/test-infra/issues"

	// maxSyncRoutines is the maximum number of goroutines
	// that will be active at any one time for the sync.
	maxSyncRoutines = 20
)

type kubeClient interface {
	CreateProwJob(kube.ProwJob) (kube.ProwJob, error)
	ListProwJobs(map[string]string) ([]kube.ProwJob, error)
	ReplaceProwJob(string, kube.ProwJob) (kube.ProwJob, error)

	CreatePod(kube.Pod) (kube.Pod, error)
	ListPods(map[string]string) ([]kube.Pod, error)
	DeletePod(string) error
}

type githubClient interface {
	BotName() (string, error)
	CreateStatus(org, repo, ref string, s github.Status) error
	ListIssueComments(org, repo string, number int) ([]github.IssueComment, error)
	CreateComment(org, repo string, number int, comment string) error
	DeleteComment(org, repo string, ID int) error
	EditComment(org, repo string, ID int, comment string) error
	GetPullRequestChanges(org, repo string, number int) ([]github.PullRequestChange, error)
}

type configAgent interface {
	Config() *config.Config
}

// TODO: Dry this out
type syncFn func(pj kube.ProwJob, pm map[string]kube.Pod, reports chan<- kube.ProwJob) error

// Controller manages ProwJobs.
type Controller struct {
	kc     kubeClient
	pkc    kubeClient
	ghc    githubClient
	ca     configAgent
	node   *snowflake.Node
	totURL string

	lock sync.RWMutex
	// pendingJobs is a short-lived cache that helps in limiting
	// the maximum concurrency of jobs.
	pendingJobs map[string]int
}
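
// Concurrency is limited at two levels: Plank.MaxConcurrency caps the total
// number of pending Kubernetes jobs, and a job's Spec.MaxConcurrency caps
// instances of that single job. A small illustrative walkthrough (the numbers
// are assumptions, not defaults shipped by this package): with
// Plank.MaxConcurrency set to 10 and pendingJobs = {"unit": 2, "e2e": 5}, a
// new "unit" instance whose spec sets MaxConcurrency to 2 is rejected because
// its per-job cap is already reached, while a new "e2e" instance with no
// per-job cap is started because 7 < 10.
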
// canExecuteConcurrently checks whether the provided ProwJob can
// be executed concurrently.
func (c *Controller) canExecuteConcurrently(pj *kube.ProwJob) bool {
	c.lock.Lock()
	defer c.lock.Unlock()

	if max := c.ca.Config().Plank.MaxConcurrency; max > 0 {
		var running int
		for _, num := range c.pendingJobs {
			running += num
		}
		if running >= max {
			logrus.Infof("Not starting another job, already %d running.", running)
			return false
		}
	}

	if pj.Spec.MaxConcurrency == 0 {
		c.pendingJobs[pj.Spec.Job]++
		return true
	}

	numPending := c.pendingJobs[pj.Spec.Job]
	if numPending >= pj.Spec.MaxConcurrency {
		logrus.WithField("job", pj.Spec.Job).Infof("Not starting another instance of %s, already %d running.", pj.Spec.Job, numPending)
		return false
	}
	c.pendingJobs[pj.Spec.Job]++
	return true
}

// incrementNumPendingJobs increments the number of
// pending ProwJobs for the given job identifier.
func (c *Controller) incrementNumPendingJobs(job string) {
	c.lock.Lock()
	defer c.lock.Unlock()
	c.pendingJobs[job]++
}

// NewController creates a new Controller from the provided clients.
func NewController(kc, pkc *kube.Client, ghc *github.Client, ca *config.Agent, totURL string) (*Controller, error) {
	n, err := snowflake.NewNode(1)
	if err != nil {
		return nil, err
	}
	return &Controller{
		kc:          kc,
		pkc:         pkc,
		ghc:         ghc,
		ca:          ca,
		node:        n,
		pendingJobs: make(map[string]int),
		lock:        sync.RWMutex{},
		totURL:      totURL,
	}, nil
}

// Sync does one sync iteration.
func (c *Controller) Sync() error {
	pjs, err := c.kc.ListProwJobs(nil)
	if err != nil {
		return fmt.Errorf("error listing prow jobs: %v", err)
	}
	labels := map[string]string{kube.CreatedByProw: "true"}
	pods, err := c.pkc.ListPods(labels)
	if err != nil {
		return fmt.Errorf("error listing pods: %v", err)
	}
	pm := map[string]kube.Pod{}
	for _, pod := range pods {
		pm[pod.Metadata.Name] = pod
	}

	var k8sJobs []kube.ProwJob
	for _, pj := range pjs {
		if pj.Spec.Agent == kube.KubernetesAgent {
			k8sJobs = append(k8sJobs, pj)
		}
	}
	pjs = k8sJobs

	var syncErrs []error
	if err := c.terminateDupes(pjs, pm); err != nil {
		syncErrs = append(syncErrs, err)
	}

	pendingCh, nonPendingCh := pjutil.PartitionPending(pjs)
	errCh := make(chan error, len(pjs))
	reportCh := make(chan kube.ProwJob, len(pjs))

	// Reinstantiate on every resync of the controller instead of trying
	// to keep this in sync with the state of the world.
	c.pendingJobs = make(map[string]int)
	// Sync pending jobs first so we can determine the maximum number of
	// new jobs we can trigger when syncing the non-pending ones.
	syncProwJobs(c.syncPendingJob, pendingCh, reportCh, errCh, pm)
	syncProwJobs(c.syncNonPendingJob, nonPendingCh, reportCh, errCh, pm)

	close(errCh)
	close(reportCh)

	for err := range errCh {
		syncErrs = append(syncErrs, err)
	}

	var reportErrs []error
	reportTemplate := c.ca.Config().Plank.ReportTemplate
	for report := range reportCh {
		if err := reportlib.Report(c.ghc, reportTemplate, report); err != nil {
			reportErrs = append(reportErrs, err)
		}
	}

	if len(syncErrs) == 0 && len(reportErrs) == 0 {
		return nil
	}
	return fmt.Errorf("errors syncing: %v, errors reporting: %v", syncErrs, reportErrs)
}
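
// Callers are expected to invoke Sync periodically. A minimal sketch of such
// a loop is shown below; kc, pkc, ghc, and ca stand in for already-constructed
// *kube.Client, *github.Client, and *config.Agent values, and the 30-second
// interval is an illustrative assumption, not something this package mandates:
//
//	c, err := NewController(kc, pkc, ghc, ca, "")
//	if err != nil {
//		logrus.WithError(err).Fatal("Error creating plank controller.")
//	}
//	for range time.Tick(30 * time.Second) {
//		if err := c.Sync(); err != nil {
//			logrus.WithError(err).Error("Error syncing.")
//		}
//	}
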
// terminateDupes aborts presubmits that have a newer version. It modifies pjs
// in-place when it aborts.
// TODO: Dry this out - need to ensure we can abstract children cancellation first.
func (c *Controller) terminateDupes(pjs []kube.ProwJob, pm map[string]kube.Pod) error {
	// "job org/repo#number" -> newest job
	dupes := make(map[string]int)
	for i, pj := range pjs {
		if pj.Complete() || pj.Spec.Type != kube.PresubmitJob {
			continue
		}
		n := fmt.Sprintf("%s %s/%s#%d", pj.Spec.Job, pj.Spec.Refs.Org, pj.Spec.Refs.Repo, pj.Spec.Refs.Pulls[0].Number)
		prev, ok := dupes[n]
		if !ok {
			dupes[n] = i
			continue
		}
		cancelIndex := i
		if pjs[prev].Status.StartTime.Before(pj.Status.StartTime) {
			cancelIndex = prev
			dupes[n] = i
		}
		toCancel := pjs[cancelIndex]
		// Allow aborting presubmit jobs for commits that have been superseded by
		// newer commits in GitHub pull requests.
		if c.ca.Config().Plank.AllowCancellations {
			if pod, exists := pm[toCancel.Metadata.Name]; exists {
				if err := c.pkc.DeletePod(pod.Metadata.Name); err != nil {
					logrus.Warningf("Cannot cancel pod for prowjob %q: %v", toCancel.Metadata.Name, err)
				}
			}
		}
		toCancel.Status.CompletionTime = time.Now()
		toCancel.Status.State = kube.AbortedState
		npj, err := c.kc.ReplaceProwJob(toCancel.Metadata.Name, toCancel)
		if err != nil {
			return err
		}
		pjs[cancelIndex] = npj
	}
	return nil
}

// TODO: Dry this out
func syncProwJobs(syncFn syncFn, jobs <-chan kube.ProwJob, reports chan<- kube.ProwJob, syncErrors chan<- error, pm map[string]kube.Pod) {
	wg := &sync.WaitGroup{}
	wg.Add(maxSyncRoutines)
	for i := 0; i < maxSyncRoutines; i++ {
		go func(jobs <-chan kube.ProwJob) {
			defer wg.Done()
			for pj := range jobs {
				if err := syncFn(pj, pm, reports); err != nil {
					syncErrors <- err
				}
			}
		}(jobs)
	}
	wg.Wait()
}
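
// syncPendingJob reconciles a pending ProwJob against the state of its pod:
//
//  - no pod found: start a new one (the previous pod may have been stuck in
//    Unknown state, evicted, or deleted manually)
//  - pod Unknown: delete the pod; it is recreated on a later sync
//  - pod Succeeded: mark the job successful and create its run_after_success children
//  - pod Failed with reason Evicted: delete the pod so it is recreated on a later sync
//  - pod Failed otherwise: mark the job failed
//  - any other phase: the pod is still running, so leave the job as is
//
// Updates that fall through are sent on the reports channel and persisted
// with ReplaceProwJob.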
func (c *Controller) syncPendingJob(pj kube.ProwJob, pm map[string]kube.Pod, reports chan<- kube.ProwJob) error {
	pod, podExists := pm[pj.Metadata.Name]
	if !podExists {
		c.incrementNumPendingJobs(pj.Spec.Job)
		// Pod is missing. This can happen in case we deleted the previous pod because
		// it was stuck in Unknown/Evicted state due to a node problem or the pod was
		// deleted manually. Start a new pod.
		id, pn, err := c.startPod(pj)
		if err != nil {
			_, isUnprocessable := err.(kube.UnprocessableEntityError)
			if !isUnprocessable {
				return fmt.Errorf("error starting pod: %v", err)
			}
			pj.Status.State = kube.ErrorState
			pj.Status.CompletionTime = time.Now()
			pj.Status.Description = "Job cannot be processed."
			logrus.WithField("job", pj.Spec.Job).WithError(err).Warning("Unprocessable pod.")
		} else {
			pj.Status.BuildID = id
			pj.Status.PodName = pn
		}
	} else {
		switch pod.Status.Phase {
		case kube.PodUnknown:
			c.incrementNumPendingJobs(pj.Spec.Job)
			// Pod is in Unknown state. This can happen if there is a problem with
			// the node. Delete the old pod, we'll start a new one next loop.
			return c.pkc.DeletePod(pj.Metadata.Name)

		case kube.PodSucceeded:
			// Pod succeeded. Update ProwJob, talk to GitHub, and start next jobs.
			pj.Status.CompletionTime = time.Now()
			pj.Status.State = kube.SuccessState
			pj.Status.Description = "Job succeeded."
			for _, nj := range pj.Spec.RunAfterSuccess {
				child := pjutil.NewProwJob(nj)
				if !RunAfterSuccessCanRun(&pj, &child, c.ca, c.ghc) {
					continue
				}
				if _, err := c.kc.CreateProwJob(child); err != nil {
					return fmt.Errorf("error starting next prowjob: %v", err)
				}
			}

		case kube.PodFailed:
			if pod.Status.Reason == kube.Evicted {
				c.incrementNumPendingJobs(pj.Spec.Job)
				// Pod was evicted. We will recreate it in the next resync.
				return c.pkc.DeletePod(pj.Metadata.Name)
			}
			// Pod failed. Update ProwJob, talk to GitHub.
			pj.Status.CompletionTime = time.Now()
			pj.Status.State = kube.FailureState
			pj.Status.Description = "Job failed."

		default:
			// Pod is running. Do nothing.
			c.incrementNumPendingJobs(pj.Spec.Job)
			return nil
		}
	}

	var b bytes.Buffer
	if err := c.ca.Config().Plank.JobURLTemplate.Execute(&b, &pj); err != nil {
		return fmt.Errorf("error executing URL template: %v", err)
	}
	pj.Status.URL = b.String()
	reports <- pj

	_, err := c.kc.ReplaceProwJob(pj.Metadata.Name, pj)
	return err
}

func (c *Controller) syncNonPendingJob(pj kube.ProwJob, pm map[string]kube.Pod, reports chan<- kube.ProwJob) error {
	if pj.Complete() {
		return nil
	}

	// The rest are new prowjobs.

	var id, pn string
	pod, podExists := pm[pj.Metadata.Name]
	// We may end up in a state where the pod exists but the prowjob is not
	// updated to pending if we successfully create a new pod in a previous
	// sync but the prowjob update fails. Simply ignore creating a new pod
	// and rerun the prowjob update.
	if !podExists {
		// Do not start more jobs than specified.
		if !c.canExecuteConcurrently(&pj) {
			return nil
		}
		// We haven't started the pod yet. Do so.
		var err error
		id, pn, err = c.startPod(pj)
		if err != nil {
			_, isUnprocessable := err.(kube.UnprocessableEntityError)
			if !isUnprocessable {
				return fmt.Errorf("error starting pod: %v", err)
			}
			pj.Status.State = kube.ErrorState
			pj.Status.CompletionTime = time.Now()
			pj.Status.Description = "Job cannot be processed."
			logrus.WithField("job", pj.Spec.Job).WithError(err).Warning("Unprocessable pod.")
		}
	} else {
		id = getPodBuildID(&pod)
		pn = pod.Metadata.Name
	}

	if pj.Status.State == kube.TriggeredState {
		// BuildID needs to be set before we execute the job URL template.
		pj.Status.BuildID = id
		pj.Status.State = kube.PendingState
		pj.Status.PodName = pn
		pj.Status.Description = "Job triggered."
		var b bytes.Buffer
		if err := c.ca.Config().Plank.JobURLTemplate.Execute(&b, &pj); err != nil {
			return fmt.Errorf("error executing URL template: %v", err)
		}
		pj.Status.URL = b.String()
	}
	reports <- pj

	_, err := c.kc.ReplaceProwJob(pj.Metadata.Name, pj)
	return err
}
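
// Both syncPendingJob and syncNonPendingJob render Status.URL by executing
// Plank.JobURLTemplate with the ProwJob as template data. A minimal config
// sketch, assuming the plank section uses a job_url_template key and a
// hypothetical results host:
//
//	plank:
//	  job_url_template: 'https://results.example.com/{{.Spec.Job}}/{{.Status.BuildID}}'
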
// TODO: No need to return the pod name since we already have the
// prowjob in the call site.
func (c *Controller) startPod(pj kube.ProwJob) (string, string, error) {
	buildID, err := c.getBuildID(pj.Spec.Job)
	if err != nil {
		return "", "", fmt.Errorf("error getting build ID: %v", err)
	}

	pod := pjutil.ProwJobToPod(pj, buildID)

	actual, err := c.pkc.CreatePod(*pod)
	if err != nil {
		return "", "", err
	}
	return buildID, actual.Metadata.Name, nil
}

func (c *Controller) getBuildID(name string) (string, error) {
	if c.totURL == "" {
		return c.node.Generate().String(), nil
	}
	var err error
	url := c.totURL + "/vend/" + name
	for retries := 0; retries < 60; retries++ {
		if retries > 0 {
			time.Sleep(2 * time.Second)
		}
		var resp *http.Response
		resp, err = http.Get(url)
		if err != nil {
			continue
		}
		defer resp.Body.Close()
		if resp.StatusCode != 200 {
			continue
		}
		var buf []byte
		buf, err = ioutil.ReadAll(resp.Body)
		if err == nil {
			return string(buf), nil
		}
		return "", err
	}
	return "", err
}

func getPodBuildID(pod *kube.Pod) string {
	for _, env := range pod.Spec.Containers[0].Env {
		if env.Name == "BUILD_NUMBER" {
			return env.Value
		}
	}
	logrus.Warningf("BUILD_NUMBER was not found in pod %q: streaming logs from deck will not work", pod.Metadata.Name)
	return ""
}

// RunAfterSuccessCanRun returns whether a child job (specified as run_after_success in the
// prow config) can run once its parent job succeeds. The only case in which we will not run
// a child job is when it is a presubmit job with a run_if_changed regular expression that does
// not match any of the changed filenames in the pull request the job was meant to run for.
// TODO: Collapse with Jenkins, impossible to reuse as is due to the interfaces.
func RunAfterSuccessCanRun(parent, child *kube.ProwJob, c configAgent, ghc githubClient) bool {
	if parent.Spec.Type != kube.PresubmitJob {
		return true
	}

	// TODO: Make sure that parent and child always have the same org/repo.
	org := parent.Spec.Refs.Org
	repo := parent.Spec.Refs.Repo
	prNum := parent.Spec.Refs.Pulls[0].Number

	ps := c.Config().GetPresubmit(org+"/"+repo, child.Spec.Job)
	if ps == nil {
		// The config has changed since we started the parent.
		// Not sure what is more correct here. Run the child for now.
		return true
	}
	if ps.RunIfChanged == "" {
		return true
	}
	changesFull, err := ghc.GetPullRequestChanges(org, repo, prNum)
	if err != nil {
		logrus.Warningf("Cannot get PR changes for %d: %v", prNum, err)
		return true
	}
	// We only care about the filenames here.
	var changes []string
	for _, change := range changesFull {
		changes = append(changes, change.Filename)
	}
	return ps.RunsAgainstChanges(changes)
}
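
// An illustrative sketch of the configuration RunAfterSuccessCanRun is guarding
// (the job names and the regexp are assumptions, not taken from any real
// config): the child presubmit below is created only after its parent
// succeeds, and is skipped when its run_if_changed pattern matches none of
// the files changed in the pull request.
//
//	presubmits:
//	  org/repo:
//	  - name: parent-build
//	    run_after_success:
//	    - name: child-e2e
//	      run_if_changed: '^(test/e2e/|cluster/)'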