github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/prow/jenkins/controller.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package jenkins
    18  
    19  import (
    20  	"bytes"
    21  	"fmt"
    22  	"strconv"
    23  	"sync"
    24  
    25  	"github.com/bwmarrin/snowflake"
    26  	"github.com/sirupsen/logrus"
    27  
    28  	"k8s.io/test-infra/prow/config"
    29  	"k8s.io/test-infra/prow/github"
    30  	"k8s.io/test-infra/prow/kube"
    31  	"k8s.io/test-infra/prow/pjutil"
    32  	reportlib "k8s.io/test-infra/prow/report"
    33  )
    34  
    35  const (
    36  	testInfra = "https://github.com/kubernetes/test-infra/issues"
    37  )
    38  
    39  type kubeClient interface {
    40  	CreateProwJob(kube.ProwJob) (kube.ProwJob, error)
    41  	ListProwJobs(string) ([]kube.ProwJob, error)
    42  	ReplaceProwJob(string, kube.ProwJob) (kube.ProwJob, error)
    43  }
    44  
    45  type jenkinsClient interface {
    46  	Build(*kube.ProwJob, string) error
    47  	ListBuilds(jobs []string) (map[string]Build, error)
    48  	Abort(job string, build *Build) error
    49  }
    50  
    51  type githubClient interface {
    52  	BotName() (string, error)
    53  	CreateStatus(org, repo, ref string, s github.Status) error
    54  	ListIssueComments(org, repo string, number int) ([]github.IssueComment, error)
    55  	CreateComment(org, repo string, number int, comment string) error
    56  	DeleteComment(org, repo string, ID int) error
    57  	EditComment(org, repo string, ID int, comment string) error
    58  	GetPullRequestChanges(org, repo string, number int) ([]github.PullRequestChange, error)
    59  }
    60  
    61  type configAgent interface {
    62  	Config() *config.Config
    63  }
    64  
    65  type syncFn func(kube.ProwJob, chan<- kube.ProwJob, map[string]Build) error
    66  
    67  // Controller manages ProwJobs.
    68  type Controller struct {
    69  	kc     kubeClient
    70  	jc     jenkinsClient
    71  	ghc    githubClient
    72  	log    *logrus.Entry
    73  	ca     configAgent
    74  	node   *snowflake.Node
    75  	totURL string
    76  	// selector that will be applied on prowjobs.
    77  	selector string
    78  
    79  	lock sync.RWMutex
    80  	// pendingJobs is a short-lived cache that helps in limiting
    81  	// the maximum concurrency of jobs.
    82  	pendingJobs map[string]int
    83  
    84  	pjLock sync.RWMutex
    85  	// shared across the controller and a goroutine that gathers metrics.
    86  	pjs []kube.ProwJob
    87  }
    88  
    89  // NewController creates a new Controller from the provided clients.
    90  func NewController(kc *kube.Client, jc *Client, ghc *github.Client, logger *logrus.Entry, ca *config.Agent, totURL, selector string) (*Controller, error) {
    91  	n, err := snowflake.NewNode(1)
    92  	if err != nil {
    93  		return nil, err
    94  	}
    95  	if logger == nil {
    96  		logger = logrus.NewEntry(logrus.StandardLogger())
    97  	}
    98  	return &Controller{
    99  		kc:          kc,
   100  		jc:          jc,
   101  		ghc:         ghc,
   102  		log:         logger,
   103  		ca:          ca,
   104  		selector:    selector,
   105  		node:        n,
   106  		totURL:      totURL,
   107  		pendingJobs: make(map[string]int),
   108  	}, nil
   109  }
   110  
   111  func (c *Controller) config() config.Controller {
   112  	operators := c.ca.Config().JenkinsOperators
   113  	if len(operators) == 1 {
   114  		return operators[0].Controller
   115  	}
   116  	configured := make([]string, 0, len(operators))
   117  	for _, cfg := range operators {
   118  		if cfg.LabelSelectorString == c.selector {
   119  			return cfg.Controller
   120  		}
   121  		configured = append(configured, cfg.LabelSelectorString)
   122  	}
   123  	if len(c.selector) == 0 {
   124  		c.log.Panicf("You need to specify a non-empty --label-selector (existing selectors: %v).", configured)
   125  	} else {
   126  		c.log.Panicf("No config exists for --label-selector=%s.", c.selector)
   127  	}
   128  	return config.Controller{}
   129  }
   130  
   131  // canExecuteConcurrently checks whether the provided ProwJob can
   132  // be executed concurrently.
   133  func (c *Controller) canExecuteConcurrently(pj *kube.ProwJob) bool {
   134  	c.lock.Lock()
   135  	defer c.lock.Unlock()
   136  
   137  	if max := c.config().MaxConcurrency; max > 0 {
   138  		var running int
   139  		for _, num := range c.pendingJobs {
   140  			running += num
   141  		}
   142  		if running >= max {
   143  			c.log.WithFields(pjutil.ProwJobFields(pj)).Debugf("Not starting another job, already %d running.", running)
   144  			return false
   145  		}
   146  	}
   147  
   148  	if pj.Spec.MaxConcurrency == 0 {
   149  		c.pendingJobs[pj.Spec.Job]++
   150  		return true
   151  	}
   152  
   153  	numPending := c.pendingJobs[pj.Spec.Job]
   154  	if numPending >= pj.Spec.MaxConcurrency {
   155  		c.log.WithFields(pjutil.ProwJobFields(pj)).Debugf("Not starting another instance of %s, already %d running.", pj.Spec.Job, numPending)
   156  		return false
   157  	}
   158  	c.pendingJobs[pj.Spec.Job]++
   159  	return true
   160  }
   161  
   162  // incrementNumPendingJobs increments the amount of
   163  // pending ProwJobs for the given job identifier
   164  func (c *Controller) incrementNumPendingJobs(job string) {
   165  	c.lock.Lock()
   166  	defer c.lock.Unlock()
   167  	c.pendingJobs[job]++
   168  }
   169  
   170  // Sync does one sync iteration.
   171  func (c *Controller) Sync() error {
   172  	pjs, err := c.kc.ListProwJobs(c.selector)
   173  	if err != nil {
   174  		return fmt.Errorf("error listing prow jobs: %v", err)
   175  	}
   176  	// Share what we have for gathering metrics.
   177  	c.pjLock.Lock()
   178  	c.pjs = pjs
   179  	c.pjLock.Unlock()
   180  
   181  	// TODO: Replace the following filtering with a field selector once CRDs support field selectors.
   182  	// https://github.com/kubernetes/kubernetes/issues/53459
   183  	var jenkinsJobs []kube.ProwJob
   184  	for _, pj := range pjs {
   185  		if pj.Spec.Agent == kube.JenkinsAgent {
   186  			jenkinsJobs = append(jenkinsJobs, pj)
   187  		}
   188  	}
   189  	pjs = jenkinsJobs
   190  	jbs, err := c.jc.ListBuilds(getJenkinsJobs(pjs))
   191  	if err != nil {
   192  		return fmt.Errorf("error listing jenkins builds: %v", err)
   193  	}
   194  
   195  	var syncErrs []error
   196  	if err := c.terminateDupes(pjs, jbs); err != nil {
   197  		syncErrs = append(syncErrs, err)
   198  	}
   199  
   200  	pendingCh, triggeredCh := pjutil.PartitionActive(pjs)
   201  	errCh := make(chan error, len(pjs))
   202  	reportCh := make(chan kube.ProwJob, len(pjs))
   203  
   204  	// Reinstantiate on every resync of the controller instead of trying
   205  	// to keep this in sync with the state of the world.
   206  	c.pendingJobs = make(map[string]int)
   207  	// Sync pending jobs first so we can determine what is the maximum
   208  	// number of new jobs we can trigger when syncing the non-pendings.
   209  	maxSyncRoutines := c.config().MaxGoroutines
   210  	c.log.Debugf("Handling %d pending prowjobs", len(pendingCh))
   211  	syncProwJobs(c.log, c.syncPendingJob, maxSyncRoutines, pendingCh, reportCh, errCh, jbs)
   212  	c.log.Debugf("Handling %d triggered prowjobs", len(triggeredCh))
   213  	syncProwJobs(c.log, c.syncTriggeredJob, maxSyncRoutines, triggeredCh, reportCh, errCh, jbs)
   214  
   215  	close(errCh)
   216  	close(reportCh)
   217  
   218  	for err := range errCh {
   219  		syncErrs = append(syncErrs, err)
   220  	}
   221  
   222  	var reportErrs []error
   223  	reportTemplate := c.config().ReportTemplate
   224  	for report := range reportCh {
   225  		if err := reportlib.Report(c.ghc, reportTemplate, report); err != nil {
   226  			reportErrs = append(reportErrs, err)
   227  			c.log.WithFields(pjutil.ProwJobFields(&report)).WithError(err).Warn("Failed to report ProwJob status")
   228  		}
   229  	}
   230  
   231  	if len(syncErrs) == 0 && len(reportErrs) == 0 {
   232  		return nil
   233  	}
   234  	return fmt.Errorf("errors syncing: %v, errors reporting: %v", syncErrs, reportErrs)
   235  }
   236  
   237  // SyncMetrics records metrics for the cached prowjobs.
   238  func (c *Controller) SyncMetrics() {
   239  	c.pjLock.RLock()
   240  	defer c.pjLock.RUnlock()
   241  	kube.GatherProwJobMetrics(c.pjs)
   242  }
   243  
   244  // getJenkinsJobs returns all the Jenkins jobs for all active
   245  // prowjobs from the provided list. It handles deduplication.
   246  func getJenkinsJobs(pjs []kube.ProwJob) []string {
   247  	jenkinsJobs := make(map[string]struct{})
   248  	for _, pj := range pjs {
   249  		if pj.Complete() {
   250  			continue
   251  		}
   252  		jenkinsJobs[pj.Spec.Job] = struct{}{}
   253  	}
   254  	var jobs []string
   255  	for job := range jenkinsJobs {
   256  		jobs = append(jobs, job)
   257  	}
   258  	return jobs
   259  }
   260  
   261  // terminateDupes aborts presubmits that have a newer version. It modifies pjs
   262  // in-place when it aborts.
   263  func (c *Controller) terminateDupes(pjs []kube.ProwJob, jbs map[string]Build) error {
   264  	// "job org/repo#number" -> newest job
   265  	dupes := make(map[string]int)
   266  	for i, pj := range pjs {
   267  		if pj.Complete() || pj.Spec.Type != kube.PresubmitJob {
   268  			continue
   269  		}
   270  		n := fmt.Sprintf("%s %s/%s#%d", pj.Spec.Job, pj.Spec.Refs.Org, pj.Spec.Refs.Repo, pj.Spec.Refs.Pulls[0].Number)
   271  		prev, ok := dupes[n]
   272  		if !ok {
   273  			dupes[n] = i
   274  			continue
   275  		}
   276  		cancelIndex := i
   277  		if (&pjs[prev].Status.StartTime).Before(&pj.Status.StartTime) {
   278  			cancelIndex = prev
   279  			dupes[n] = i
   280  		}
   281  		toCancel := pjs[cancelIndex]
   282  		// Allow aborting presubmit jobs for commits that have been superseded by
   283  		// newer commits in Github pull requests.
   284  		if c.config().AllowCancellations {
   285  			build, buildExists := jbs[toCancel.ObjectMeta.Name]
   286  			// Avoid cancelling enqueued builds.
   287  			if buildExists && build.IsEnqueued() {
   288  				continue
   289  			}
   290  			// Otherwise, abort it.
   291  			if buildExists {
   292  				if err := c.jc.Abort(toCancel.Spec.Job, &build); err != nil {
   293  					c.log.WithError(err).WithFields(pjutil.ProwJobFields(&toCancel)).Warn("Cannot cancel Jenkins build")
   294  				}
   295  			}
   296  		}
   297  		toCancel.SetComplete()
   298  		prevState := toCancel.Status.State
   299  		toCancel.Status.State = kube.AbortedState
   300  		c.log.WithFields(pjutil.ProwJobFields(&toCancel)).
   301  			WithField("from", prevState).
   302  			WithField("to", toCancel.Status.State).Info("Transitioning states.")
   303  		npj, err := c.kc.ReplaceProwJob(toCancel.ObjectMeta.Name, toCancel)
   304  		if err != nil {
   305  			return err
   306  		}
   307  		pjs[cancelIndex] = npj
   308  	}
   309  	return nil
   310  }
   311  
   312  func syncProwJobs(
   313  	l *logrus.Entry,
   314  	syncFn syncFn,
   315  	maxSyncRoutines int,
   316  	jobs <-chan kube.ProwJob,
   317  	reports chan<- kube.ProwJob,
   318  	syncErrors chan<- error,
   319  	jbs map[string]Build,
   320  ) {
   321  	goroutines := maxSyncRoutines
   322  	if goroutines > len(jobs) {
   323  		goroutines = len(jobs)
   324  	}
   325  	wg := &sync.WaitGroup{}
   326  	wg.Add(goroutines)
   327  	l.Debugf("Firing up %d goroutines", goroutines)
   328  	for i := 0; i < goroutines; i++ {
   329  		go func() {
   330  			defer wg.Done()
   331  			for pj := range jobs {
   332  				if err := syncFn(pj, reports, jbs); err != nil {
   333  					syncErrors <- err
   334  				}
   335  			}
   336  		}()
   337  	}
   338  	wg.Wait()
   339  }
   340  
   341  func (c *Controller) syncPendingJob(pj kube.ProwJob, reports chan<- kube.ProwJob, jbs map[string]Build) error {
   342  	// Record last known state so we can log state transitions.
   343  	prevState := pj.Status.State
   344  
   345  	jb, jbExists := jbs[pj.ObjectMeta.Name]
   346  	if !jbExists {
   347  		pj.SetComplete()
   348  		pj.Status.State = kube.ErrorState
   349  		pj.Status.URL = testInfra
   350  		pj.Status.Description = "Error finding Jenkins job."
   351  	} else {
   352  		switch {
   353  		case jb.IsEnqueued():
   354  			// Still in queue.
   355  			c.incrementNumPendingJobs(pj.Spec.Job)
   356  			return nil
   357  
   358  		case jb.IsRunning():
   359  			// Build still going.
   360  			c.incrementNumPendingJobs(pj.Spec.Job)
   361  			if pj.Status.Description == "Jenkins job running." {
   362  				return nil
   363  			}
   364  			pj.Status.Description = "Jenkins job running."
   365  
   366  		case jb.IsSuccess():
   367  			// Build is complete.
   368  			pj.SetComplete()
   369  			pj.Status.State = kube.SuccessState
   370  			pj.Status.Description = "Jenkins job succeeded."
   371  			for _, nj := range pj.Spec.RunAfterSuccess {
   372  				child := pjutil.NewProwJob(nj, pj.ObjectMeta.Labels)
   373  				if !c.RunAfterSuccessCanRun(&pj, &child, c.ca, c.ghc) {
   374  					continue
   375  				}
   376  				if _, err := c.kc.CreateProwJob(pjutil.NewProwJob(nj, pj.ObjectMeta.Labels)); err != nil {
   377  					return fmt.Errorf("error starting next prowjob: %v", err)
   378  				}
   379  			}
   380  
   381  		case jb.IsFailure():
   382  			pj.SetComplete()
   383  			pj.Status.State = kube.FailureState
   384  			pj.Status.Description = "Jenkins job failed."
   385  
   386  		case jb.IsAborted():
   387  			pj.SetComplete()
   388  			pj.Status.State = kube.AbortedState
   389  			pj.Status.Description = "Jenkins job aborted."
   390  		}
   391  		// Construct the status URL that will be used in reports.
   392  		pj.Status.PodName = pj.ObjectMeta.Name
   393  		pj.Status.BuildID = jb.BuildID()
   394  		pj.Status.JenkinsBuildID = strconv.Itoa(jb.Number)
   395  		var b bytes.Buffer
   396  		if err := c.config().JobURLTemplate.Execute(&b, &pj); err != nil {
   397  			c.log.WithFields(pjutil.ProwJobFields(&pj)).Errorf("error executing URL template: %v", err)
   398  		} else {
   399  			pj.Status.URL = b.String()
   400  		}
   401  	}
   402  	// Report to Github.
   403  	reports <- pj
   404  	if prevState != pj.Status.State {
   405  		c.log.WithFields(pjutil.ProwJobFields(&pj)).
   406  			WithField("from", prevState).
   407  			WithField("to", pj.Status.State).Info("Transitioning states.")
   408  	}
   409  	_, err := c.kc.ReplaceProwJob(pj.ObjectMeta.Name, pj)
   410  	return err
   411  }
   412  
   413  func (c *Controller) syncTriggeredJob(pj kube.ProwJob, reports chan<- kube.ProwJob, jbs map[string]Build) error {
   414  	// Record last known state so we can log state transitions.
   415  	prevState := pj.Status.State
   416  
   417  	if _, jbExists := jbs[pj.ObjectMeta.Name]; !jbExists {
   418  		// Do not start more jobs than specified.
   419  		if !c.canExecuteConcurrently(&pj) {
   420  			return nil
   421  		}
   422  		buildID, err := c.getBuildID(pj.Spec.Job)
   423  		if err != nil {
   424  			return fmt.Errorf("error getting build ID: %v", err)
   425  		}
   426  		// Start the Jenkins job.
   427  		if err := c.jc.Build(&pj, buildID); err != nil {
   428  			c.log.WithError(err).WithFields(pjutil.ProwJobFields(&pj)).Warn("Cannot start Jenkins build")
   429  			pj.SetComplete()
   430  			pj.Status.State = kube.ErrorState
   431  			pj.Status.URL = testInfra
   432  			pj.Status.Description = "Error starting Jenkins job."
   433  		} else {
   434  			pj.Status.State = kube.PendingState
   435  			pj.Status.Description = "Jenkins job enqueued."
   436  		}
   437  	} else {
   438  		// If a Jenkins build already exists for this job, advance the ProwJob to Pending and
   439  		// it should be handled by syncPendingJob in the next sync.
   440  		pj.Status.State = kube.PendingState
   441  		pj.Status.Description = "Jenkins job enqueued."
   442  	}
   443  	// Report to Github.
   444  	reports <- pj
   445  
   446  	if prevState != pj.Status.State {
   447  		c.log.WithFields(pjutil.ProwJobFields(&pj)).
   448  			WithField("from", prevState).
   449  			WithField("to", pj.Status.State).Info("Transitioning states.")
   450  	}
   451  	_, err := c.kc.ReplaceProwJob(pj.ObjectMeta.Name, pj)
   452  	return err
   453  }
   454  
   455  func (c *Controller) getBuildID(name string) (string, error) {
   456  	if c.totURL == "" {
   457  		return c.node.Generate().String(), nil
   458  	}
   459  	return pjutil.GetBuildID(name, c.totURL)
   460  }
   461  
   462  // RunAfterSuccessCanRun returns whether a child job (specified as run_after_success in the
   463  // prow config) can run once its parent job succeeds. The only case we will not run a child job
   464  // is when it is a presubmit job and has a run_if_changed regular expression specified which does
   465  // not match the changed filenames in the pull request the job was meant to run for.
   466  // TODO: Collapse with plank, impossible to reuse as is due to the interfaces.
   467  func (c *Controller) RunAfterSuccessCanRun(parent, child *kube.ProwJob, ca configAgent, ghc githubClient) bool {
   468  	if parent.Spec.Type != kube.PresubmitJob {
   469  		return true
   470  	}
   471  
   472  	// TODO: Make sure that parent and child have always the same org/repo.
   473  	org := parent.Spec.Refs.Org
   474  	repo := parent.Spec.Refs.Repo
   475  	prNum := parent.Spec.Refs.Pulls[0].Number
   476  
   477  	ps := ca.Config().GetPresubmit(org+"/"+repo, child.Spec.Job)
   478  	if ps == nil {
   479  		// The config has changed ever since we started the parent.
   480  		// Not sure what is more correct here. Run the child for now.
   481  		return true
   482  	}
   483  	if ps.RunIfChanged == "" {
   484  		return true
   485  	}
   486  	changesFull, err := ghc.GetPullRequestChanges(org, repo, prNum)
   487  	if err != nil {
   488  		c.log.WithError(err).WithFields(pjutil.ProwJobFields(parent)).Warnf("Cannot get PR changes for #%d", prNum)
   489  		return true
   490  	}
   491  	// We only care about the filenames here
   492  	var changes []string
   493  	for _, change := range changesFull {
   494  		changes = append(changes, change.Filename)
   495  	}
   496  	return ps.RunsAgainstChanges(changes)
   497  }