sigs.k8s.io/prow@v0.0.0-20240503223140-c5e374dc7eb1/pkg/jenkins/controller.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package jenkins
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"fmt"
    23  	"strconv"
    24  	"sync"
    25  
    26  	"github.com/bwmarrin/snowflake"
    27  	"github.com/sirupsen/logrus"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	ktypes "k8s.io/apimachinery/pkg/types"
    30  	"k8s.io/utils/clock"
    31  	prowv1 "sigs.k8s.io/prow/pkg/client/clientset/versioned/typed/prowjobs/v1"
    32  
    33  	prowapi "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
    34  	"sigs.k8s.io/prow/pkg/config"
    35  	"sigs.k8s.io/prow/pkg/github"
    36  	reportlib "sigs.k8s.io/prow/pkg/github/report"
    37  	"sigs.k8s.io/prow/pkg/kube"
    38  	"sigs.k8s.io/prow/pkg/pjutil"
    39  )
    40  
    41  type prowJobClient interface {
    42  	Create(context.Context, *prowapi.ProwJob, metav1.CreateOptions) (*prowapi.ProwJob, error)
    43  	List(context.Context, metav1.ListOptions) (*prowapi.ProwJobList, error)
    44  	Patch(ctx context.Context, name string, pt ktypes.PatchType, data []byte, o metav1.PatchOptions, subresources ...string) (result *prowapi.ProwJob, err error)
    45  }
    46  
    47  type jenkinsClient interface {
    48  	Build(*prowapi.ProwJob, string) error
    49  	ListBuilds(jobs []BuildQueryParams) (map[string]Build, error)
    50  	Abort(job string, build *Build) error
    51  }
    52  
// githubClient is the GitHub functionality the controller depends on: the
// client interface required by the report library, plus retrieval of the
// changes of a pull request.
type githubClient interface {
	reportlib.GitHubClient
	GetPullRequestChanges(org, repo string, number int) ([]github.PullRequestChange, error)
}
    57  
    58  type syncFn func(prowapi.ProwJob, chan<- prowapi.ProwJob, map[string]Build) error
    59  
// Controller manages ProwJobs.
//
// It lists ProwJobs through prowJobClient (filtered by selector), drives the
// corresponding Jenkins builds through jc and, unless skipReport is set,
// reports job status to GitHub through ghc. node generates build IDs when
// totURL is empty; otherwise build IDs are requested using totURL. clock
// supplies the timestamps recorded when a job becomes pending.
type Controller struct {
	prowJobClient prowJobClient
	jc            jenkinsClient
	ghc           githubClient
	log           *logrus.Entry
	cfg           config.Getter
	node          *snowflake.Node
	totURL        string
	// skipReport disables reporting of job results to GitHub when true.
	skipReport bool
	// selector is the label selector applied when listing prowjobs. It is
	// also used to pick the matching Jenkins operator configuration.
	selector string

	// lock is held by the sync workers while they read or update pendingJobs.
	lock sync.RWMutex
	// pendingJobs is a short-lived cache that helps in limiting
	// the maximum concurrency of jobs. It maps a job name to the number of
	// its builds counted as pending and is recreated on every Sync.
	pendingJobs map[string]int

	// pjLock guards pjs.
	pjLock sync.RWMutex
	// pjs holds the prowjobs returned by the most recent list call. It is
	// shared across the controller and a goroutine that gathers metrics.
	pjs   []prowapi.ProwJob
	clock clock.WithTickerAndDelayedExecution
}
    84  
    85  // NewController creates a new Controller from the provided clients.
    86  func NewController(prowJobClient prowv1.ProwJobInterface, jc *Client, ghc github.Client, logger *logrus.Entry, cfg config.Getter, totURL, selector string, skipReport bool) (*Controller, error) {
    87  	n, err := snowflake.NewNode(1)
    88  	if err != nil {
    89  		return nil, err
    90  	}
    91  	if logger == nil {
    92  		logger = logrus.NewEntry(logrus.StandardLogger())
    93  	}
    94  	return &Controller{
    95  		prowJobClient: prowJobClient,
    96  		jc:            jc,
    97  		ghc:           ghc,
    98  		log:           logger,
    99  		cfg:           cfg,
   100  		selector:      selector,
   101  		node:          n,
   102  		totURL:        totURL,
   103  		skipReport:    skipReport,
   104  		pendingJobs:   make(map[string]int),
   105  		clock:         clock.RealClock{},
   106  	}, nil
   107  }
   108  
   109  func (c *Controller) config() config.Controller {
   110  	operators := c.cfg().JenkinsOperators
   111  	if len(operators) == 1 {
   112  		return operators[0].Controller
   113  	}
   114  	configured := make([]string, 0, len(operators))
   115  	for _, cfg := range operators {
   116  		if cfg.LabelSelectorString == c.selector {
   117  			return cfg.Controller
   118  		}
   119  		configured = append(configured, cfg.LabelSelectorString)
   120  	}
   121  	if len(c.selector) == 0 {
   122  		c.log.Panicf("You need to specify a non-empty --label-selector (existing selectors: %v).", configured)
   123  	} else {
   124  		c.log.Panicf("No config exists for --label-selector=%s.", c.selector)
   125  	}
   126  	return config.Controller{}
   127  }
   128  
   129  // canExecuteConcurrently checks whether the provided ProwJob can
   130  // be executed concurrently.
   131  func (c *Controller) canExecuteConcurrently(pj *prowapi.ProwJob) bool {
   132  	c.lock.Lock()
   133  	defer c.lock.Unlock()
   134  
   135  	if max := c.config().MaxConcurrency; max > 0 {
   136  		var running int
   137  		for _, num := range c.pendingJobs {
   138  			running += num
   139  		}
   140  		if running >= max {
   141  			c.log.WithFields(pjutil.ProwJobFields(pj)).Debugf("Not starting another job, already %d running.", running)
   142  			return false
   143  		}
   144  	}
   145  
   146  	if pj.Spec.MaxConcurrency == 0 {
   147  		c.pendingJobs[pj.Spec.Job]++
   148  		return true
   149  	}
   150  
   151  	numPending := c.pendingJobs[pj.Spec.Job]
   152  	if numPending >= pj.Spec.MaxConcurrency {
   153  		c.log.WithFields(pjutil.ProwJobFields(pj)).Debugf("Not starting another instance of %s, already %d running.", pj.Spec.Job, numPending)
   154  		return false
   155  	}
   156  	c.pendingJobs[pj.Spec.Job]++
   157  	return true
   158  }
   159  
   160  // incrementNumPendingJobs increments the amount of
   161  // pending ProwJobs for the given job identifier
   162  func (c *Controller) incrementNumPendingJobs(job string) {
   163  	c.lock.Lock()
   164  	defer c.lock.Unlock()
   165  	c.pendingJobs[job]++
   166  }
   167  
   168  // Sync does one sync iteration.
   169  func (c *Controller) Sync() error {
   170  	pjs, err := c.prowJobClient.List(context.TODO(), metav1.ListOptions{LabelSelector: c.selector})
   171  	if err != nil {
   172  		return fmt.Errorf("error listing prow jobs: %w", err)
   173  	}
   174  	// Share what we have for gathering metrics.
   175  	c.pjLock.Lock()
   176  	c.pjs = pjs.Items
   177  	c.pjLock.Unlock()
   178  
   179  	// TODO: Replace the following filtering with a field selector once CRDs support field selectors.
   180  	// https://github.com/kubernetes/kubernetes/issues/53459
   181  	var jenkinsJobs []prowapi.ProwJob
   182  	for _, pj := range pjs.Items {
   183  		if pj.Spec.Agent == prowapi.JenkinsAgent {
   184  			jenkinsJobs = append(jenkinsJobs, pj)
   185  		}
   186  	}
   187  	jbs, err := c.jc.ListBuilds(getJenkinsJobs(jenkinsJobs))
   188  	if err != nil {
   189  		return fmt.Errorf("error listing jenkins builds: %w", err)
   190  	}
   191  
   192  	var syncErrs []error
   193  	if err := c.terminateDupes(jenkinsJobs, jbs); err != nil {
   194  		syncErrs = append(syncErrs, err)
   195  	}
   196  
   197  	pendingCh, triggeredCh, abortedCh := pjutil.PartitionActive(jenkinsJobs)
   198  	errCh := make(chan error, len(jenkinsJobs))
   199  	reportCh := make(chan prowapi.ProwJob, len(jenkinsJobs))
   200  
   201  	// Reinstantiate on every resync of the controller instead of trying
   202  	// to keep this in sync with the state of the world.
   203  	c.pendingJobs = make(map[string]int)
   204  	// Sync pending jobs first so we can determine what is the maximum
   205  	// number of new jobs we can trigger when syncing the non-pendings.
   206  	maxSyncRoutines := c.config().MaxGoroutines
   207  	c.log.Debugf("Handling %d pending prowjobs", len(pendingCh))
   208  	syncProwJobs(c.log, c.syncPendingJob, maxSyncRoutines, pendingCh, reportCh, errCh, jbs)
   209  	c.log.Debugf("Handling %d triggered prowjobs", len(triggeredCh))
   210  	syncProwJobs(c.log, c.syncTriggeredJob, maxSyncRoutines, triggeredCh, reportCh, errCh, jbs)
   211  	c.log.Debugf("Handling %d aborted prowjobs", len(abortedCh))
   212  	syncProwJobs(c.log, c.syncAbortedJob, maxSyncRoutines, abortedCh, reportCh, errCh, jbs)
   213  
   214  	close(errCh)
   215  	close(reportCh)
   216  
   217  	for err := range errCh {
   218  		syncErrs = append(syncErrs, err)
   219  	}
   220  
   221  	var reportErrs []error
   222  	if !c.skipReport {
   223  		reportConfig := c.cfg().GitHubReporter
   224  		jConfig := c.config()
   225  		for report := range reportCh {
   226  			reportTemplate := jConfig.ReportTemplateForRepo(report.Spec.Refs)
   227  			if err := reportlib.Report(context.Background(), c.ghc, reportTemplate, report, reportConfig); err != nil {
   228  				reportErrs = append(reportErrs, err)
   229  				c.log.WithFields(pjutil.ProwJobFields(&report)).WithError(err).Warn("Failed to report ProwJob status")
   230  			}
   231  		}
   232  	}
   233  
   234  	if len(syncErrs) == 0 && len(reportErrs) == 0 {
   235  		return nil
   236  	}
   237  	return fmt.Errorf("errors syncing: %v, errors reporting: %v", syncErrs, reportErrs)
   238  }
   239  
   240  // SyncMetrics records metrics for the cached prowjobs.
   241  func (c *Controller) SyncMetrics() {
   242  	c.pjLock.RLock()
   243  	defer c.pjLock.RUnlock()
   244  	kube.GatherProwJobMetrics(c.log, c.pjs)
   245  }
   246  
   247  // getJenkinsJobs returns all the Jenkins jobs for all active
   248  // prowjobs from the provided list. It handles deduplication.
   249  func getJenkinsJobs(pjs []prowapi.ProwJob) []BuildQueryParams {
   250  	jenkinsJobs := []BuildQueryParams{}
   251  
   252  	for _, pj := range pjs {
   253  		if pj.Complete() {
   254  			continue
   255  		}
   256  
   257  		jenkinsJobs = append(jenkinsJobs, BuildQueryParams{
   258  			JobName:   getJobName(&pj.Spec),
   259  			ProwJobID: pj.Name,
   260  		})
   261  	}
   262  
   263  	return jenkinsJobs
   264  }
   265  
   266  // terminateDupes aborts presubmits that have a newer version. It modifies pjs
   267  // in-place when it aborts.
   268  func (c *Controller) terminateDupes(pjs []prowapi.ProwJob, jbs map[string]Build) error {
   269  	// "job org/repo#number" -> newest job
   270  	dupes := make(map[string]int)
   271  	for i, pj := range pjs {
   272  		if pj.Complete() || pj.Spec.Type != prowapi.PresubmitJob {
   273  			continue
   274  		}
   275  		n := fmt.Sprintf("%s %s/%s#%d", pj.Spec.Job, pj.Spec.Refs.Org, pj.Spec.Refs.Repo, pj.Spec.Refs.Pulls[0].Number)
   276  		prev, ok := dupes[n]
   277  		if !ok {
   278  			dupes[n] = i
   279  			continue
   280  		}
   281  		cancelIndex := i
   282  		if (&pjs[prev].Status.StartTime).Before(&pj.Status.StartTime) {
   283  			cancelIndex = prev
   284  			dupes[n] = i
   285  		}
   286  		toCancel := pjs[cancelIndex]
   287  
   288  		// Abort presubmit jobs for commits that have been superseded by
   289  		// newer commits in GitHub pull requests.
   290  		build, buildExists := jbs[toCancel.ObjectMeta.Name]
   291  		// Avoid cancelling enqueued builds.
   292  		if buildExists && build.IsEnqueued() {
   293  			continue
   294  		}
   295  		// Otherwise, abort it.
   296  		if buildExists {
   297  			if err := c.jc.Abort(getJobName(&toCancel.Spec), &build); err != nil {
   298  				c.log.WithError(err).WithFields(pjutil.ProwJobFields(&toCancel)).Warn("Cannot cancel Jenkins build")
   299  			}
   300  		}
   301  
   302  		srcPJ := toCancel.DeepCopy()
   303  		toCancel.SetComplete()
   304  		prevState := toCancel.Status.State
   305  		toCancel.Status.State = prowapi.AbortedState
   306  		toCancel.Status.Description = "Aborted as the newer version of this job is running."
   307  		c.log.WithFields(pjutil.ProwJobFields(&toCancel)).
   308  			WithField("from", prevState).
   309  			WithField("to", toCancel.Status.State).Info("Transitioning states.")
   310  		npj, err := pjutil.PatchProwjob(context.TODO(), c.prowJobClient, c.log, *srcPJ, toCancel)
   311  		if err != nil {
   312  			return err
   313  		}
   314  		pjs[cancelIndex] = *npj
   315  	}
   316  	return nil
   317  }
   318  
   319  func syncProwJobs(
   320  	l *logrus.Entry,
   321  	syncFn syncFn,
   322  	maxSyncRoutines int,
   323  	jobs <-chan prowapi.ProwJob,
   324  	reports chan<- prowapi.ProwJob,
   325  	syncErrors chan<- error,
   326  	jbs map[string]Build,
   327  ) {
   328  	goroutines := maxSyncRoutines
   329  	if goroutines > len(jobs) {
   330  		goroutines = len(jobs)
   331  	}
   332  	wg := &sync.WaitGroup{}
   333  	wg.Add(goroutines)
   334  	l.Debugf("Firing up %d goroutines", goroutines)
   335  	for i := 0; i < goroutines; i++ {
   336  		go func() {
   337  			defer wg.Done()
   338  			for pj := range jobs {
   339  				if err := syncFn(pj, reports, jbs); err != nil {
   340  					syncErrors <- err
   341  				}
   342  			}
   343  		}()
   344  	}
   345  	wg.Wait()
   346  }
   347  
   348  func (c *Controller) syncPendingJob(pj prowapi.ProwJob, reports chan<- prowapi.ProwJob, jbs map[string]Build) error {
   349  	// Record last known state so we can patch
   350  	prevPJ := pj.DeepCopy()
   351  
   352  	jb, jbExists := jbs[pj.ObjectMeta.Name]
   353  	if !jbExists {
   354  		pj.SetComplete()
   355  		pj.Status.State = prowapi.ErrorState
   356  		pj.Status.URL = c.cfg().StatusErrorLink
   357  		pj.Status.Description = "Error finding Jenkins job."
   358  	} else {
   359  		switch {
   360  		case jb.IsEnqueued():
   361  			// Still in queue.
   362  			c.incrementNumPendingJobs(pj.Spec.Job)
   363  			return nil
   364  
   365  		case jb.IsRunning():
   366  			// Build still going.
   367  			c.incrementNumPendingJobs(pj.Spec.Job)
   368  			if pj.Status.Description == "Jenkins job running." {
   369  				return nil
   370  			}
   371  			pj.Status.Description = "Jenkins job running."
   372  
   373  		case jb.IsSuccess():
   374  			// Build is complete.
   375  			pj.SetComplete()
   376  			pj.Status.State = prowapi.SuccessState
   377  			pj.Status.Description = "Jenkins job succeeded."
   378  
   379  		case jb.IsFailure():
   380  			pj.SetComplete()
   381  			pj.Status.State = prowapi.FailureState
   382  			pj.Status.Description = "Jenkins job failed."
   383  
   384  		case jb.IsAborted():
   385  			pj.SetComplete()
   386  			pj.Status.State = prowapi.AbortedState
   387  			pj.Status.Description = "Jenkins job aborted."
   388  		}
   389  		// Construct the status URL that will be used in reports.
   390  		pj.Status.PodName = pj.ObjectMeta.Name
   391  		pj.Status.BuildID = jb.BuildID()
   392  		pj.Status.JenkinsBuildID = strconv.Itoa(jb.Number)
   393  		var b bytes.Buffer
   394  		if err := c.config().JobURLTemplate.Execute(&b, &pj); err != nil {
   395  			c.log.WithFields(pjutil.ProwJobFields(&pj)).Errorf("error executing URL template: %v", err)
   396  		} else {
   397  			pj.Status.URL = b.String()
   398  		}
   399  	}
   400  	// Report to GitHub.
   401  	reports <- pj
   402  	if prevPJ.Status.State != pj.Status.State {
   403  		c.log.WithFields(pjutil.ProwJobFields(&pj)).
   404  			WithField("from", prevPJ.Status.State).
   405  			WithField("to", pj.Status.State).Info("Transitioning states.")
   406  	}
   407  	_, err := pjutil.PatchProwjob(context.TODO(), c.prowJobClient, c.log, *prevPJ, pj)
   408  	return err
   409  }
   410  
   411  func (c *Controller) syncAbortedJob(pj prowapi.ProwJob, _ chan<- prowapi.ProwJob, jbs map[string]Build) error {
   412  	if pj.Status.State != prowapi.AbortedState || pj.Complete() {
   413  		return nil
   414  	}
   415  
   416  	if build, exists := jbs[pj.Name]; exists {
   417  		if err := c.jc.Abort(getJobName(&pj.Spec), &build); err != nil {
   418  			return fmt.Errorf("failed to abort Jenkins build: %w", err)
   419  		}
   420  	}
   421  
   422  	originalPJ := pj.DeepCopy()
   423  	pj.SetComplete()
   424  	_, err := pjutil.PatchProwjob(context.TODO(), c.prowJobClient, c.log, *originalPJ, pj)
   425  	return err
   426  }
   427  
   428  func (c *Controller) syncTriggeredJob(pj prowapi.ProwJob, reports chan<- prowapi.ProwJob, jbs map[string]Build) error {
   429  	// Record last known state so we can patch
   430  	prevPJ := pj.DeepCopy()
   431  
   432  	if _, jbExists := jbs[pj.ObjectMeta.Name]; !jbExists {
   433  		// Do not start more jobs than specified.
   434  		if !c.canExecuteConcurrently(&pj) {
   435  			return nil
   436  		}
   437  		buildID, err := c.getBuildID(pj.Spec.Job)
   438  		if err != nil {
   439  			return fmt.Errorf("error getting build ID: %w", err)
   440  		}
   441  		// Start the Jenkins job.
   442  		if err := c.jc.Build(&pj, buildID); err != nil {
   443  			c.log.WithError(err).WithFields(pjutil.ProwJobFields(&pj)).Warn("Cannot start Jenkins build")
   444  			pj.SetComplete()
   445  			pj.Status.State = prowapi.ErrorState
   446  			pj.Status.URL = c.cfg().StatusErrorLink
   447  			pj.Status.Description = "Error starting Jenkins job."
   448  		} else {
   449  			now := metav1.NewTime(c.clock.Now())
   450  			pj.Status.PendingTime = &now
   451  			pj.Status.State = prowapi.PendingState
   452  			pj.Status.Description = "Jenkins job enqueued."
   453  		}
   454  	} else {
   455  		// If a Jenkins build already exists for this job, advance the ProwJob to Pending and
   456  		// it should be handled by syncPendingJob in the next sync.
   457  		if pj.Status.PendingTime == nil {
   458  			now := metav1.NewTime(c.clock.Now())
   459  			pj.Status.PendingTime = &now
   460  		}
   461  		pj.Status.State = prowapi.PendingState
   462  		pj.Status.Description = "Jenkins job enqueued."
   463  	}
   464  	// Report to GitHub.
   465  	reports <- pj
   466  
   467  	if prevPJ.Status.State != pj.Status.State {
   468  		c.log.WithFields(pjutil.ProwJobFields(&pj)).
   469  			WithField("from", prevPJ.Status.State).
   470  			WithField("to", pj.Status.State).Info("Transitioning states.")
   471  	}
   472  	_, err := pjutil.PatchProwjob(context.TODO(), c.prowJobClient, c.log, *prevPJ, pj)
   473  	return err
   474  }
   475  
   476  func (c *Controller) getBuildID(name string) (string, error) {
   477  	if c.totURL == "" {
   478  		return c.node.Generate().String(), nil
   479  	}
   480  	return pjutil.GetBuildID(name, c.totURL)
   481  }