github.com/shashidharatd/test-infra@v0.0.0-20171006011030-71304e1ca560/prow/plank/controller.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package plank
    18  
    19  import (
    20  	"bytes"
    21  	"fmt"
    22  	"io/ioutil"
    23  	"net/http"
    24  	"sync"
    25  	"time"
    26  
    27  	"github.com/bwmarrin/snowflake"
    28  	"github.com/sirupsen/logrus"
    29  
    30  	"k8s.io/test-infra/prow/config"
    31  	"k8s.io/test-infra/prow/github"
    32  	"k8s.io/test-infra/prow/kube"
    33  	"k8s.io/test-infra/prow/pjutil"
    34  	reportlib "k8s.io/test-infra/prow/report"
    35  )
    36  
    37  const (
    38  	testInfra = "https://github.com/kubernetes/test-infra/issues"
    39  
    40  	// maxSyncRoutines is the maximum number of goroutines
    41  	// that will be active at any one time for the sync
    42  	maxSyncRoutines = 20
    43  )
    44  
    45  type kubeClient interface {
    46  	CreateProwJob(kube.ProwJob) (kube.ProwJob, error)
    47  	ListProwJobs(map[string]string) ([]kube.ProwJob, error)
    48  	ReplaceProwJob(string, kube.ProwJob) (kube.ProwJob, error)
    49  
    50  	CreatePod(kube.Pod) (kube.Pod, error)
    51  	ListPods(map[string]string) ([]kube.Pod, error)
    52  	DeletePod(string) error
    53  }
    54  
    55  type githubClient interface {
    56  	BotName() (string, error)
    57  	CreateStatus(org, repo, ref string, s github.Status) error
    58  	ListIssueComments(org, repo string, number int) ([]github.IssueComment, error)
    59  	CreateComment(org, repo string, number int, comment string) error
    60  	DeleteComment(org, repo string, ID int) error
    61  	EditComment(org, repo string, ID int, comment string) error
    62  	GetPullRequestChanges(org, repo string, number int) ([]github.PullRequestChange, error)
    63  }
    64  
    65  type configAgent interface {
    66  	Config() *config.Config
    67  }
    68  
    69  // TODO: Dry this out
    70  type syncFn func(pj kube.ProwJob, pm map[string]kube.Pod, reports chan<- kube.ProwJob) error
    71  
    72  // Controller manages ProwJobs.
    73  type Controller struct {
    74  	kc     kubeClient
    75  	pkc    kubeClient
    76  	ghc    githubClient
    77  	ca     configAgent
    78  	node   *snowflake.Node
    79  	totURL string
    80  
    81  	lock sync.RWMutex
    82  	// pendingJobs is a short-lived cache that helps in limiting
    83  	// the maximum concurrency of jobs.
    84  	pendingJobs map[string]int
    85  }
    86  
    87  // canExecuteConcurrently checks whether the provided ProwJob can
    88  // be executed concurrently.
    89  func (c *Controller) canExecuteConcurrently(pj *kube.ProwJob) bool {
    90  	c.lock.Lock()
    91  	defer c.lock.Unlock()
    92  
    93  	if max := c.ca.Config().Plank.MaxConcurrency; max > 0 {
    94  		var running int
    95  		for _, num := range c.pendingJobs {
    96  			running += num
    97  		}
    98  		if running >= max {
    99  			logrus.Infof("Not starting another job, already %d running.", running)
   100  			return false
   101  		}
   102  	}
   103  
   104  	if pj.Spec.MaxConcurrency == 0 {
   105  		c.pendingJobs[pj.Spec.Job]++
   106  		return true
   107  	}
   108  
   109  	numPending := c.pendingJobs[pj.Spec.Job]
   110  	if numPending >= pj.Spec.MaxConcurrency {
   111  		logrus.WithField("job", pj.Spec.Job).Infof("Not starting another instance of %s, already %d running.", pj.Spec.Job, numPending)
   112  		return false
   113  	}
   114  	c.pendingJobs[pj.Spec.Job]++
   115  	return true
   116  }
   117  
   118  // incrementNumPendingJobs increments the amount of
   119  // pending ProwJobs for the given job identifier
   120  func (c *Controller) incrementNumPendingJobs(job string) {
   121  	c.lock.Lock()
   122  	defer c.lock.Unlock()
   123  	c.pendingJobs[job]++
   124  }
   125  
   126  // NewController creates a new Controller from the provided clients.
   127  func NewController(kc, pkc *kube.Client, ghc *github.Client, ca *config.Agent, totURL string) (*Controller, error) {
   128  	n, err := snowflake.NewNode(1)
   129  	if err != nil {
   130  		return nil, err
   131  	}
   132  	return &Controller{
   133  		kc:          kc,
   134  		pkc:         pkc,
   135  		ghc:         ghc,
   136  		ca:          ca,
   137  		node:        n,
   138  		pendingJobs: make(map[string]int),
   139  		lock:        sync.RWMutex{},
   140  		totURL:      totURL,
   141  	}, nil
   142  }
   143  
   144  // Sync does one sync iteration.
   145  func (c *Controller) Sync() error {
   146  	pjs, err := c.kc.ListProwJobs(nil)
   147  	if err != nil {
   148  		return fmt.Errorf("error listing prow jobs: %v", err)
   149  	}
   150  	labels := map[string]string{kube.CreatedByProw: "true"}
   151  	pods, err := c.pkc.ListPods(labels)
   152  	if err != nil {
   153  		return fmt.Errorf("error listing pods: %v", err)
   154  	}
   155  	pm := map[string]kube.Pod{}
   156  	for _, pod := range pods {
   157  		pm[pod.Metadata.Name] = pod
   158  	}
   159  
   160  	var k8sJobs []kube.ProwJob
   161  	for _, pj := range pjs {
   162  		if pj.Spec.Agent == kube.KubernetesAgent {
   163  			k8sJobs = append(k8sJobs, pj)
   164  		}
   165  	}
   166  	pjs = k8sJobs
   167  
   168  	var syncErrs []error
   169  	if err := c.terminateDupes(pjs, pm); err != nil {
   170  		syncErrs = append(syncErrs, err)
   171  	}
   172  
   173  	pendingCh, nonPendingCh := pjutil.PartitionPending(pjs)
   174  	errCh := make(chan error, len(pjs))
   175  	reportCh := make(chan kube.ProwJob, len(pjs))
   176  
   177  	// Reinstantiate on every resync of the controller instead of trying
   178  	// to keep this in sync with the state of the world.
   179  	c.pendingJobs = make(map[string]int)
   180  	// Sync pending jobs first so we can determine what is the maximum
   181  	// number of new jobs we can trigger when syncing the non-pendings.
   182  	syncProwJobs(c.syncPendingJob, pendingCh, reportCh, errCh, pm)
   183  	syncProwJobs(c.syncNonPendingJob, nonPendingCh, reportCh, errCh, pm)
   184  
   185  	close(errCh)
   186  	close(reportCh)
   187  
   188  	for err := range errCh {
   189  		syncErrs = append(syncErrs, err)
   190  	}
   191  
   192  	var reportErrs []error
   193  	reportTemplate := c.ca.Config().Plank.ReportTemplate
   194  	for report := range reportCh {
   195  		if err := reportlib.Report(c.ghc, reportTemplate, report); err != nil {
   196  			reportErrs = append(reportErrs, err)
   197  		}
   198  	}
   199  
   200  	if len(syncErrs) == 0 && len(reportErrs) == 0 {
   201  		return nil
   202  	}
   203  	return fmt.Errorf("errors syncing: %v, errors reporting: %v", syncErrs, reportErrs)
   204  }
   205  
   206  // terminateDupes aborts presubmits that have a newer version. It modifies pjs
   207  // in-place when it aborts.
   208  // TODO: Dry this out - need to ensure we can abstract children cancellation first.
   209  func (c *Controller) terminateDupes(pjs []kube.ProwJob, pm map[string]kube.Pod) error {
   210  	// "job org/repo#number" -> newest job
   211  	dupes := make(map[string]int)
   212  	for i, pj := range pjs {
   213  		if pj.Complete() || pj.Spec.Type != kube.PresubmitJob {
   214  			continue
   215  		}
   216  		n := fmt.Sprintf("%s %s/%s#%d", pj.Spec.Job, pj.Spec.Refs.Org, pj.Spec.Refs.Repo, pj.Spec.Refs.Pulls[0].Number)
   217  		prev, ok := dupes[n]
   218  		if !ok {
   219  			dupes[n] = i
   220  			continue
   221  		}
   222  		cancelIndex := i
   223  		if pjs[prev].Status.StartTime.Before(pj.Status.StartTime) {
   224  			cancelIndex = prev
   225  			dupes[n] = i
   226  		}
   227  		toCancel := pjs[cancelIndex]
   228  		// Allow aborting presubmit jobs for commits that have been superseded by
   229  		// newer commits in Github pull requests.
   230  		if c.ca.Config().Plank.AllowCancellations {
   231  			if pod, exists := pm[toCancel.Metadata.Name]; exists {
   232  				if err := c.pkc.DeletePod(pod.Metadata.Name); err != nil {
   233  					logrus.Warningf("Cannot cancel pod for prowjob %q: %v", toCancel.Metadata.Name, err)
   234  				}
   235  			}
   236  		}
   237  		toCancel.Status.CompletionTime = time.Now()
   238  		toCancel.Status.State = kube.AbortedState
   239  		npj, err := c.kc.ReplaceProwJob(toCancel.Metadata.Name, toCancel)
   240  		if err != nil {
   241  			return err
   242  		}
   243  		pjs[cancelIndex] = npj
   244  	}
   245  	return nil
   246  }
   247  
   248  // TODO: Dry this out
   249  func syncProwJobs(syncFn syncFn, jobs <-chan kube.ProwJob, reports chan<- kube.ProwJob, syncErrors chan<- error, pm map[string]kube.Pod) {
   250  	wg := &sync.WaitGroup{}
   251  	wg.Add(maxSyncRoutines)
   252  	for i := 0; i < maxSyncRoutines; i++ {
   253  		go func(jobs <-chan kube.ProwJob) {
   254  			defer wg.Done()
   255  			for pj := range jobs {
   256  				if err := syncFn(pj, pm, reports); err != nil {
   257  					syncErrors <- err
   258  				}
   259  			}
   260  		}(jobs)
   261  	}
   262  	wg.Wait()
   263  }
   264  
   265  func (c *Controller) syncPendingJob(pj kube.ProwJob, pm map[string]kube.Pod, reports chan<- kube.ProwJob) error {
   266  	pod, podExists := pm[pj.Metadata.Name]
   267  	if !podExists {
   268  		c.incrementNumPendingJobs(pj.Spec.Job)
   269  		// Pod is missing. This can happen in case we deleted the previous pod because
   270  		// it was stuck in Unknown/Evicted state due to a node problem or the pod was
   271  		// deleted manually. Start a new pod.
   272  		id, pn, err := c.startPod(pj)
   273  		if err != nil {
   274  			_, isUnprocessable := err.(kube.UnprocessableEntityError)
   275  			if !isUnprocessable {
   276  				return fmt.Errorf("error starting pod: %v", err)
   277  			}
   278  			pj.Status.State = kube.ErrorState
   279  			pj.Status.CompletionTime = time.Now()
   280  			pj.Status.Description = "Job cannot be processed."
   281  			logrus.WithField("job", pj.Spec.Job).WithError(err).Warning("Unprocessable pod.")
   282  		} else {
   283  			pj.Status.BuildID = id
   284  			pj.Status.PodName = pn
   285  		}
   286  	} else {
   287  		switch pod.Status.Phase {
   288  		case kube.PodUnknown:
   289  			c.incrementNumPendingJobs(pj.Spec.Job)
   290  			// Pod is in Unknown state. This can happen if there is a problem with
   291  			// the node. Delete the old pod, we'll start a new one next loop.
   292  			return c.pkc.DeletePod(pj.Metadata.Name)
   293  
   294  		case kube.PodSucceeded:
   295  			// Pod succeeded. Update ProwJob, talk to GitHub, and start next jobs.
   296  			pj.Status.CompletionTime = time.Now()
   297  			pj.Status.State = kube.SuccessState
   298  			pj.Status.Description = "Job succeeded."
   299  			for _, nj := range pj.Spec.RunAfterSuccess {
   300  				child := pjutil.NewProwJob(nj)
   301  				if !RunAfterSuccessCanRun(&pj, &child, c.ca, c.ghc) {
   302  					continue
   303  				}
   304  				if _, err := c.kc.CreateProwJob(pjutil.NewProwJob(nj)); err != nil {
   305  					return fmt.Errorf("error starting next prowjob: %v", err)
   306  				}
   307  			}
   308  
   309  		case kube.PodFailed:
   310  			if pod.Status.Reason == kube.Evicted {
   311  				c.incrementNumPendingJobs(pj.Spec.Job)
   312  				// Pod was evicted. We will recreate it in the next resync.
   313  				return c.pkc.DeletePod(pj.Metadata.Name)
   314  			}
   315  			// Pod failed. Update ProwJob, talk to GitHub.
   316  			pj.Status.CompletionTime = time.Now()
   317  			pj.Status.State = kube.FailureState
   318  			pj.Status.Description = "Job failed."
   319  
   320  		default:
   321  			// Pod is running. Do nothing.
   322  			c.incrementNumPendingJobs(pj.Spec.Job)
   323  			return nil
   324  		}
   325  	}
   326  
   327  	var b bytes.Buffer
   328  	if err := c.ca.Config().Plank.JobURLTemplate.Execute(&b, &pj); err != nil {
   329  		return fmt.Errorf("error executing URL template: %v", err)
   330  	}
   331  	pj.Status.URL = b.String()
   332  	reports <- pj
   333  
   334  	_, err := c.kc.ReplaceProwJob(pj.Metadata.Name, pj)
   335  	return err
   336  }
   337  
   338  func (c *Controller) syncNonPendingJob(pj kube.ProwJob, pm map[string]kube.Pod, reports chan<- kube.ProwJob) error {
   339  	if pj.Complete() {
   340  		return nil
   341  	}
   342  
   343  	// The rest are new prowjobs.
   344  
   345  	var id, pn string
   346  	pod, podExists := pm[pj.Metadata.Name]
   347  	// We may end up in a state where the pod exists but the prowjob is not
   348  	// updated to pending if we successfully create a new pod in a previous
   349  	// sync but the prowjob update fails. Simply ignore creating a new pod
   350  	// and rerun the prowjob update.
   351  	if !podExists {
   352  		// Do not start more jobs than specified.
   353  		if !c.canExecuteConcurrently(&pj) {
   354  			return nil
   355  		}
   356  		// We haven't started the pod yet. Do so.
   357  		var err error
   358  		id, pn, err = c.startPod(pj)
   359  		if err != nil {
   360  			_, isUnprocessable := err.(kube.UnprocessableEntityError)
   361  			if !isUnprocessable {
   362  				return fmt.Errorf("error starting pod: %v", err)
   363  			}
   364  			pj.Status.State = kube.ErrorState
   365  			pj.Status.CompletionTime = time.Now()
   366  			pj.Status.Description = "Job cannot be processed."
   367  			logrus.WithField("job", pj.Spec.Job).WithError(err).Warning("Unprocessable pod.")
   368  		}
   369  	} else {
   370  		id = getPodBuildID(&pod)
   371  		pn = pod.Metadata.Name
   372  	}
   373  
   374  	if pj.Status.State == kube.TriggeredState {
   375  		// BuildID needs to be set before we execute the job url template.
   376  		pj.Status.BuildID = id
   377  		pj.Status.State = kube.PendingState
   378  		pj.Status.PodName = pn
   379  		pj.Status.Description = "Job triggered."
   380  		var b bytes.Buffer
   381  		if err := c.ca.Config().Plank.JobURLTemplate.Execute(&b, &pj); err != nil {
   382  			return fmt.Errorf("error executing URL template: %v", err)
   383  		}
   384  		pj.Status.URL = b.String()
   385  	}
   386  	reports <- pj
   387  
   388  	_, err := c.kc.ReplaceProwJob(pj.Metadata.Name, pj)
   389  	return err
   390  }
   391  
   392  // TODO: No need to return the pod name since we already have the
   393  // prowjob in the call site.
   394  func (c *Controller) startPod(pj kube.ProwJob) (string, string, error) {
   395  	buildID, err := c.getBuildID(pj.Spec.Job)
   396  	if err != nil {
   397  		return "", "", fmt.Errorf("error getting build ID: %v", err)
   398  	}
   399  
   400  	pod := pjutil.ProwJobToPod(pj, buildID)
   401  
   402  	actual, err := c.pkc.CreatePod(*pod)
   403  	if err != nil {
   404  		return "", "", err
   405  	}
   406  	return buildID, actual.Metadata.Name, nil
   407  }
   408  
   409  func (c *Controller) getBuildID(name string) (string, error) {
   410  	if c.totURL == "" {
   411  		return c.node.Generate().String(), nil
   412  	}
   413  	var err error
   414  	url := c.totURL + "/vend/" + name
   415  	for retries := 0; retries < 60; retries++ {
   416  		if retries > 0 {
   417  			time.Sleep(2 * time.Second)
   418  		}
   419  		var resp *http.Response
   420  		resp, err = http.Get(url)
   421  		if err != nil {
   422  			continue
   423  		}
   424  		defer resp.Body.Close()
   425  		if resp.StatusCode != 200 {
   426  			continue
   427  		}
   428  		if buf, err := ioutil.ReadAll(resp.Body); err == nil {
   429  			return string(buf), nil
   430  		}
   431  		return "", err
   432  	}
   433  	return "", err
   434  }
   435  
   436  func getPodBuildID(pod *kube.Pod) string {
   437  	for _, env := range pod.Spec.Containers[0].Env {
   438  		if env.Name == "BUILD_NUMBER" {
   439  			return env.Value
   440  		}
   441  	}
   442  	logrus.Warningf("BUILD_NUMBER was not found in pod %q: streaming logs from deck will not work", pod.Metadata.Name)
   443  	return ""
   444  }
   445  
   446  // RunAfterSuccessCanRun returns whether a child job (specified as run_after_success in the
   447  // prow config) can run once its parent job succeeds. The only case we will not run a child job
   448  // is when it is a presubmit job and has a run_if_changed regural expression specified which does
   449  // not match the changed filenames in the pull request the job was meant to run for.
   450  // TODO: Collapse with Jenkins, impossible to reuse as is due to the interfaces.
   451  func RunAfterSuccessCanRun(parent, child *kube.ProwJob, c configAgent, ghc githubClient) bool {
   452  	if parent.Spec.Type != kube.PresubmitJob {
   453  		return true
   454  	}
   455  
   456  	// TODO: Make sure that parent and child have always the same org/repo.
   457  	org := parent.Spec.Refs.Org
   458  	repo := parent.Spec.Refs.Repo
   459  	prNum := parent.Spec.Refs.Pulls[0].Number
   460  
   461  	ps := c.Config().GetPresubmit(org+"/"+repo, child.Spec.Job)
   462  	if ps == nil {
   463  		// The config has changed ever since we started the parent.
   464  		// Not sure what is more correct here. Run the child for now.
   465  		return true
   466  	}
   467  	if ps.RunIfChanged == "" {
   468  		return true
   469  	}
   470  	changesFull, err := ghc.GetPullRequestChanges(org, repo, prNum)
   471  	if err != nil {
   472  		logrus.Warningf("Cannot get PR changes for %d: %v", prNum, err)
   473  		return true
   474  	}
   475  	// We only care about the filenames here
   476  	var changes []string
   477  	for _, change := range changesFull {
   478  		changes = append(changes, change.Filename)
   479  	}
   480  	return ps.RunsAgainstChanges(changes)
   481  }