github.com/yrj2011/jx-test-infra@v0.0.0-20190529031832-7a2065ee98eb/mungegithub/mungers/submit-queue.go (about) 1 /* 2 Copyright 2015 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package mungers 18 19 import ( 20 "bytes" 21 "encoding/json" 22 "errors" 23 "fmt" 24 "math" 25 "net/http" 26 "sort" 27 "strconv" 28 "strings" 29 "sync" 30 "sync/atomic" 31 "time" 32 33 utilclock "k8s.io/apimachinery/pkg/util/clock" 34 "k8s.io/apimachinery/pkg/util/sets" 35 36 "k8s.io/contrib/test-utils/utils" 37 "k8s.io/test-infra/mungegithub/features" 38 "k8s.io/test-infra/mungegithub/github" 39 "k8s.io/test-infra/mungegithub/mungeopts" 40 "k8s.io/test-infra/mungegithub/mungers/e2e" 41 fake_e2e "k8s.io/test-infra/mungegithub/mungers/e2e/fake" 42 "k8s.io/test-infra/mungegithub/mungers/mungerutil" 43 "k8s.io/test-infra/mungegithub/mungers/shield" 44 "k8s.io/test-infra/mungegithub/options" 45 "k8s.io/test-infra/mungegithub/sharedmux" 46 47 "github.com/NYTimes/gziphandler" 48 "github.com/golang/glog" 49 githubapi "github.com/google/go-github/github" 50 "github.com/prometheus/client_golang/prometheus" 51 ) 52 53 const ( 54 approvedLabel = "approved" 55 lgtmLabel = "lgtm" 56 retestNotRequiredLabel = "retest-not-required" 57 retestNotRequiredDocsOnlyLabel = "retest-not-required-docs-only" 58 doNotMergeLabel = "do-not-merge" 59 wipLabel = "do-not-merge/work-in-progress" 60 holdLabel = "do-not-merge/hold" 61 releaseNoteLabelNeeded = 
"do-not-merge/release-note-label-needed"
	cncfClaYesLabel   = "cncf-cla: yes"
	cncfClaNoLabel    = "cncf-cla: no"
	claHumanLabel     = "cla: human-approved"
	criticalFixLabel  = "queue/critical-fix"
	blocksOthersLabel = "queue/blocks-others"
	fixLabel          = "queue/fix"
	multirebaseLabel  = "queue/multiple-rebases"

	// sqContext is the GitHub status context the submit queue reads and writes.
	sqContext = "Submit Queue"

	// githubE2EPollTime is the default used when SubmitQueue.githubE2EPollTime is unset.
	githubE2EPollTime = 30 * time.Second
)

var (
	// This MUST cause a RETEST of everything in the mungeopts.RequiredContexts.Retest
	newRetestBody = "/test all [submit-queue is verifying that this PR is safe to merge]"

	// this is the order in which labels will be compared for queue priority
	labelPriorities = []string{
		criticalFixLabel,
		retestNotRequiredLabel,
		retestNotRequiredDocsOnlyLabel,
		multirebaseLabel,
		fixLabel,
		blocksOthersLabel,
	}
	// high priority labels are checked before the release
	lastHighPriorityLabel = 2 // retestNotRequiredDocsOnlyLabel
)

// submitStatus is one timestamped status entry for a pull request, together
// with the human readable reason for that status.
type submitStatus struct {
	Time time.Time
	statusPullRequest
	Reason string
}

// statusPullRequest is the summary of a pull request that is exposed through
// the status endpoints.
type statusPullRequest struct {
	Number    int
	URL       string
	Title     string
	Login     string
	AvatarURL string
	Additions int
	Deletions int
	ExtraInfo []string
	BaseRef   string
}

// e2eQueueStatus is the payload marshaled by getGithubE2EStatus.
type e2eQueueStatus struct {
	E2ERunning  *statusPullRequest
	E2EQueue    []*statusPullRequest
	BatchStatus *submitQueueBatchStatus
}

// submitQueueStatus is the payload marshaled by getQueueStatus; it is keyed by
// the PR number rendered as a string.
type submitQueueStatus struct {
	PRStatus map[string]submitStatus
}

// Information about the e2e test health. Call updateHealth on the SubmitQueue
// at roughly constant intervals to keep this up to date. The mergeable fraction
// of time for the queue as a whole and the individual jobs will then be
// NumStable[PerJob] / TotalLoops.
type submitQueueHealth struct {
	TotalLoops       int
	NumStable        int
	NumStablePerJob  map[string]int
	MergePossibleNow bool
}

// Generate health information using a queue of healthRecords.
The bools are 125 // true for stable and false otherwise. 126 type healthRecord struct { 127 Time time.Time 128 Overall bool 129 Jobs map[string]bool 130 } 131 132 // information about the sq itself including how fast things are merging and 133 // how long since the last merge 134 type submitQueueStats struct { 135 Added int // Number of items added to the queue since restart 136 FlakesIgnored int 137 Initialized bool // true if we've made at least one complete pass 138 InstantMerges int // Number of merges without retests required 139 BatchMerges int // Number of merges caused by batch 140 LastMergeTime time.Time 141 MergeRate float64 142 MergesSinceRestart int 143 Removed int // Number of items dequeued since restart 144 RetestsAvoided int 145 StartTime time.Time 146 Tested int // Number of e2e tests completed 147 } 148 149 // pull-request that has been tested as successful, but interrupted because head flaked 150 type submitQueueInterruptedObject struct { 151 obj *github.MungeObject 152 // If these two items match when we're about to kick off a retest, it's safe to skip the retest. 153 interruptedMergeHeadSHA string 154 interruptedMergeBaseSHA string 155 } 156 157 // Contains metadata about this instance of the submit queue such as URLs. 158 // Consumed by the template system. 159 type submitQueueMetadata struct { 160 ProjectName string 161 162 ChartURL string 163 // chartURL is an option storage location. It is distinct from ChartURL 164 // since the public variables are used asynchronously by a fileserver 165 // and updates to the options values should not cause a race condition. 
166 chartURL string 167 168 RepoPullURL string 169 ProwURL string 170 } 171 172 type submitQueueBatchStatus struct { 173 Error map[string]string 174 Running *prowJob 175 } 176 177 type prometheusMetrics struct { 178 Blocked prometheus.Gauge 179 OpenPRs prometheus.Gauge 180 QueuedPRs prometheus.Gauge 181 MergeCount prometheus.Counter 182 LastMergeTime prometheus.Gauge 183 } 184 185 var ( 186 sqPromMetrics = prometheusMetrics{ 187 Blocked: prometheus.NewGauge(prometheus.GaugeOpts{ 188 Name: "submitqueue_blocked", 189 Help: "The submit-queue is currently blocked", 190 }), 191 OpenPRs: prometheus.NewGauge(prometheus.GaugeOpts{ 192 Name: "submitqueue_open_pullrequests_total", 193 Help: "Number of open pull-requests", 194 }), 195 QueuedPRs: prometheus.NewGauge(prometheus.GaugeOpts{ 196 Name: "submitqueue_queued_pullrequests_total", 197 Help: "Number of pull-requests queued", 198 }), 199 MergeCount: prometheus.NewCounter(prometheus.CounterOpts{ 200 Name: "submitqueue_merge_total", 201 Help: "Number of merges done", 202 }), 203 LastMergeTime: prometheus.NewGauge(prometheus.GaugeOpts{ 204 Name: "submitqueue_time_of_last_merge", 205 Help: "Time of last merge", 206 }), 207 } 208 ) 209 210 // marshaled in serveCIStatus 211 type jobStatus struct { 212 State string `json:"state"` 213 BuildID string `json:"build_id"` 214 URL string `json:"url"` 215 } 216 217 // SubmitQueue will merge PR which meet a set of requirements. 
218 // PR must have LGTM after the last commit 219 // PR must have passed all github CI checks 220 // The google internal jenkins instance must be passing the BlockingJobNames e2e tests 221 type SubmitQueue struct { 222 githubConfig *github.Config 223 opts *options.Options 224 NonBlockingJobNames []string 225 226 GateApproved bool 227 GateCLA bool 228 GateGHReviewApproved bool 229 GateGHReviewChangesRequested bool 230 231 // AdditionalRequiredLabels is a set of additional labels required for merging 232 // on top of the existing required ("lgtm", "approved", "cncf-cla: yes"). 233 AdditionalRequiredLabels []string 234 235 // BlockingLabels is a set of labels that forces the submit queue to ignore 236 // pull requests. 237 BlockingLabels []string 238 239 // If FakeE2E is true, don't try to connect to JenkinsHost, all jobs are passing. 240 FakeE2E bool 241 242 // All valid cla labels 243 ClaYesLabels []string 244 245 DoNotMergeMilestones []string 246 247 Metadata submitQueueMetadata 248 AdminPort int 249 250 sync.Mutex 251 prStatus map[string]submitStatus // protected by sync.Mutex 252 statusHistory []submitStatus // protected by sync.Mutex 253 lastClosedTime time.Time 254 255 clock utilclock.Clock 256 startTime time.Time // when the queue started (duh) 257 lastMergeTime time.Time 258 totalMerges int32 259 mergeRate float64 // per 24 hours 260 loopStarts int32 // if > 1, then we must have made a complete pass. 261 262 githubE2ERunning *github.MungeObject // protect by sync.Mutex! 263 githubE2EQueue map[int]*github.MungeObject // protected by sync.Mutex! 
264 githubE2EPollTime time.Duration 265 lgtmTimeCache *mungerutil.LabelTimeCache 266 githubE2ELastPRNum int 267 268 lastE2EStable bool // was e2e stable last time they were checked, protect by sync.Mutex 269 e2e e2e.E2ETester 270 271 interruptedObj *submitQueueInterruptedObject 272 flakesIgnored int32 // Increments for each merge while 1+ job is flaky 273 instantMerges int32 // Increments whenever we merge without retesting 274 batchMerges int32 // Increments whenever we merge because of a batch 275 prsAdded int32 // Increments whenever an items queues 276 prsRemoved int32 // Increments whenever an item dequeues 277 prsTested int32 // Number of prs that completed second testing 278 retestsAvoided int32 // Increments whenever we skip due to head not changing. 279 280 health submitQueueHealth 281 healthHistory []healthRecord 282 283 emergencyMergeStopFlag int32 284 285 features *features.Features 286 287 mergeLock sync.Mutex // acquired when attempting to merge a specific PR 288 ProwURL string // prow base page 289 BatchEnabled bool 290 ContextURL string 291 batchStatus submitQueueBatchStatus 292 ciStatus map[string]map[string]jobStatus // type (eg batch) : job : status 293 294 // MergeToMasterMessage is an extra message when PR is merged to master branch, 295 // it must not end in a period. 
296 MergeToMasterMessage string 297 } 298 299 func init() { 300 clock := utilclock.RealClock{} 301 prometheus.MustRegister(sqPromMetrics.Blocked) 302 prometheus.MustRegister(sqPromMetrics.OpenPRs) 303 prometheus.MustRegister(sqPromMetrics.QueuedPRs) 304 prometheus.MustRegister(sqPromMetrics.MergeCount) 305 prometheus.MustRegister(sqPromMetrics.LastMergeTime) 306 sq := &SubmitQueue{ 307 clock: clock, 308 startTime: clock.Now(), 309 lastMergeTime: clock.Now(), 310 lastE2EStable: true, 311 prStatus: map[string]submitStatus{}, 312 githubE2EQueue: map[int]*github.MungeObject{}, 313 } 314 RegisterMungerOrDie(sq) 315 RegisterStaleIssueComments(sq) 316 } 317 318 // Name is the name usable in --pr-mungers 319 func (sq *SubmitQueue) Name() string { return "submit-queue" } 320 321 // RequiredFeatures is a slice of 'features' that must be provided 322 func (sq *SubmitQueue) RequiredFeatures() []string { 323 return []string{features.BranchProtectionFeature, features.ServerFeatureName} 324 } 325 326 func (sq *SubmitQueue) emergencyMergeStop() bool { 327 return atomic.LoadInt32(&sq.emergencyMergeStopFlag) != 0 328 } 329 330 func (sq *SubmitQueue) setEmergencyMergeStop(stopMerges bool) { 331 if stopMerges { 332 atomic.StoreInt32(&sq.emergencyMergeStopFlag, 1) 333 } else { 334 atomic.StoreInt32(&sq.emergencyMergeStopFlag, 0) 335 } 336 } 337 338 // EmergencyStopHTTP sets the emergency stop flag. It expects the path of 339 // req.URL to contain either "emergency/stop", "emergency/resume", or "emergency/status". 
340 func (sq *SubmitQueue) EmergencyStopHTTP(res http.ResponseWriter, req *http.Request) { 341 switch { 342 case strings.Contains(req.URL.Path, "emergency/stop"): 343 sq.setEmergencyMergeStop(true) 344 case strings.Contains(req.URL.Path, "emergency/resume"): 345 sq.setEmergencyMergeStop(false) 346 case strings.Contains(req.URL.Path, "emergency/status"): 347 default: 348 http.NotFound(res, req) 349 return 350 } 351 sq.serve(sq.marshal(struct{ EmergencyInProgress bool }{sq.emergencyMergeStop()}), res, req) 352 } 353 354 func round(num float64) int { 355 return int(num + math.Copysign(0.5, num)) 356 } 357 358 func toFixed(num float64) float64 { 359 output := math.Pow(10, float64(3)) 360 return float64(round(num*output)) / output 361 } 362 363 // This is the calculation of the exponential smoothing factor. It tries to 364 // make sure that if we get lots of fast merges we don't race the 'daily' 365 // avg really high really fast. But more importantly it means that if merges 366 // start going slowly the 'daily' average will get pulled down a lot by one 367 // slow merge instead of requiring numerous merges to get pulled down 368 func getSmoothFactor(dur time.Duration) float64 { 369 hours := dur.Hours() 370 smooth := .155*math.Log(hours) + .422 371 if smooth < .1 { 372 return .1 373 } 374 if smooth > .999 { 375 return .999 376 } 377 return smooth 378 } 379 380 // This calculates an exponentially smoothed merge Rate based on the formula 381 // newRate = (1-smooth)oldRate + smooth*newRate 382 // Which is really great and simple for constant time series data. But of course 383 // ours isn't time series data, so I vary the smoothing factor based on how long 384 // it has been since the last entry. See the comments on the `getSmoothFactor` for 385 // a discussion of why. 386 // This whole thing was dreamed up by eparis one weekend via a combination 387 // of guess-and-test and intuition. Someone who knows about this stuff 388 // is likely to laugh at the naivete. 
Point him to where someone intelligent 389 // has thought about this stuff and he will gladly do something smart. 390 // Merges that took less than 5 minutes are ignored completely for the rate 391 // calculation. 392 func calcMergeRate(oldRate float64, last, now time.Time) float64 { 393 since := now.Sub(last) 394 if since <= 5*time.Minute { 395 // retest-not-required PR merges shouldn't affect our best 396 // guess about the rate. 397 return oldRate 398 } 399 var rate float64 400 if since == 0 { 401 rate = 96 402 } else { 403 rate = 24.0 * time.Hour.Hours() / since.Hours() 404 } 405 smoothingFactor := getSmoothFactor(since) 406 mergeRate := ((1.0 - smoothingFactor) * oldRate) + (smoothingFactor * rate) 407 return toFixed(mergeRate) 408 } 409 410 // Updates a smoothed rate at which PRs are merging per day. 411 // Updates merge stats. Should be called once for every merge. 412 func (sq *SubmitQueue) updateMergeRate() { 413 now := sq.clock.Now() 414 sq.mergeRate = calcMergeRate(sq.mergeRate, sq.lastMergeTime, now) 415 416 // Update stats 417 sqPromMetrics.MergeCount.Inc() 418 atomic.AddInt32(&sq.totalMerges, 1) 419 sq.lastMergeTime = now 420 sqPromMetrics.LastMergeTime.Set(float64(sq.lastMergeTime.Unix())) 421 } 422 423 // This calculated the smoothed merge rate BUT it looks at the time since 424 // the last merge vs 'Now'. If we have not passed the next 'expected' time 425 // for a merge this just returns previous calculations. If 'Now' is later 426 // than we would expect given the existing mergeRate then pretend a merge 427 // happened right now and return the new merge rate. This way the merge rate 428 // is lower even if no merge has happened in a long time. 
429 func (sq *SubmitQueue) calcMergeRateWithTail() float64 { 430 now := sq.clock.Now() 431 432 if sq.mergeRate == 0 { 433 return 0 434 } 435 // Figure out when we think the next merge would happen given the history 436 next := time.Duration(24/sq.mergeRate*time.Hour.Hours()) * time.Hour 437 expectedMergeTime := sq.lastMergeTime.Add(next) 438 439 // If we aren't there yet, just return the history 440 if !now.After(expectedMergeTime) { 441 return sq.mergeRate 442 } 443 444 // Pretend as though a merge happened right now to pull down the rate 445 return calcMergeRate(sq.mergeRate, sq.lastMergeTime, now) 446 } 447 448 // Initialize will initialize the munger 449 func (sq *SubmitQueue) Initialize(config *github.Config, features *features.Features) error { 450 sq.features = features 451 return sq.internalInitialize(config, features, "") 452 } 453 454 // internalInitialize will initialize the munger. 455 // if overrideURL is specified, will create testUtils 456 func (sq *SubmitQueue) internalInitialize(config *github.Config, features *features.Features, overrideURL string) error { 457 sq.Lock() 458 defer sq.Unlock() 459 460 // initialize to invalid pr number 461 sq.githubE2ELastPRNum = -1 462 463 sq.Metadata.ChartURL = sq.Metadata.chartURL 464 sq.Metadata.ProwURL = sq.ProwURL 465 sq.Metadata.RepoPullURL = fmt.Sprintf("https://github.com/%s/%s/pulls/", config.Org, config.Project) 466 sq.Metadata.ProjectName = strings.Title(config.Project) 467 sq.githubConfig = config 468 469 if sq.BatchEnabled && sq.ProwURL == "" { 470 return errors.New("batch merges require prow-url to be set") 471 } 472 473 // TODO: This is not how injection for tests should work. 
474 if sq.FakeE2E { 475 sq.e2e = &fake_e2e.FakeE2ETester{} 476 } else { 477 var gcs *utils.Utils 478 if overrideURL != "" { 479 gcs = utils.NewTestUtils("bucket", "logs", overrideURL) 480 } else { 481 gcs = utils.NewWithPresubmitDetection( 482 mungeopts.GCS.BucketName, mungeopts.GCS.LogDir, 483 mungeopts.GCS.PullKey, mungeopts.GCS.PullLogDir, 484 ) 485 } 486 487 sq.e2e = (&e2e.RealE2ETester{ 488 Opts: sq.opts, 489 NonBlockingJobNames: &sq.NonBlockingJobNames, 490 BuildStatus: map[string]e2e.BuildInfo{}, 491 GoogleGCSBucketUtils: gcs, 492 }).Init(sharedmux.Admin) 493 } 494 495 sq.lgtmTimeCache = mungerutil.NewLabelTimeCache(lgtmLabel) 496 497 if features.Server.Enabled { 498 features.Server.Handle("/prs", gziphandler.GzipHandler(http.HandlerFunc(sq.servePRs))) 499 features.Server.Handle("/history", gziphandler.GzipHandler(http.HandlerFunc(sq.serveHistory))) 500 features.Server.Handle("/github-e2e-queue", gziphandler.GzipHandler(http.HandlerFunc(sq.serveGithubE2EStatus))) 501 features.Server.Handle("/merge-info", gziphandler.GzipHandler(http.HandlerFunc(sq.serveMergeInfo))) 502 features.Server.Handle("/priority-info", gziphandler.GzipHandler(http.HandlerFunc(sq.servePriorityInfo))) 503 features.Server.Handle("/health", gziphandler.GzipHandler(http.HandlerFunc(sq.serveHealth))) 504 features.Server.Handle("/health.svg", gziphandler.GzipHandler(http.HandlerFunc(sq.serveHealthSVG))) 505 features.Server.Handle("/sq-stats", gziphandler.GzipHandler(http.HandlerFunc(sq.serveSQStats))) 506 features.Server.Handle("/flakes", gziphandler.GzipHandler(http.HandlerFunc(sq.serveFlakes))) 507 features.Server.Handle("/metadata", gziphandler.GzipHandler(http.HandlerFunc(sq.serveMetadata))) 508 if sq.BatchEnabled { 509 features.Server.Handle("/batch", gziphandler.GzipHandler(http.HandlerFunc(sq.serveBatch))) 510 } 511 // this endpoint is useless without access to prow 512 if sq.ProwURL != "" { 513 features.Server.Handle("/ci-status", 
gziphandler.GzipHandler(http.HandlerFunc(sq.serveCIStatus))) 514 } 515 } 516 517 sharedmux.Admin.HandleFunc("/api/emergency/stop", sq.EmergencyStopHTTP) 518 sharedmux.Admin.HandleFunc("/api/emergency/resume", sq.EmergencyStopHTTP) 519 sharedmux.Admin.HandleFunc("/api/emergency/status", sq.EmergencyStopHTTP) 520 521 if sq.githubE2EPollTime == 0 { 522 sq.githubE2EPollTime = githubE2EPollTime 523 } 524 525 sq.healthHistory = make([]healthRecord, 0) 526 527 go sq.handleGithubE2EAndMerge() 528 go sq.updateGoogleE2ELoop() 529 if sq.BatchEnabled { 530 go sq.handleGithubE2EBatchMerge() 531 } 532 if sq.ProwURL != "" { 533 go sq.monitorProw() 534 } 535 536 if sq.AdminPort != 0 { 537 go http.ListenAndServe(fmt.Sprintf("0.0.0.0:%v", sq.AdminPort), sharedmux.Admin) 538 } 539 return nil 540 } 541 542 // EachLoop is called at the start of every munge loop 543 func (sq *SubmitQueue) EachLoop() error { 544 issues := []*githubapi.Issue{} 545 if !sq.lastClosedTime.IsZero() { 546 listOpts := &githubapi.IssueListByRepoOptions{ 547 State: "closed", 548 Since: sq.lastClosedTime, 549 } 550 var err error 551 issues, err = sq.githubConfig.ListAllIssues(listOpts) 552 if err != nil { 553 return err 554 } 555 } else { 556 sq.lastClosedTime = time.Now() 557 } 558 559 sq.Lock() 560 for _, issue := range issues { 561 if issue.ClosedAt != nil && issue.ClosedAt.After(sq.lastClosedTime) { 562 sq.lastClosedTime = *issue.ClosedAt 563 } 564 delete(sq.prStatus, strconv.Itoa(*issue.Number)) 565 } 566 567 sq.updateHealth() 568 sqPromMetrics.OpenPRs.Set(float64(len(sq.prStatus))) 569 sqPromMetrics.QueuedPRs.Set(float64(len(sq.githubE2EQueue))) 570 571 objs := []*github.MungeObject{} 572 for _, obj := range sq.githubE2EQueue { 573 objs = append(objs, obj) 574 } 575 sq.Unlock() 576 577 for _, obj := range objs { 578 obj.Refresh() 579 // This should recheck it and clean up the queue, we don't care about the result 580 _ = sq.validForMerge(obj) 581 } 582 atomic.AddInt32(&sq.loopStarts, 1) 583 return nil 584 } 
585 586 // RegisterOptions registers options for this munger; returns any that require a restart when changed. 587 func (sq *SubmitQueue) RegisterOptions(opts *options.Options) sets.String { 588 sq.opts = opts 589 opts.RegisterStringSlice(&sq.NonBlockingJobNames, "nonblocking-jobs", []string{}, "Comma separated list of jobs that don't block merges, but will have status reported and issues filed.") 590 opts.RegisterStringSlice(&sq.AdditionalRequiredLabels, "additional-required-labels", []string{}, "Comma separated list of labels required for merging PRs on top of the existing required.") 591 opts.RegisterStringSlice(&sq.BlockingLabels, "blocking-labels", []string{}, "Comma separated list of labels required to miss from PRs in order to consider them mergeable.") 592 opts.RegisterBool(&sq.FakeE2E, "fake-e2e", false, "Whether to use a fake for testing E2E stability.") 593 opts.RegisterStringSlice(&sq.DoNotMergeMilestones, "do-not-merge-milestones", []string{}, "List of milestones which, when applied, will cause the PR to not be merged.") 594 opts.RegisterInt(&sq.AdminPort, "admin-port", 9999, "If non-zero, will serve administrative actions on this port.") 595 opts.RegisterString(&sq.Metadata.chartURL, "chart-url", "", "URL to access the submit-queue instance's health charts.") 596 opts.RegisterString(&sq.ProwURL, "prow-url", "", "Prow deployment base URL to read batch results and direct users to.") 597 opts.RegisterBool(&sq.BatchEnabled, "batch-enabled", false, "Do batch merges (requires prow/splice coordination).") 598 opts.RegisterString(&sq.ContextURL, "context-url", "", "URL where the submit queue is serving - used in Github status contexts.") 599 opts.RegisterBool(&sq.GateApproved, "gate-approved", false, "Gate on approved label.") 600 opts.RegisterBool(&sq.GateCLA, "gate-cla", false, "Gate on cla labels.") 601 opts.RegisterString(&sq.MergeToMasterMessage, "merge-to-master-message", "", "Extra message when PR is merged to master branch.") 602 
opts.RegisterBool(&sq.GateGHReviewApproved, "gh-review-approved", false, "Gate github review, approve") 603 opts.RegisterBool(&sq.GateGHReviewChangesRequested, "gh-review-changes-requested", false, "Gate github review, changes request") 604 opts.RegisterStringSlice(&sq.ClaYesLabels, "cla-yes-labels", []string{cncfClaYesLabel, claHumanLabel}, "Comma separated list of labels that would be counted as valid cla labels") 605 606 opts.RegisterUpdateCallback(func(changed sets.String) error { 607 if changed.HasAny("prow-url", "batch-enabled") { 608 if sq.BatchEnabled && sq.ProwURL == "" { 609 return fmt.Errorf("batch merges require prow-url to be set") 610 } 611 } 612 if changed.HasAny("gate-cla", "cla-yes-labels") { 613 if sq.GateCLA && len(sq.ClaYesLabels) == 0 { 614 return fmt.Errorf("gating cla require at least one cla yes label. Default are %s and %s", cncfClaYesLabel, claHumanLabel) 615 } 616 } 617 return nil 618 }) 619 620 return sets.NewString( 621 "batch-enabled", // Need to start or kill batch processing. 622 "context-url", // Need to remunge all PRs to update statuses with new url. 623 "admin-port", // Need to restart server on new port. 624 // For the following: need to restart fileserver. 625 "chart-url", 626 // For the following: need to re-initialize e2e which is used by other goroutines. 627 "fake-e2e", 628 "gcs-bucket", 629 "gcs-logs-dir", 630 "pull-logs-dir", 631 "pull-key", 632 // For the following: need to remunge all PRs if changed from true to false. 633 "gate-cla", 634 "gate-approved", 635 // Need to remunge all PRs if anything changes in the following sets. 636 "additional-required-labels", 637 "blocking-labels", 638 "cla-yes-labels", 639 "required-retest-contexts", 640 ) 641 } 642 643 // Hold the lock 644 func (sq *SubmitQueue) updateHealth() { 645 // Remove old entries from the front. 
646 for len(sq.healthHistory) > 0 && time.Since(sq.healthHistory[0].Time).Hours() > 24.0 { 647 sq.healthHistory = sq.healthHistory[1:] 648 } 649 // Make the current record 650 emergencyStop := sq.emergencyMergeStop() 651 newEntry := healthRecord{ 652 Time: time.Now(), 653 Overall: !emergencyStop, 654 Jobs: map[string]bool{}, 655 } 656 for job, status := range sq.e2e.GetBuildStatus() { 657 // Ignore flakes. 658 newEntry.Jobs[job] = status.Status != "Not Stable" 659 } 660 if emergencyStop { 661 // invent an "emergency stop" job that's failing. 662 newEntry.Jobs["Emergency Stop"] = false 663 } 664 sq.healthHistory = append(sq.healthHistory, newEntry) 665 // Now compute the health structure so we don't have to do it on page load 666 sq.health.TotalLoops = len(sq.healthHistory) 667 sq.health.NumStable = 0 668 sq.health.NumStablePerJob = map[string]int{} 669 sq.health.MergePossibleNow = !emergencyStop 670 if sq.health.MergePossibleNow { 671 sqPromMetrics.Blocked.Set(0) 672 } else { 673 sqPromMetrics.Blocked.Set(1) 674 } 675 for _, record := range sq.healthHistory { 676 if record.Overall { 677 sq.health.NumStable++ 678 } 679 for job, stable := range record.Jobs { 680 if _, ok := sq.health.NumStablePerJob[job]; !ok { 681 sq.health.NumStablePerJob[job] = 0 682 } 683 if stable { 684 sq.health.NumStablePerJob[job]++ 685 } 686 } 687 } 688 } 689 690 func (sq *SubmitQueue) monitorProw() { 691 nonBlockingJobNames := make(map[string]bool) 692 requireRetestJobNames := make(map[string]bool) 693 694 for { 695 sq.opts.Lock() 696 for _, jobName := range sq.NonBlockingJobNames { 697 nonBlockingJobNames[jobName] = true 698 } 699 for _, jobName := range mungeopts.RequiredContexts.Retest { 700 requireRetestJobNames[jobName] = true 701 } 702 url := sq.ProwURL + "/data.js" 703 704 currentPR := -1 705 if sq.githubE2ERunning != nil { 706 currentPR = *sq.githubE2ERunning.Issue.Number 707 } 708 sq.opts.Unlock() 709 710 lastPR := sq.githubE2ELastPRNum 711 // get current job info from prow 712 
allJobs, err := getJobs(url) 713 if err != nil { 714 glog.Errorf("Error reading batch jobs from Prow URL %v: %v", url, err) 715 time.Sleep(time.Minute) 716 continue 717 } 718 // TODO: copy these from sq first instead 719 ciStatus := make(map[string]map[string]jobStatus) 720 ciLatest := make(map[string]map[string]time.Time) 721 722 for _, job := range allJobs { 723 if job.Finished == "" || job.BuildID == "" { 724 continue 725 } 726 // type/category 727 key := job.Type + "/" 728 // the most recent submit-queue PR(s) 729 if job.Number == currentPR || job.Number == lastPR { 730 key += "single" 731 } else if nonBlockingJobNames[job.Job] { 732 key += "nonblocking" 733 } else if requireRetestJobNames[job.Job] { 734 key += "requiredretest" 735 } 736 737 ft, err := time.Parse(time.RFC3339Nano, job.Finished) 738 if err != nil { 739 glog.Errorf("Error parsing job finish time %s: %v", job.Finished, err) 740 continue 741 } 742 743 if _, ok := ciLatest[key]; !ok { 744 ciLatest[key] = make(map[string]time.Time) 745 ciStatus[key] = make(map[string]jobStatus) 746 } 747 latest, ok := ciLatest[key][job.Job] 748 749 // TODO: flake cache? 
750 if !ok || latest.Before(ft) { 751 ciLatest[key][job.Job] = ft 752 ciStatus[key][job.Job] = jobStatus{ 753 State: job.State, 754 BuildID: job.BuildID, 755 URL: job.URL, 756 } 757 } 758 } 759 760 sq.Lock() 761 sq.ciStatus = ciStatus 762 sq.Unlock() 763 764 time.Sleep(time.Minute) 765 } 766 } 767 768 func (sq *SubmitQueue) e2eStable(aboutToMerge bool) bool { 769 wentStable := false 770 wentUnstable := false 771 772 sq.e2e.LoadNonBlockingStatus() 773 stable := !sq.emergencyMergeStop() 774 775 sq.Lock() 776 last := sq.lastE2EStable 777 if last && !stable { 778 wentUnstable = true 779 } else if !last && stable { 780 wentStable = true 781 } 782 sq.lastE2EStable = stable 783 sq.Unlock() 784 785 reason := "" 786 avatar := "" 787 if wentStable { 788 reason = e2eRecover 789 avatar = "success.png" 790 } else if wentUnstable { 791 reason = e2eFailure 792 avatar = "error.png" 793 } 794 if reason != "" { 795 submitStatus := submitStatus{ 796 Time: sq.clock.Now(), 797 statusPullRequest: statusPullRequest{ 798 Title: reason, 799 AvatarURL: avatar, 800 }, 801 Reason: reason, 802 } 803 sq.Lock() 804 sq.statusHistory = append(sq.statusHistory, submitStatus) 805 sq.Unlock() 806 } 807 return stable 808 } 809 810 // This serves little purpose other than to show updates every minute in the 811 // web UI. Stable() will get called as needed against individual PRs as well. 
812 func (sq *SubmitQueue) updateGoogleE2ELoop() { 813 for { 814 _ = sq.e2eStable(false) 815 time.Sleep(1 * time.Minute) 816 } 817 } 818 819 func objToStatusPullRequest(obj *github.MungeObject) *statusPullRequest { 820 if obj == nil { 821 return &statusPullRequest{} 822 } 823 res := statusPullRequest{ 824 Number: *obj.Issue.Number, 825 URL: *obj.Issue.HTMLURL, 826 Title: *obj.Issue.Title, 827 Login: *obj.Issue.User.Login, 828 AvatarURL: *obj.Issue.User.AvatarURL, 829 } 830 pr, ok := obj.GetPR() 831 if !ok { 832 return &res 833 } 834 if pr.Additions != nil { 835 res.Additions = *pr.Additions 836 } 837 if pr.Deletions != nil { 838 res.Deletions = *pr.Deletions 839 } 840 if pr.Base != nil && pr.Base.Ref != nil { 841 res.BaseRef = *pr.Base.Ref 842 } 843 844 labelPriority := labelPriority(obj) 845 if labelPriority <= lastHighPriorityLabel { 846 res.ExtraInfo = append(res.ExtraInfo, labelPriorities[labelPriority]) 847 } 848 849 milestone, ok := obj.Annotations["milestone"] 850 if !ok { 851 milestone, _ = obj.ReleaseMilestone() 852 obj.Annotations["milestone"] = milestone 853 } 854 if milestone != "" { 855 res.ExtraInfo = append(res.ExtraInfo, milestone) 856 } 857 858 if labelPriority > lastHighPriorityLabel && labelPriority < len(labelPriorities) { 859 res.ExtraInfo = append(res.ExtraInfo, labelPriorities[labelPriority]) 860 } 861 862 return &res 863 } 864 865 func reasonToState(reason string) string { 866 switch reason { 867 case merged, mergedByHand, mergedSkippedRetest, mergedBatch: 868 return "success" 869 case e2eFailure, ghE2EQueued, ghE2EWaitingStart, ghE2ERunning: 870 return "success" 871 case unknown: 872 return "failure" 873 default: 874 return "pending" 875 } 876 } 877 878 // SetMergeStatus will set the status given a particular PR. 
This function should 879 // be used instead of manipulating the prStatus directly as sq.Lock() must be 880 // called when manipulating that structure 881 // `obj` is the active github object 882 // `reason` is the new 'status' for this object 883 func (sq *SubmitQueue) SetMergeStatus(obj *github.MungeObject, reason string) { 884 glog.V(4).Infof("SubmitQueue not merging %d because %q", *obj.Issue.Number, reason) 885 submitStatus := submitStatus{ 886 Time: sq.clock.Now(), 887 statusPullRequest: *objToStatusPullRequest(obj), 888 Reason: reason, 889 } 890 891 status, ok := obj.GetStatus(sqContext) 892 if !ok || status == nil || *status.Description != reason { 893 state := reasonToState(reason) 894 sq.opts.Lock() 895 contextURL := sq.ContextURL 896 sq.opts.Unlock() 897 url := fmt.Sprintf("%s/#/prs?prDisplay=%d&historyDisplay=%d", contextURL, *obj.Issue.Number, *obj.Issue.Number) 898 _ = obj.SetStatus(state, url, reason, sqContext) 899 } 900 901 sq.Lock() 902 defer sq.Unlock() 903 904 // If we are currently retesting E2E the normal munge loop might find 905 // that the ci tests are not green. That's normal and expected and we 906 // should just ignore that status update entirely. 907 if sq.githubE2ERunning != nil && *sq.githubE2ERunning.Issue.Number == *obj.Issue.Number && strings.HasPrefix(reason, ciFailure) { 908 return 909 } 910 911 if sq.onQueue(obj) { 912 sq.statusHistory = append(sq.statusHistory, submitStatus) 913 if len(sq.statusHistory) > 128 { 914 sq.statusHistory = sq.statusHistory[1:] 915 } 916 } 917 sq.prStatus[strconv.Itoa(*obj.Issue.Number)] = submitStatus 918 sq.cleanupOldE2E(obj, reason) 919 } 920 921 // setContextFailedStatus calls SetMergeStatus after determining a particular github status 922 // which is failed. 
923 func (sq *SubmitQueue) setContextFailedStatus(obj *github.MungeObject, contexts []string) { 924 for i, context := range contexts { 925 contextSlice := contexts[i : i+1] 926 success, ok := obj.IsStatusSuccess(contextSlice) 927 if ok && success { 928 continue 929 } 930 failMsg := fmt.Sprintf(ciFailureFmt, context) 931 sq.SetMergeStatus(obj, failMsg) 932 return 933 } 934 glog.Errorf("Inside setContextFailedStatus() but none of the status's failed! %d: %v", obj.Number(), contexts) 935 sq.SetMergeStatus(obj, ciFailure) 936 } 937 938 // sq.Lock() MUST be held! 939 func (sq *SubmitQueue) getE2EQueueStatus() []*statusPullRequest { 940 queue := []*statusPullRequest{} 941 keys := sq.orderedE2EQueue() 942 for _, k := range keys { 943 obj := sq.githubE2EQueue[k] 944 request := objToStatusPullRequest(obj) 945 queue = append(queue, request) 946 } 947 return queue 948 } 949 950 func (sq *SubmitQueue) marshal(data interface{}) []byte { 951 b, err := json.Marshal(data) 952 if err != nil { 953 glog.Errorf("Unable to Marshal data: %#v: %v", data, err) 954 return nil 955 } 956 return b 957 } 958 959 func (sq *SubmitQueue) getQueueHistory() []byte { 960 sq.Lock() 961 defer sq.Unlock() 962 return sq.marshal(sq.statusHistory) 963 } 964 965 // GetQueueStatus returns a json representation of the state of the submit 966 // queue. This can be used to generate web pages about the submit queue. 
967 func (sq *SubmitQueue) getQueueStatus() []byte { 968 status := submitQueueStatus{PRStatus: map[string]submitStatus{}} 969 sq.Lock() 970 defer sq.Unlock() 971 972 for key, value := range sq.prStatus { 973 status.PRStatus[key] = value 974 } 975 return sq.marshal(status) 976 } 977 978 func (sq *SubmitQueue) getGithubE2EStatus() []byte { 979 sq.Lock() 980 defer sq.Unlock() 981 status := e2eQueueStatus{ 982 E2EQueue: sq.getE2EQueueStatus(), 983 E2ERunning: objToStatusPullRequest(sq.githubE2ERunning), 984 BatchStatus: &sq.batchStatus, 985 } 986 return sq.marshal(status) 987 } 988 989 func noMergeMessage(label string) string { 990 return "Will not auto merge because " + label + " is present" 991 } 992 993 func noAdditionalLabelMessage(label string) string { 994 return "Will not auto merge because " + label + " is missing" 995 } 996 997 const ( 998 unknown = "unknown failure" 999 noCLA = "PR is missing CLA label; needs one from the following list:" 1000 noLGTM = "PR does not have " + lgtmLabel + " label." 1001 noApproved = "PR does not have " + approvedLabel + " label." 1002 lgtmEarly = "The PR was changed after the " + lgtmLabel + " label was added." 1003 unmergeable = "PR is unable to be automatically merged. Needs rebase." 1004 undeterminedMergability = "Unable to determine is PR is mergeable. Will try again later." 1005 ciFailure = "Required Github CI test is not green" 1006 ciFailureFmt = ciFailure + ": %s" 1007 e2eFailure = "The e2e tests are failing. The entire submit queue is blocked." 1008 e2eRecover = "The e2e tests started passing. The submit queue is unblocked." 1009 merged = "MERGED!" 1010 mergedSkippedRetest = "MERGED! (skipped retest because of label)" 1011 mergedBatch = "MERGED! (batch)" 1012 mergedByHand = "MERGED! (by hand outside of submit queue)" 1013 ghE2EQueued = "Queued to run github e2e tests a second time." 1014 ghE2EWaitingStart = "Requested and waiting for github e2e test to start running a second time." 
1015 ghE2ERunning = "Running github e2e tests a second time." 1016 ghE2EFailed = "Second github e2e run failed." 1017 unmergeableMilestone = "Milestone is for a future release and cannot be merged" 1018 headCommitChanged = "This PR has changed since we ran the tests" 1019 ghReviewStateUnclear = "Cannot get gh reviews status" 1020 ghReviewApproved = "This pr has no Github review \"approved\"." 1021 ghReviewChangesRequested = "Reviewer(s) requested changes through github review process." 1022 ) 1023 1024 // validForMergeExt is the base logic about what PR can be automatically merged. 1025 // PRs must pass this logic to be placed on the queue and they must pass this 1026 // logic a second time to be retested/merged after they get to the top of 1027 // the queue. 1028 // 1029 // checkStatus is true if the PR should only merge if the appropriate Github status 1030 // checks are passing. 1031 // 1032 // If you update the logic PLEASE PLEASE PLEASE update serveMergeInfo() as well. 1033 func (sq *SubmitQueue) validForMergeExt(obj *github.MungeObject, checkStatus bool) bool { 1034 // Can't merge an issue! 1035 if !obj.IsPR() { 1036 return false 1037 } 1038 1039 // Can't merge something already merged. 1040 if m, ok := obj.IsMerged(); !ok { 1041 glog.Errorf("%d: unknown err", *obj.Issue.Number) 1042 sq.SetMergeStatus(obj, unknown) 1043 return false 1044 } else if m { 1045 sq.SetMergeStatus(obj, mergedByHand) 1046 return false 1047 } 1048 1049 // Lock to get options since we may be running on a goroutine besides the main one. 
1050 sq.opts.Lock() 1051 gateCLA := sq.GateCLA 1052 gateApproved := sq.GateApproved 1053 doNotMergeMilestones := sq.DoNotMergeMilestones 1054 mergeContexts := mungeopts.RequiredContexts.Merge 1055 retestContexts := mungeopts.RequiredContexts.Retest 1056 additionalLabels := sq.AdditionalRequiredLabels 1057 blockingLabels := sq.BlockingLabels 1058 claYesLabels := sq.ClaYesLabels 1059 sq.opts.Unlock() 1060 1061 milestone := obj.Issue.Milestone 1062 title := "" 1063 // Net set means the empty milestone, "" 1064 if milestone != nil && milestone.Title != nil { 1065 title = *milestone.Title 1066 } 1067 for _, blocked := range doNotMergeMilestones { 1068 if title == blocked || (title == "" && blocked == "NO-MILESTONE") { 1069 sq.SetMergeStatus(obj, unmergeableMilestone) 1070 return false 1071 } 1072 } 1073 1074 // Must pass CLA checks 1075 if gateCLA { 1076 for i, l := range claYesLabels { 1077 if obj.HasLabel(l) { 1078 break 1079 } 1080 if i == len(claYesLabels)-1 { 1081 sq.SetMergeStatus(obj, fmt.Sprintf("%s %q", noCLA, claYesLabels)) 1082 return false 1083 } 1084 } 1085 } 1086 1087 // Obviously must be mergeable 1088 if mergeable, ok := obj.IsMergeable(); !ok { 1089 sq.SetMergeStatus(obj, undeterminedMergability) 1090 return false 1091 } else if !mergeable { 1092 sq.SetMergeStatus(obj, unmergeable) 1093 return false 1094 } 1095 1096 // Validate the status information for this PR 1097 if checkStatus { 1098 if len(mergeContexts) > 0 { 1099 if success, ok := obj.IsStatusSuccess(mergeContexts); !ok || !success { 1100 sq.setContextFailedStatus(obj, mergeContexts) 1101 return false 1102 } 1103 } 1104 if len(retestContexts) > 0 { 1105 if success, ok := obj.IsStatusSuccess(retestContexts); !ok || !success { 1106 sq.setContextFailedStatus(obj, retestContexts) 1107 return false 1108 } 1109 } 1110 } 1111 1112 if sq.GateGHReviewApproved || sq.GateGHReviewChangesRequested { 1113 if approvedReview, changesRequestedReview, ok := obj.CollectGHReviewStatus(); !ok { 1114 
sq.SetMergeStatus(obj, ghReviewStateUnclear) 1115 return false 1116 } else if len(approvedReview) == 0 && sq.GateGHReviewApproved { 1117 sq.SetMergeStatus(obj, ghReviewApproved) 1118 return false 1119 } else if len(changesRequestedReview) > 0 && sq.GateGHReviewChangesRequested { 1120 sq.SetMergeStatus(obj, ghReviewChangesRequested) 1121 return false 1122 } 1123 } 1124 1125 if !obj.HasLabel(lgtmLabel) { 1126 sq.SetMergeStatus(obj, noLGTM) 1127 return false 1128 } 1129 1130 // PR cannot change since LGTM was added 1131 if after, ok := obj.ModifiedAfterLabeled(lgtmLabel); !ok { 1132 sq.SetMergeStatus(obj, unknown) 1133 return false 1134 } else if after { 1135 sq.SetMergeStatus(obj, lgtmEarly) 1136 return false 1137 } 1138 1139 if gateApproved { 1140 if !obj.HasLabel(approvedLabel) { 1141 sq.SetMergeStatus(obj, noApproved) 1142 return false 1143 } 1144 } 1145 1146 // PR cannot have any labels which prevent merging. 1147 for _, label := range []string{ 1148 cherrypickUnapprovedLabel, 1149 blockedPathsLabel, 1150 releaseNoteLabelNeeded, 1151 doNotMergeLabel, 1152 wipLabel, 1153 holdLabel, 1154 } { 1155 if obj.HasLabel(label) { 1156 sq.SetMergeStatus(obj, noMergeMessage(label)) 1157 return false 1158 } 1159 } 1160 1161 for _, label := range additionalLabels { 1162 if !obj.HasLabel(label) { 1163 sq.SetMergeStatus(obj, noAdditionalLabelMessage(label)) 1164 return false 1165 } 1166 } 1167 1168 for _, label := range blockingLabels { 1169 if obj.HasLabel(label) { 1170 sq.SetMergeStatus(obj, noMergeMessage(label)) 1171 return false 1172 } 1173 } 1174 1175 return true 1176 } 1177 1178 func (sq *SubmitQueue) validForMerge(obj *github.MungeObject) bool { 1179 return sq.validForMergeExt(obj, true) 1180 } 1181 1182 // Munge is the workhorse the will actually make updates to the PR 1183 func (sq *SubmitQueue) Munge(obj *github.MungeObject) { 1184 if !sq.validForMerge(obj) { 1185 return 1186 } 1187 1188 added := false 1189 sq.Lock() 1190 if _, ok := 
sq.githubE2EQueue[*obj.Issue.Number]; !ok { 1191 atomic.AddInt32(&sq.prsAdded, 1) 1192 added = true 1193 } 1194 // Add this most-recent object in place of the existing object. It will 1195 // have more up2date information. Even though we explicitly refresh the 1196 // PR information before do anything with it, this allow things like the 1197 // queue order to change dynamically as labels are added/removed. 1198 sq.githubE2EQueue[*obj.Issue.Number] = obj 1199 sq.Unlock() 1200 if added { 1201 sq.SetMergeStatus(obj, ghE2EQueued) 1202 } 1203 1204 return 1205 } 1206 1207 func (sq *SubmitQueue) deleteQueueItem(obj *github.MungeObject) { 1208 if sq.onQueue(obj) { 1209 atomic.AddInt32(&sq.prsRemoved, 1) 1210 } 1211 delete(sq.githubE2EQueue, *obj.Issue.Number) 1212 } 1213 1214 // If the PR was put in the github e2e queue previously, but now we don't 1215 // think it should be in the e2e queue, remove it. MUST be called with sq.Lock() 1216 // held. 1217 func (sq *SubmitQueue) cleanupOldE2E(obj *github.MungeObject, reason string) { 1218 switch { 1219 case reason == e2eFailure: 1220 case reason == ghE2EQueued: 1221 case reason == ghE2EWaitingStart: 1222 case reason == ghE2ERunning: 1223 // Do nothing 1224 case strings.HasPrefix(reason, ciFailure): 1225 // ciFailure is intersting. If the PR is being actively retested and then the 1226 // time based loop finds the same PR it will try to set ciFailure. We should in fact 1227 // not ever call this function in this case, but if we do call here, log it. 
1228 if sq.githubE2ERunning != nil && *sq.githubE2ERunning.Issue.Number == *obj.Issue.Number { 1229 glog.Errorf("Trying to clean up %d due to ciFailure while it is being tested", *obj.Issue.Number) 1230 return 1231 } 1232 fallthrough 1233 default: 1234 if sq.githubE2ERunning != nil && *sq.githubE2ERunning.Issue.Number == *obj.Issue.Number { 1235 sq.githubE2ERunning = nil 1236 } 1237 sq.deleteQueueItem(obj) 1238 } 1239 1240 } 1241 1242 func labelPriority(obj *github.MungeObject) int { 1243 for i, label := range labelPriorities { 1244 if obj.HasLabel(label) { 1245 return i 1246 } 1247 } 1248 return len(labelPriorities) 1249 } 1250 1251 func compareHighPriorityLabels(a *github.MungeObject, b *github.MungeObject) int { 1252 aPrio := labelPriority(a) 1253 bPrio := labelPriority(b) 1254 1255 if aPrio > lastHighPriorityLabel && bPrio > lastHighPriorityLabel { 1256 return 0 1257 } 1258 return aPrio - bPrio 1259 } 1260 1261 func compareLowPriorityLabels(a *github.MungeObject, b *github.MungeObject) int { 1262 aPrio := labelPriority(a) 1263 bPrio := labelPriority(b) 1264 1265 return aPrio - bPrio 1266 } 1267 1268 type queueSorter struct { 1269 queue []*github.MungeObject 1270 labelTimeCache *mungerutil.LabelTimeCache 1271 } 1272 1273 func (s queueSorter) Len() int { return len(s.queue) } 1274 func (s queueSorter) Swap(i, j int) { s.queue[i], s.queue[j] = s.queue[j], s.queue[i] } 1275 1276 // If you update the function PLEASE PLEASE PLEASE also update servePriorityInfo() 1277 func (s queueSorter) Less(i, j int) bool { 1278 a := s.queue[i] 1279 b := s.queue[j] 1280 1281 if c := compareHighPriorityLabels(a, b); c < 0 { 1282 return true 1283 } else if c > 0 { 1284 return false 1285 } 1286 1287 aDue, _ := a.ReleaseMilestoneDue() 1288 bDue, _ := b.ReleaseMilestoneDue() 1289 1290 if aDue.Before(bDue) { 1291 return true 1292 } else if aDue.After(bDue) { 1293 return false 1294 } 1295 1296 if c := compareLowPriorityLabels(a, b); c < 0 { 1297 return true 1298 } else if c > 0 { 1299 
return false 1300 } 1301 1302 aTime, aOK := s.labelTimeCache.FirstLabelTime(a) 1303 bTime, bOK := s.labelTimeCache.FirstLabelTime(b) 1304 1305 // Shouldn't really happen since these have been LGTMed to be 1306 // in the queue at all. But just in case, . 1307 if !aOK && bOK { 1308 return false 1309 } else if aOK && !bOK { 1310 return true 1311 } else if !aOK && !bOK { 1312 return false 1313 } 1314 1315 return aTime.Before(bTime) 1316 } 1317 1318 // onQueue just tells if a PR is already on the queue. 1319 // sq.Lock() must be held 1320 func (sq *SubmitQueue) onQueue(obj *github.MungeObject) bool { 1321 for _, queueObj := range sq.githubE2EQueue { 1322 if *queueObj.Issue.Number == *obj.Issue.Number { 1323 return true 1324 } 1325 1326 } 1327 return false 1328 } 1329 1330 // sq.Lock() better held!!! 1331 func (sq *SubmitQueue) orderedE2EQueue() []int { 1332 prs := []*github.MungeObject{} 1333 for _, obj := range sq.githubE2EQueue { 1334 prs = append(prs, obj) 1335 } 1336 sort.Sort(queueSorter{prs, sq.lgtmTimeCache}) 1337 1338 var ordered []int 1339 for _, obj := range prs { 1340 ordered = append(ordered, *obj.Issue.Number) 1341 } 1342 return ordered 1343 } 1344 1345 // handleGithubE2EAndMerge waits for PRs that are ready to re-run the github 1346 // e2e tests, runs the test, and then merges if everything was successful. 
1347 func (sq *SubmitQueue) handleGithubE2EAndMerge() { 1348 for { 1349 sq.Lock() 1350 l := len(sq.githubE2EQueue) 1351 sq.Unlock() 1352 // Wait until something is ready to be processed 1353 if l == 0 { 1354 time.Sleep(sq.githubE2EPollTime) 1355 continue 1356 } 1357 1358 obj := sq.selectPullRequest() 1359 if obj == nil { 1360 continue 1361 } 1362 1363 // only critical fixes can be merged if postsubmits are failing 1364 if !sq.e2eStable(false) && !obj.HasLabel(criticalFixLabel) { 1365 time.Sleep(sq.githubE2EPollTime) 1366 continue 1367 } 1368 1369 // re-test and maybe merge 1370 remove := sq.doGithubE2EAndMerge(obj) 1371 if remove { 1372 // remove it from the map after we finish testing 1373 sq.Lock() 1374 if sq.githubE2ERunning != nil { 1375 sq.githubE2ELastPRNum = *sq.githubE2ERunning.Issue.Number 1376 } 1377 sq.githubE2ERunning = nil 1378 sq.deleteQueueItem(obj) 1379 sq.Unlock() 1380 } 1381 } 1382 } 1383 1384 func (sq *SubmitQueue) mergePullRequest(obj *github.MungeObject, msg, extra string) bool { 1385 isMaster, _ := obj.IsForBranch("master") 1386 if isMaster { 1387 sq.opts.Lock() 1388 if sq.MergeToMasterMessage != "" { 1389 extra = extra + ". 
" + sq.MergeToMasterMessage 1390 } 1391 sq.opts.Unlock() 1392 } 1393 ok := obj.MergePR("submit-queue" + extra) 1394 if !ok { 1395 return ok 1396 } 1397 sq.SetMergeStatus(obj, msg) 1398 sq.updateMergeRate() 1399 return true 1400 } 1401 1402 func (sq *SubmitQueue) selectPullRequest() *github.MungeObject { 1403 if sq.interruptedObj != nil { 1404 return sq.interruptedObj.obj 1405 } 1406 sq.Lock() 1407 defer sq.Unlock() 1408 if len(sq.githubE2EQueue) == 0 { 1409 return nil 1410 } 1411 keys := sq.orderedE2EQueue() 1412 obj := sq.githubE2EQueue[keys[0]] 1413 if sq.githubE2ERunning != nil { 1414 sq.githubE2ELastPRNum = *sq.githubE2ERunning.Issue.Number 1415 } 1416 sq.githubE2ERunning = obj 1417 1418 return obj 1419 } 1420 1421 func (interruptedObj *submitQueueInterruptedObject) hasSHAChanged() bool { 1422 headSHA, baseRef, gotHeadSHA := interruptedObj.obj.GetHeadAndBase() 1423 if !gotHeadSHA { 1424 return true 1425 } 1426 1427 baseSHA, gotBaseSHA := interruptedObj.obj.GetSHAFromRef(baseRef) 1428 if !gotBaseSHA { 1429 return true 1430 } 1431 1432 return interruptedObj.interruptedMergeBaseSHA != baseSHA || 1433 interruptedObj.interruptedMergeHeadSHA != headSHA 1434 } 1435 1436 func newInterruptedObject(obj *github.MungeObject) *submitQueueInterruptedObject { 1437 if headSHA, baseRef, gotHeadSHA := obj.GetHeadAndBase(); !gotHeadSHA { 1438 return nil 1439 } else if baseSHA, gotBaseSHA := obj.GetSHAFromRef(baseRef); !gotBaseSHA { 1440 return nil 1441 } else { 1442 return &submitQueueInterruptedObject{obj, headSHA, baseSHA} 1443 } 1444 } 1445 1446 // Returns true if we can discard the PR from the queue, false if we must keep it for later. 1447 // If you modify this, consider modifying doBatchMerge too. 
1448 func (sq *SubmitQueue) doGithubE2EAndMerge(obj *github.MungeObject) bool { 1449 interruptedObj := sq.interruptedObj 1450 sq.interruptedObj = nil 1451 1452 ok := obj.Refresh() 1453 if !ok { 1454 glog.Errorf("%d: unknown err", *obj.Issue.Number) 1455 sq.SetMergeStatus(obj, unknown) 1456 return true 1457 } 1458 1459 if !sq.validForMerge(obj) { 1460 return true 1461 } 1462 1463 if obj.HasLabel(retestNotRequiredLabel) || obj.HasLabel(retestNotRequiredDocsOnlyLabel) { 1464 atomic.AddInt32(&sq.instantMerges, 1) 1465 sq.mergePullRequest(obj, mergedSkippedRetest, "") 1466 return true 1467 } 1468 1469 sha, _, ok := obj.GetHeadAndBase() 1470 if !ok { 1471 glog.Errorf("%d: Unable to get SHA", *obj.Issue.Number) 1472 sq.SetMergeStatus(obj, unknown) 1473 return true 1474 } 1475 if interruptedObj != nil { 1476 if interruptedObj.hasSHAChanged() { 1477 // This PR will have to be rested. 1478 // Make sure we don't have higher priority first. 1479 return false 1480 } 1481 glog.Infof("Skipping retest since head and base sha match previous attempt!") 1482 atomic.AddInt32(&sq.retestsAvoided, 1) 1483 } else { 1484 if sq.retestPR(obj) { 1485 return true 1486 } 1487 1488 ok := obj.Refresh() 1489 if !ok { 1490 sq.SetMergeStatus(obj, unknown) 1491 return true 1492 } 1493 } 1494 1495 sq.mergeLock.Lock() 1496 defer sq.mergeLock.Unlock() 1497 1498 // We shouldn't merge if it's not valid anymore 1499 if !sq.validForMerge(obj) { 1500 glog.Errorf("%d: Not mergeable anymore. Do not merge.", *obj.Issue.Number) 1501 return true 1502 } 1503 1504 if newSha, _, ok := obj.GetHeadAndBase(); !ok { 1505 glog.Errorf("%d: Unable to get SHA", *obj.Issue.Number) 1506 sq.SetMergeStatus(obj, unknown) 1507 return true 1508 } else if newSha != sha { 1509 glog.Errorf("%d: Changed while running the test. 
Do not merge.", *obj.Issue.Number) 1510 sq.SetMergeStatus(obj, headCommitChanged) 1511 return false 1512 } 1513 1514 if !sq.e2eStable(true) && !obj.HasLabel(criticalFixLabel) { 1515 if sq.validForMerge(obj) { 1516 sq.interruptedObj = newInterruptedObject(obj) 1517 } 1518 sq.SetMergeStatus(obj, e2eFailure) 1519 return true 1520 } 1521 1522 sq.mergePullRequest(obj, merged, "") 1523 return true 1524 } 1525 1526 // Returns true if merge status changes, and false otherwise. 1527 func (sq *SubmitQueue) retestPR(obj *github.MungeObject) bool { 1528 sq.opts.Lock() 1529 retestContexts := mungeopts.RequiredContexts.Retest 1530 sq.opts.Unlock() 1531 1532 if len(retestContexts) == 0 { 1533 return false 1534 } 1535 1536 if err := obj.WriteComment(newRetestBody); err != nil { 1537 glog.Errorf("%d: unknown err: %v", *obj.Issue.Number, err) 1538 sq.SetMergeStatus(obj, unknown) 1539 return true 1540 } 1541 1542 // Wait for the retest to start 1543 sq.SetMergeStatus(obj, ghE2EWaitingStart) 1544 atomic.AddInt32(&sq.prsTested, 1) 1545 sq.opts.Lock() 1546 prMaxWaitTime := mungeopts.PRMaxWaitTime 1547 sq.opts.Unlock() 1548 done := obj.WaitForPending(retestContexts, prMaxWaitTime) 1549 if !done { 1550 sq.SetMergeStatus(obj, fmt.Sprintf("Timed out waiting for PR %d to start testing", obj.Number())) 1551 return true 1552 } 1553 1554 // Wait for the status to go back to something other than pending 1555 sq.SetMergeStatus(obj, ghE2ERunning) 1556 done = obj.WaitForNotPending(retestContexts, prMaxWaitTime) 1557 if !done { 1558 sq.SetMergeStatus(obj, fmt.Sprintf("Timed out waiting for PR %d to finish testing", obj.Number())) 1559 return true 1560 } 1561 1562 // Check if the thing we care about is success 1563 if success, ok := obj.IsStatusSuccess(retestContexts); !success || !ok { 1564 sq.SetMergeStatus(obj, ghE2EFailed) 1565 return true 1566 } 1567 1568 // no action taken. 
1569 return false 1570 } 1571 1572 func (sq *SubmitQueue) serve(data []byte, res http.ResponseWriter, req *http.Request) { 1573 if data == nil { 1574 res.Header().Set("Content-type", "text/plain") 1575 res.WriteHeader(http.StatusInternalServerError) 1576 } else { 1577 res.Header().Set("Content-type", "application/json") 1578 res.WriteHeader(http.StatusOK) 1579 res.Write(data) 1580 } 1581 } 1582 1583 func (sq *SubmitQueue) serveHistory(res http.ResponseWriter, req *http.Request) { 1584 data := sq.getQueueHistory() 1585 sq.serve(data, res, req) 1586 } 1587 1588 func (sq *SubmitQueue) servePRs(res http.ResponseWriter, req *http.Request) { 1589 data := sq.getQueueStatus() 1590 sq.serve(data, res, req) 1591 } 1592 1593 func (sq *SubmitQueue) serveGithubE2EStatus(res http.ResponseWriter, req *http.Request) { 1594 data := sq.getGithubE2EStatus() 1595 sq.serve(data, res, req) 1596 } 1597 1598 func (sq *SubmitQueue) serveCIStatus(res http.ResponseWriter, req *http.Request) { 1599 sq.Lock() 1600 data := sq.marshal(sq.ciStatus) 1601 sq.Unlock() 1602 sq.serve(data, res, req) 1603 } 1604 1605 func (sq *SubmitQueue) serveHealth(res http.ResponseWriter, req *http.Request) { 1606 sq.Lock() 1607 data := sq.marshal(sq.health) 1608 sq.Unlock() 1609 sq.serve(data, res, req) 1610 } 1611 1612 func (sq *SubmitQueue) serveSQStats(res http.ResponseWriter, req *http.Request) { 1613 data := submitQueueStats{ 1614 Added: int(atomic.LoadInt32(&sq.prsAdded)), 1615 FlakesIgnored: int(atomic.LoadInt32(&sq.flakesIgnored)), 1616 Initialized: atomic.LoadInt32(&sq.loopStarts) > 1, 1617 InstantMerges: int(atomic.LoadInt32(&sq.instantMerges)), 1618 BatchMerges: int(atomic.LoadInt32(&sq.batchMerges)), 1619 LastMergeTime: sq.lastMergeTime, 1620 MergeRate: sq.calcMergeRateWithTail(), 1621 MergesSinceRestart: int(atomic.LoadInt32(&sq.totalMerges)), 1622 Removed: int(atomic.LoadInt32(&sq.prsRemoved)), 1623 RetestsAvoided: int(atomic.LoadInt32(&sq.retestsAvoided)), 1624 StartTime: sq.startTime, 1625 Tested: 
int(atomic.LoadInt32(&sq.prsTested)), 1626 } 1627 sq.serve(sq.marshal(data), res, req) 1628 } 1629 1630 func (sq *SubmitQueue) serveFlakes(res http.ResponseWriter, req *http.Request) { 1631 data := sq.e2e.Flakes() 1632 sq.serve(mungerutil.PrettyMarshal(data), res, req) 1633 } 1634 1635 func (sq *SubmitQueue) serveMetadata(res http.ResponseWriter, req *http.Request) { 1636 sq.Lock() 1637 data := sq.marshal(sq.Metadata) 1638 sq.Unlock() 1639 sq.serve(data, res, req) 1640 } 1641 1642 func (sq *SubmitQueue) serveBatch(res http.ResponseWriter, req *http.Request) { 1643 sq.serve(sq.marshal(sq.batchStatus), res, req) 1644 } 1645 1646 func (sq *SubmitQueue) serveMergeInfo(res http.ResponseWriter, req *http.Request) { 1647 // Lock to get options since we are not running in the main goroutine. 1648 sq.opts.Lock() 1649 doNotMergeMilestones := sq.DoNotMergeMilestones 1650 additionalLabels := sq.AdditionalRequiredLabels 1651 blockingLabels := sq.BlockingLabels 1652 gateApproved := sq.GateApproved 1653 gateCLA := sq.GateCLA 1654 mergeContexts := mungeopts.RequiredContexts.Merge 1655 retestContexts := mungeopts.RequiredContexts.Retest 1656 claYesLabels := sq.ClaYesLabels 1657 sq.opts.Unlock() 1658 1659 res.Header().Set("Content-type", "text/plain") 1660 res.WriteHeader(http.StatusOK) 1661 var out bytes.Buffer 1662 out.WriteString("PRs must meet the following set of conditions to be considered for automatic merging by the submit queue.") 1663 out.WriteString("<ol>") 1664 if gateCLA { 1665 out.WriteString(fmt.Sprintf("<li>The PR must have one of the following labels: %q </li>", claYesLabels)) 1666 } 1667 out.WriteString("<li>The PR must be mergeable. 
aka cannot need a rebase</li>") 1668 if len(mergeContexts) > 0 || len(retestContexts) > 0 { 1669 out.WriteString("<li>All of the following github statuses must be green") 1670 out.WriteString("<ul>") 1671 for _, context := range mergeContexts { 1672 out.WriteString(fmt.Sprintf("<li>%s</li>", context)) 1673 } 1674 for _, context := range retestContexts { 1675 out.WriteString(fmt.Sprintf("<li>%s</li>", context)) 1676 } 1677 out.WriteString("</ul>") 1678 } 1679 out.WriteString(fmt.Sprintf("<li>The PR cannot have any of the following milestones: %q</li>", doNotMergeMilestones)) 1680 out.WriteString(fmt.Sprintf(`<li>The PR must have the %q label</li>`, lgtmLabel)) 1681 out.WriteString(fmt.Sprintf("<li>The PR must not have been updated since the %q label was applied</li>", lgtmLabel)) 1682 if gateApproved { 1683 out.WriteString(fmt.Sprintf(`<li>The PR must have the %q label</li>`, approvedLabel)) 1684 } 1685 if len(additionalLabels) > 0 { 1686 out.WriteString(fmt.Sprintf(`<li>The PR must have the following labels: %q</li>`, additionalLabels)) 1687 } 1688 if len(blockingLabels) > 0 { 1689 out.WriteString(fmt.Sprintf(`<li>The PR must not have the following labels: %q</li>`, blockingLabels)) 1690 } 1691 out.WriteString(`<li>The PR must not have the any labels starting with "do-not-merge"</li>`) 1692 out.WriteString(`</ol><br>`) 1693 out.WriteString("The PR can then be queued to re-test before merge. 
Once it reaches the top of the queue all of the above conditions must be true but so must the following:") 1694 out.WriteString("<ol>") 1695 if len(retestContexts) > 0 { 1696 out.WriteString("<li>All of the following tests must pass a second time") 1697 out.WriteString("<ul>") 1698 for _, context := range retestContexts { 1699 out.WriteString(fmt.Sprintf("<li>%s</li>", context)) 1700 } 1701 out.WriteString("</ul>") 1702 out.WriteString(fmt.Sprintf("Unless the %q or %q label is present</li>", retestNotRequiredLabel, retestNotRequiredDocsOnlyLabel)) 1703 } 1704 out.WriteString("</ol>") 1705 out.WriteString("And then the PR will be merged!!") 1706 res.Write(out.Bytes()) 1707 } 1708 1709 func writeLabel(label string, res http.ResponseWriter) { 1710 out := fmt.Sprintf(` <li>%q label 1711 <ul> 1712 <li>A PR with %q will come next</li> 1713 </ul> 1714 </li> 1715 `, label, label) 1716 res.Write([]byte(out)) 1717 } 1718 1719 func (sq *SubmitQueue) servePriorityInfo(res http.ResponseWriter, req *http.Request) { 1720 res.Header().Set("Content-type", "text/plain") 1721 res.WriteHeader(http.StatusOK) 1722 res.Write([]byte(`The merge queue is sorted by the following. If there is a tie in any test the next test will be used. 
1723 <ol> 1724 <li>'` + criticalFixLabel + `' label 1725 <ul> 1726 <li>A PR with '` + criticalFixLabel + `' will come first</li> 1727 <li>A PR with '` + criticalFixLabel + `' will merge even if the e2e tests are blocked</li> 1728 </ul> 1729 </li> 1730 `)) 1731 for i := 1; i <= lastHighPriorityLabel; i++ { 1732 writeLabel(labelPriorities[i], res) 1733 } 1734 res.Write([]byte(` <li>Release milestone due date 1735 <ul> 1736 <li>Release milestones are of the form vX.Y where X and Y are integers</li> 1737 <li>The release milestore must have a due date set to affect queue order</li> 1738 <li>Other milestones are ignored</li> 1739 </ul> 1740 </li> 1741 `)) 1742 for i := lastHighPriorityLabel + 1; i < len(labelPriorities); i++ { 1743 writeLabel(labelPriorities[i], res) 1744 } 1745 res.Write([]byte(` <li>First time at which the LGTM label was applied. 1746 <ul> 1747 <li>This means all PRs start at the bottom of the queue (within their priority and milestone bands, of course) and progress towards the top.</li> 1748 </ul> 1749 </li> 1750 </ol> `)) 1751 } 1752 1753 func (sq *SubmitQueue) getHealthSVG() []byte { 1754 sq.Lock() 1755 defer sq.Unlock() 1756 blocked := false 1757 blockingJobs := make([]string, 0) 1758 blocked = !sq.health.MergePossibleNow 1759 status := "running" 1760 color := "brightgreen" 1761 if blocked { 1762 status = "blocked" 1763 color = "red" 1764 for job, status := range sq.e2e.GetBuildStatus() { 1765 if status.Status == "Not Stable" { 1766 job = strings.Replace(job, "kubernetes-", "", -1) 1767 blockingJobs = append(blockingJobs, job) 1768 } 1769 } 1770 sort.Strings(blockingJobs) 1771 if len(blockingJobs) > 3 { 1772 blockingJobs = append(blockingJobs[:3], "...") 1773 } 1774 if len(blockingJobs) > 0 { 1775 status += " by " + strings.Join(blockingJobs, ", ") 1776 } 1777 } 1778 return shield.Make("queue", status, color) 1779 } 1780 1781 func (sq *SubmitQueue) serveHealthSVG(res http.ResponseWriter, req *http.Request) { 1782 res.Header().Set("Content-type", 
"image/svg+xml") 1783 res.Header().Set("Cache-Control", "max-age=60") 1784 res.WriteHeader(http.StatusOK) 1785 res.Write(sq.getHealthSVG()) 1786 } 1787 1788 func (sq *SubmitQueue) isStaleIssueComment(obj *github.MungeObject, comment *githubapi.IssueComment) bool { 1789 if !obj.IsRobot(comment.User) { 1790 return false 1791 } 1792 if *comment.Body != newRetestBody { 1793 return false 1794 } 1795 stale := commentBeforeLastCI(obj, comment, mungeopts.RequiredContexts.Retest) 1796 if stale { 1797 glog.V(6).Infof("Found stale SubmitQueue safe to merge comment") 1798 } 1799 return stale 1800 } 1801 1802 // StaleIssueComments returns a slice of stale issue comments. 1803 func (sq *SubmitQueue) StaleIssueComments(obj *github.MungeObject, comments []*githubapi.IssueComment) []*githubapi.IssueComment { 1804 return forEachCommentTest(obj, comments, sq.isStaleIssueComment) 1805 }