github.com/shashidharatd/test-infra@v0.0.0-20171006011030-71304e1ca560/mungegithub/mungers/submit-queue.go (about) 1 /* 2 Copyright 2015 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package mungers 18 19 import ( 20 "bytes" 21 "encoding/json" 22 "errors" 23 "fmt" 24 "math" 25 "net/http" 26 "sort" 27 "strconv" 28 "strings" 29 "sync" 30 "sync/atomic" 31 "time" 32 33 utilclock "k8s.io/kubernetes/pkg/util/clock" 34 "k8s.io/kubernetes/pkg/util/sets" 35 36 "k8s.io/contrib/test-utils/utils" 37 "k8s.io/test-infra/mungegithub/features" 38 "k8s.io/test-infra/mungegithub/github" 39 "k8s.io/test-infra/mungegithub/mungeopts" 40 "k8s.io/test-infra/mungegithub/mungers/e2e" 41 fake_e2e "k8s.io/test-infra/mungegithub/mungers/e2e/fake" 42 "k8s.io/test-infra/mungegithub/mungers/mungerutil" 43 "k8s.io/test-infra/mungegithub/mungers/shield" 44 "k8s.io/test-infra/mungegithub/options" 45 "k8s.io/test-infra/mungegithub/sharedmux" 46 47 "github.com/NYTimes/gziphandler" 48 "github.com/golang/glog" 49 githubapi "github.com/google/go-github/github" 50 "github.com/prometheus/client_golang/prometheus" 51 ) 52 53 const ( 54 approvedLabel = "approved" 55 lgtmLabel = "lgtm" 56 retestNotRequiredLabel = "retest-not-required" 57 retestNotRequiredDocsOnlyLabel = "retest-not-required-docs-only" 58 doNotMergeLabel = "do-not-merge" 59 wipLabel = "do-not-merge/work-in-progress" 60 holdLabel = "do-not-merge/hold" 61 deprecatedReleaseNoteLabelNeeded = "release-note-label-needed" 62 releaseNoteLabelNeeded = "do-not-merge/release-note-label-needed" 63 cncfClaYesLabel = "cncf-cla: yes" 64 cncfClaNoLabel = "cncf-cla: no" 65 claHumanLabel = "cla: human-approved" 66 criticalFixLabel = "queue/critical-fix" 67 blocksOthersLabel = "queue/blocks-others" 68 fixLabel = "queue/fix" 69 multirebaseLabel = "queue/multiple-rebases" 70 71 sqContext = "Submit Queue" 72 73 githubE2EPollTime = 30 * time.Second 74 ) 75 76 var ( 77 // This MUST cause a RETEST of everything in the mungeopts.RequiredContexts.Retest 78 newRetestBody = "/test all [submit-queue is verifying that this PR is safe to merge]" 79 80 // this is the order in which labels will be compared for queue priority 81 labelPriorities = []string{criticalFixLabel, retestNotRequiredLabel, retestNotRequiredDocsOnlyLabel, multirebaseLabel, fixLabel, blocksOthersLabel} 82 // high priority labels are checked before the release 83 lastHighPriorityLabel = 2 // retestNotRequiredDocsOnlyLabel 84 ) 85 86 type submitStatus struct { 87 Time time.Time 88 statusPullRequest 89 Reason string 90 } 91 92 type statusPullRequest struct { 93 Number int 94 URL string 95 Title string 96 Login string 97 AvatarURL string 98 Additions int 99 Deletions int 100 ExtraInfo []string 101 BaseRef string 102 } 103 104 type e2eQueueStatus struct { 105 E2ERunning *statusPullRequest 106 E2EQueue []*statusPullRequest 107 BatchStatus *submitQueueBatchStatus 108 } 109 110 type submitQueueStatus struct { 111 PRStatus map[string]submitStatus 112 } 113 114 // Information about the e2e test health. Call updateHealth on the SubmitQueue 115 // at roughly constant intervals to keep this up to date. The mergeable fraction 116 // of time for the queue as a whole and the individual jobs will then be 117 // NumStable[PerJob] / TotalLoops. 118 type submitQueueHealth struct { 119 TotalLoops int 120 NumStable int 121 NumStablePerJob map[string]int 122 MergePossibleNow bool 123 } 124 125 // Generate health information using a queue of healthRecords. The bools are 126 // true for stable and false otherwise. 127 type healthRecord struct { 128 Time time.Time 129 Overall bool 130 Jobs map[string]bool 131 } 132 133 // information about the sq itself including how fast things are merging and 134 // how long since the last merge 135 type submitQueueStats struct { 136 Added int // Number of items added to the queue since restart 137 FlakesIgnored int 138 Initialized bool // true if we've made at least one complete pass 139 InstantMerges int // Number of merges without retests required 140 BatchMerges int // Number of merges caused by batch 141 LastMergeTime time.Time 142 MergeRate float64 143 MergesSinceRestart int 144 Removed int // Number of items dequeued since restart 145 RetestsAvoided int 146 StartTime time.Time 147 Tested int // Number of e2e tests completed 148 } 149 150 // pull-request that has been tested as successful, but interrupted because head flaked 151 type submitQueueInterruptedObject struct { 152 obj *github.MungeObject 153 // If these two items match when we're about to kick off a retest, it's safe to skip the retest. 154 interruptedMergeHeadSHA string 155 interruptedMergeBaseSHA string 156 } 157 158 // Contains metadata about this instance of the submit queue such as URLs. 159 // Consumed by the template system. 160 type submitQueueMetadata struct { 161 ProjectName string 162 163 ChartURL string 164 HistoryURL string 165 // chartURL and historyURL are option storage locations. They are distinct from ChartURL and 166 // HistoryURL since the the public variables are used asynchronously by a fileserver and updates 167 // to the options values should not cause a race condition. 168 chartURL string 169 historyURL string 170 171 RepoPullURL string 172 ProwURL string 173 } 174 175 type submitQueueBatchStatus struct { 176 Error map[string]string 177 Running *prowJob 178 } 179 180 type prometheusMetrics struct { 181 Blocked prometheus.Gauge 182 OpenPRs prometheus.Gauge 183 QueuedPRs prometheus.Gauge 184 MergeCount prometheus.Counter 185 LastMergeTime prometheus.Gauge 186 } 187 188 var ( 189 sqPromMetrics = prometheusMetrics{ 190 Blocked: prometheus.NewGauge(prometheus.GaugeOpts{ 191 Name: "submitqueue_blocked", 192 Help: "The submit-queue is currently blocked", 193 }), 194 OpenPRs: prometheus.NewGauge(prometheus.GaugeOpts{ 195 Name: "submitqueue_open_pullrequests_total", 196 Help: "Number of open pull-requests", 197 }), 198 QueuedPRs: prometheus.NewGauge(prometheus.GaugeOpts{ 199 Name: "submitqueue_queued_pullrequests_total", 200 Help: "Number of pull-requests queued", 201 }), 202 MergeCount: prometheus.NewCounter(prometheus.CounterOpts{ 203 Name: "submitqueue_merge_total", 204 Help: "Number of merges done", 205 }), 206 LastMergeTime: prometheus.NewGauge(prometheus.GaugeOpts{ 207 Name: "submitqueue_time_of_last_merge", 208 Help: "Time of last merge", 209 }), 210 } 211 ) 212 213 // marshaled in serveCIStatus 214 type jobStatus struct { 215 State string `json:"state"` 216 BuildID string `json:"build_id"` 217 URL string `json:"url"` 218 } 219 220 // SubmitQueue will merge PR which meet a set of requirements. 221 // PR must have LGTM after the last commit 222 // PR must have passed all github CI checks 223 // The google internal jenkins instance must be passing the BlockingJobNames e2e tests 224 type SubmitQueue struct { 225 githubConfig *github.Config 226 opts *options.Options 227 NonBlockingJobNames []string 228 229 GateApproved bool 230 GateCLA bool 231 GateGHReviewApproved bool 232 GateGHReviewChangesRequested bool 233 234 // AdditionalRequiredLabels is a set of additional labels required for merging 235 // on top of the existing required ("lgtm", "approved", "cncf-cla: yes"). 236 AdditionalRequiredLabels []string 237 238 // If FakeE2E is true, don't try to connect to JenkinsHost, all jobs are passing. 239 FakeE2E bool 240 241 DoNotMergeMilestones []string 242 243 Metadata submitQueueMetadata 244 AdminPort int 245 246 sync.Mutex 247 prStatus map[string]submitStatus // protected by sync.Mutex 248 statusHistory []submitStatus // protected by sync.Mutex 249 lastClosedTime time.Time 250 251 clock utilclock.Clock 252 startTime time.Time // when the queue started (duh) 253 lastMergeTime time.Time 254 totalMerges int32 255 mergeRate float64 // per 24 hours 256 loopStarts int32 // if > 1, then we must have made a complete pass. 257 258 githubE2ERunning *github.MungeObject // protect by sync.Mutex! 259 githubE2EQueue map[int]*github.MungeObject // protected by sync.Mutex! 260 githubE2EPollTime time.Duration 261 lgtmTimeCache *mungerutil.LabelTimeCache 262 githubE2ELastPRNum int 263 264 lastE2EStable bool // was e2e stable last time they were checked, protect by sync.Mutex 265 e2e e2e.E2ETester 266 267 interruptedObj *submitQueueInterruptedObject 268 flakesIgnored int32 // Increments for each merge while 1+ job is flaky 269 instantMerges int32 // Increments whenever we merge without retesting 270 batchMerges int32 // Increments whenever we merge because of a batch 271 prsAdded int32 // Increments whenever an items queues 272 prsRemoved int32 // Increments whenever an item dequeues 273 prsTested int32 // Number of prs that completed second testing 274 retestsAvoided int32 // Increments whenever we skip due to head not changing. 275 276 health submitQueueHealth 277 healthHistory []healthRecord 278 279 emergencyMergeStopFlag int32 280 281 features *features.Features 282 283 mergeLock sync.Mutex // acquired when attempting to merge a specific PR 284 ProwURL string // prow base page 285 BatchEnabled bool 286 ContextURL string 287 batchStatus submitQueueBatchStatus 288 ciStatus map[string]map[string]jobStatus // type (eg batch) : job : status 289 290 // MergeToMasterMessage is an extra message when PR is merged to master branch, 291 // it must not end in a period. 292 MergeToMasterMessage string 293 } 294 295 func init() { 296 clock := utilclock.RealClock{} 297 prometheus.MustRegister(sqPromMetrics.Blocked) 298 prometheus.MustRegister(sqPromMetrics.OpenPRs) 299 prometheus.MustRegister(sqPromMetrics.QueuedPRs) 300 prometheus.MustRegister(sqPromMetrics.MergeCount) 301 prometheus.MustRegister(sqPromMetrics.LastMergeTime) 302 sq := &SubmitQueue{ 303 clock: clock, 304 startTime: clock.Now(), 305 lastMergeTime: clock.Now(), 306 lastE2EStable: true, 307 prStatus: map[string]submitStatus{}, 308 githubE2EQueue: map[int]*github.MungeObject{}, 309 } 310 RegisterMungerOrDie(sq) 311 RegisterStaleIssueComments(sq) 312 } 313 314 // Name is the name usable in --pr-mungers 315 func (sq *SubmitQueue) Name() string { return "submit-queue" } 316 317 // RequiredFeatures is a slice of 'features' that must be provided 318 func (sq *SubmitQueue) RequiredFeatures() []string { 319 return []string{features.BranchProtectionFeature, features.ServerFeatureName} 320 } 321 322 func (sq *SubmitQueue) emergencyMergeStop() bool { 323 return atomic.LoadInt32(&sq.emergencyMergeStopFlag) != 0 324 } 325 326 func (sq *SubmitQueue) setEmergencyMergeStop(stopMerges bool) { 327 if stopMerges { 328 atomic.StoreInt32(&sq.emergencyMergeStopFlag, 1) 329 } else { 330 atomic.StoreInt32(&sq.emergencyMergeStopFlag, 0) 331 } 332 } 333 334 // EmergencyStopHTTP sets the emergency stop flag. It expects the path of 335 // req.URL to contain either "emergency/stop", "emergency/resume", or "emergency/status". 336 func (sq *SubmitQueue) EmergencyStopHTTP(res http.ResponseWriter, req *http.Request) { 337 switch { 338 case strings.Contains(req.URL.Path, "emergency/stop"): 339 sq.setEmergencyMergeStop(true) 340 case strings.Contains(req.URL.Path, "emergency/resume"): 341 sq.setEmergencyMergeStop(false) 342 case strings.Contains(req.URL.Path, "emergency/status"): 343 default: 344 http.NotFound(res, req) 345 return 346 } 347 sq.serve(sq.marshal(struct{ EmergencyInProgress bool }{sq.emergencyMergeStop()}), res, req) 348 } 349 350 func round(num float64) int { 351 return int(num + math.Copysign(0.5, num)) 352 } 353 354 func toFixed(num float64) float64 { 355 output := math.Pow(10, float64(3)) 356 return float64(round(num*output)) / output 357 } 358 359 // This is the calculation of the exponential smoothing factor. It tries to 360 // make sure that if we get lots of fast merges we don't race the 'daily' 361 // avg really high really fast. But more importantly it means that if merges 362 // start going slowly the 'daily' average will get pulled down a lot by one 363 // slow merge instead of requiring numerous merges to get pulled down 364 func getSmoothFactor(dur time.Duration) float64 { 365 hours := dur.Hours() 366 smooth := .155*math.Log(hours) + .422 367 if smooth < .1 { 368 return .1 369 } 370 if smooth > .999 { 371 return .999 372 } 373 return smooth 374 } 375 376 // This calculates an exponentially smoothed merge Rate based on the formula 377 // newRate = (1-smooth)oldRate + smooth*newRate 378 // Which is really great and simple for constant time series data. But of course 379 // ours isn't time series data so I vary the smoothing factor based on how long 380 // its been since the last entry. See the comments on the `getSmoothFactor` for 381 // a discussion of why. 382 // This whole thing was dreamed up by eparis one weekend via a combination 383 // of guess-and-test and intuition. Someone who knows about this stuff 384 // is likely to laugh at the naivete. Point him to where someone intelligent 385 // has thought about this stuff and he will gladly do something smart. 386 // Merges that took less than 5 minutes are ignored completely for the rate 387 // calculation. 388 func calcMergeRate(oldRate float64, last, now time.Time) float64 { 389 since := now.Sub(last) 390 if since <= 5*time.Minute { 391 // retest-not-required PR merges shouldn't affect our best 392 // guess about the rate. 393 return oldRate 394 } 395 var rate float64 396 if since == 0 { 397 rate = 96 398 } else { 399 rate = 24.0 * time.Hour.Hours() / since.Hours() 400 } 401 smoothingFactor := getSmoothFactor(since) 402 mergeRate := ((1.0 - smoothingFactor) * oldRate) + (smoothingFactor * rate) 403 return toFixed(mergeRate) 404 } 405 406 // Updates a smoothed rate at which PRs are merging per day. 407 // Updates merge stats. Should be called once for every merge. 408 func (sq *SubmitQueue) updateMergeRate() { 409 now := sq.clock.Now() 410 sq.mergeRate = calcMergeRate(sq.mergeRate, sq.lastMergeTime, now) 411 412 // Update stats 413 sqPromMetrics.MergeCount.Inc() 414 atomic.AddInt32(&sq.totalMerges, 1) 415 sq.lastMergeTime = now 416 sqPromMetrics.LastMergeTime.Set(float64(sq.lastMergeTime.Unix())) 417 } 418 419 // This calculated the smoothed merge rate BUT it looks at the time since 420 // the last merge vs 'Now'. If we have not passed the next 'expected' time 421 // for a merge this just returns previous calculations. If 'Now' is later 422 // than we would expect given the existing mergeRate then pretend a merge 423 // happened right now and return the new merge rate. This way the merge rate 424 // is lower even if no merge has happened in a long time. 425 func (sq *SubmitQueue) calcMergeRateWithTail() float64 { 426 now := sq.clock.Now() 427 428 if sq.mergeRate == 0 { 429 return 0 430 } 431 // Figure out when we think the next merge would happen given the history 432 next := time.Duration(24/sq.mergeRate*time.Hour.Hours()) * time.Hour 433 expectedMergeTime := sq.lastMergeTime.Add(next) 434 435 // If we aren't there yet, just return the history 436 if !now.After(expectedMergeTime) { 437 return sq.mergeRate 438 } 439 440 // Pretend as though a merge happened right now to pull down the rate 441 return calcMergeRate(sq.mergeRate, sq.lastMergeTime, now) 442 } 443 444 // Initialize will initialize the munger 445 func (sq *SubmitQueue) Initialize(config *github.Config, features *features.Features) error { 446 sq.features = features 447 return sq.internalInitialize(config, features, "") 448 } 449 450 // internalInitialize will initialize the munger. 451 // if overrideURL is specified, will create testUtils 452 func (sq *SubmitQueue) internalInitialize(config *github.Config, features *features.Features, overrideURL string) error { 453 sq.Lock() 454 defer sq.Unlock() 455 456 // initialize to invalid pr number 457 sq.githubE2ELastPRNum = -1 458 459 sq.Metadata.ChartURL = sq.Metadata.chartURL 460 sq.Metadata.HistoryURL = sq.Metadata.historyURL 461 sq.Metadata.ProwURL = sq.ProwURL 462 sq.Metadata.RepoPullURL = fmt.Sprintf("https://github.com/%s/%s/pulls/", config.Org, config.Project) 463 sq.Metadata.ProjectName = strings.Title(config.Project) 464 sq.githubConfig = config 465 466 if sq.BatchEnabled && sq.ProwURL == "" { 467 return errors.New("batch merges require prow-url to be set") 468 } 469 470 // TODO: This is not how injection for tests should work. 471 if sq.FakeE2E { 472 sq.e2e = &fake_e2e.FakeE2ETester{} 473 } else { 474 var gcs *utils.Utils 475 if overrideURL != "" { 476 gcs = utils.NewTestUtils("bucket", "logs", overrideURL) 477 } else { 478 gcs = utils.NewWithPresubmitDetection( 479 mungeopts.GCS.BucketName, mungeopts.GCS.LogDir, 480 mungeopts.GCS.PullKey, mungeopts.GCS.PullLogDir, 481 ) 482 } 483 484 sq.e2e = (&e2e.RealE2ETester{ 485 Opts: sq.opts, 486 NonBlockingJobNames: &sq.NonBlockingJobNames, 487 BuildStatus: map[string]e2e.BuildInfo{}, 488 GoogleGCSBucketUtils: gcs, 489 }).Init(sharedmux.Admin) 490 } 491 492 sq.lgtmTimeCache = mungerutil.NewLabelTimeCache(lgtmLabel) 493 494 if features.Server.Enabled { 495 features.Server.Handle("/prs", gziphandler.GzipHandler(http.HandlerFunc(sq.servePRs))) 496 features.Server.Handle("/history", gziphandler.GzipHandler(http.HandlerFunc(sq.serveHistory))) 497 features.Server.Handle("/github-e2e-queue", gziphandler.GzipHandler(http.HandlerFunc(sq.serveGithubE2EStatus))) 498 features.Server.Handle("/merge-info", gziphandler.GzipHandler(http.HandlerFunc(sq.serveMergeInfo))) 499 features.Server.Handle("/priority-info", gziphandler.GzipHandler(http.HandlerFunc(sq.servePriorityInfo))) 500 features.Server.Handle("/health", gziphandler.GzipHandler(http.HandlerFunc(sq.serveHealth))) 501 features.Server.Handle("/health.svg", gziphandler.GzipHandler(http.HandlerFunc(sq.serveHealthSVG))) 502 features.Server.Handle("/sq-stats", gziphandler.GzipHandler(http.HandlerFunc(sq.serveSQStats))) 503 features.Server.Handle("/flakes", gziphandler.GzipHandler(http.HandlerFunc(sq.serveFlakes))) 504 features.Server.Handle("/metadata", gziphandler.GzipHandler(http.HandlerFunc(sq.serveMetadata))) 505 if sq.BatchEnabled { 506 features.Server.Handle("/batch", gziphandler.GzipHandler(http.HandlerFunc(sq.serveBatch))) 507 } 508 // this endpoint is useless without access to prow 509 if sq.ProwURL != "" { 510 features.Server.Handle("/ci-status", gziphandler.GzipHandler(http.HandlerFunc(sq.serveCIStatus))) 511 } 512 } 513 514 sharedmux.Admin.HandleFunc("/api/emergency/stop", sq.EmergencyStopHTTP) 515 sharedmux.Admin.HandleFunc("/api/emergency/resume", sq.EmergencyStopHTTP) 516 sharedmux.Admin.HandleFunc("/api/emergency/status", sq.EmergencyStopHTTP) 517 518 if sq.githubE2EPollTime == 0 { 519 sq.githubE2EPollTime = githubE2EPollTime 520 } 521 522 sq.healthHistory = make([]healthRecord, 0) 523 524 go sq.handleGithubE2EAndMerge() 525 go sq.updateGoogleE2ELoop() 526 if sq.BatchEnabled { 527 go sq.handleGithubE2EBatchMerge() 528 } 529 if sq.ProwURL != "" { 530 go sq.monitorProw() 531 } 532 533 if sq.AdminPort != 0 { 534 go http.ListenAndServe(fmt.Sprintf("0.0.0.0:%v", sq.AdminPort), sharedmux.Admin) 535 } 536 return nil 537 } 538 539 // EachLoop is called at the start of every munge loop 540 func (sq *SubmitQueue) EachLoop() error { 541 issues := []*githubapi.Issue{} 542 if !sq.lastClosedTime.IsZero() { 543 listOpts := &githubapi.IssueListByRepoOptions{ 544 State: "closed", 545 Since: sq.lastClosedTime, 546 } 547 var err error 548 issues, err = sq.githubConfig.ListAllIssues(listOpts) 549 if err != nil { 550 return err 551 } 552 } else { 553 sq.lastClosedTime = time.Now() 554 } 555 556 sq.Lock() 557 for _, issue := range issues { 558 if issue.ClosedAt != nil && issue.ClosedAt.After(sq.lastClosedTime) { 559 sq.lastClosedTime = *issue.ClosedAt 560 } 561 delete(sq.prStatus, strconv.Itoa(*issue.Number)) 562 } 563 564 sq.updateHealth() 565 sqPromMetrics.OpenPRs.Set(float64(len(sq.prStatus))) 566 sqPromMetrics.QueuedPRs.Set(float64(len(sq.githubE2EQueue))) 567 568 objs := []*github.MungeObject{} 569 for _, obj := range sq.githubE2EQueue { 570 objs = append(objs, obj) 571 } 572 sq.Unlock() 573 574 for _, obj := range objs { 575 obj.Refresh() 576 // This should recheck it and clean up the queue, we don't care about the result 577 _ = sq.validForMerge(obj) 578 } 579 atomic.AddInt32(&sq.loopStarts, 1) 580 return nil 581 } 582 583 // RegisterOptions registers options for this munger; returns any that require a restart when changed. 584 func (sq *SubmitQueue) RegisterOptions(opts *options.Options) sets.String { 585 sq.opts = opts 586 opts.RegisterStringSlice(&sq.NonBlockingJobNames, "nonblocking-jobs", []string{}, "Comma separated list of jobs that don't block merges, but will have status reported and issues filed.") 587 opts.RegisterStringSlice(&sq.AdditionalRequiredLabels, "additional-required-labels", []string{}, "Comma separated list of labels required for merging PRs on top of the existing required.") 588 opts.RegisterBool(&sq.FakeE2E, "fake-e2e", false, "Whether to use a fake for testing E2E stability.") 589 opts.RegisterStringSlice(&sq.DoNotMergeMilestones, "do-not-merge-milestones", []string{}, "List of milestones which, when applied, will cause the PR to not be merged.") 590 opts.RegisterInt(&sq.AdminPort, "admin-port", 9999, "If non-zero, will serve administrative actions on this port.") 591 opts.RegisterString(&sq.Metadata.historyURL, "history-url", "", "URL to access the submit-queue instance's health history.") 592 opts.RegisterString(&sq.Metadata.chartURL, "chart-url", "", "URL to access the submit-queue instance's health charts.") 593 opts.RegisterString(&sq.ProwURL, "prow-url", "", "Prow deployment base URL to read batch results and direct users to.") 594 opts.RegisterBool(&sq.BatchEnabled, "batch-enabled", false, "Do batch merges (requires prow/splice coordination).") 595 opts.RegisterString(&sq.ContextURL, "context-url", "", "URL where the submit queue is serving - used in Github status contexts.") 596 opts.RegisterBool(&sq.GateApproved, "gate-approved", false, "Gate on approved label.") 597 opts.RegisterBool(&sq.GateCLA, "gate-cla", false, "Gate on cla labels.") 598 opts.RegisterString(&sq.MergeToMasterMessage, "merge-to-master-message", "", "Extra message when PR is merged to master branch.") 599 opts.RegisterBool(&sq.GateGHReviewApproved, "gh-review-approved", false, "Gate github review, approve") 600 opts.RegisterBool(&sq.GateGHReviewChangesRequested, "gh-review-changes-requested", false, "Gate github review, changes request") 601 602 opts.RegisterUpdateCallback(func(changed sets.String) error { 603 if changed.HasAny("prow-url", "batch-enabled") { 604 if sq.BatchEnabled && sq.ProwURL == "" { 605 return fmt.Errorf("batch merges require prow-url to be set") 606 } 607 } 608 return nil 609 }) 610 611 return sets.NewString( 612 "batch-enabled", // Need to start or kill batch processing. 613 "context-url", // Need to remunge all PRs to update statuses with new url. 614 "admin-port", // Need to restart server on new port. 615 // For the following: need to restart fileserver. 616 "chart-url", 617 "history-url", 618 // For the following: need to re-initialize e2e which is used by other goroutines. 619 "fake-e2e", 620 "gcs-bucket", 621 "gcs-logs-dir", 622 "pull-logs-dir", 623 "pull-key", 624 // For the following: need to remunge all PRs if changed from true to false. 625 "gate-cla", 626 "gate-approved", 627 // Need to remunge all PRs if anything changes in the following set of labels. 628 "additional-required-labels", 629 ) 630 } 631 632 // Hold the lock 633 func (sq *SubmitQueue) updateHealth() { 634 // Remove old entries from the front. 635 for len(sq.healthHistory) > 0 && time.Since(sq.healthHistory[0].Time).Hours() > 24.0 { 636 sq.healthHistory = sq.healthHistory[1:] 637 } 638 // Make the current record 639 emergencyStop := sq.emergencyMergeStop() 640 newEntry := healthRecord{ 641 Time: time.Now(), 642 Overall: !emergencyStop, 643 Jobs: map[string]bool{}, 644 } 645 for job, status := range sq.e2e.GetBuildStatus() { 646 // Ignore flakes. 647 newEntry.Jobs[job] = status.Status != "Not Stable" 648 } 649 if emergencyStop { 650 // invent an "emergency stop" job that's failing. 651 newEntry.Jobs["Emergency Stop"] = false 652 } 653 sq.healthHistory = append(sq.healthHistory, newEntry) 654 // Now compute the health structure so we don't have to do it on page load 655 sq.health.TotalLoops = len(sq.healthHistory) 656 sq.health.NumStable = 0 657 sq.health.NumStablePerJob = map[string]int{} 658 sq.health.MergePossibleNow = !emergencyStop 659 if sq.health.MergePossibleNow { 660 sqPromMetrics.Blocked.Set(0) 661 } else { 662 sqPromMetrics.Blocked.Set(1) 663 } 664 for _, record := range sq.healthHistory { 665 if record.Overall { 666 sq.health.NumStable++ 667 } 668 for job, stable := range record.Jobs { 669 if _, ok := sq.health.NumStablePerJob[job]; !ok { 670 sq.health.NumStablePerJob[job] = 0 671 } 672 if stable { 673 sq.health.NumStablePerJob[job]++ 674 } 675 } 676 } 677 } 678 679 func (sq *SubmitQueue) monitorProw() { 680 nonBlockingJobNames := make(map[string]bool) 681 requireRetestJobNames := make(map[string]bool) 682 683 for { 684 sq.opts.Lock() 685 for _, jobName := range sq.NonBlockingJobNames { 686 nonBlockingJobNames[jobName] = true 687 } 688 for _, jobName := range mungeopts.RequiredContexts.Retest { 689 requireRetestJobNames[jobName] = true 690 } 691 url := sq.ProwURL + "/data.js" 692 693 currentPR := -1 694 if sq.githubE2ERunning != nil { 695 currentPR = *sq.githubE2ERunning.Issue.Number 696 } 697 sq.opts.Unlock() 698 699 lastPR := sq.githubE2ELastPRNum 700 // get current job info from prow 701 allJobs, err := getJobs(url) 702 if err != nil { 703 glog.Errorf("Error reading batch jobs from Prow URL %v: %v", url, err) 704 time.Sleep(time.Minute) 705 continue 706 } 707 // TODO: copy these from sq first instead 708 ciStatus := make(map[string]map[string]jobStatus) 709 ciLatest := make(map[string]map[string]time.Time) 710 711 for _, job := range allJobs { 712 if job.Finished == "" || job.BuildID == "" { 713 continue 714 } 715 // type/category 716 key := job.Type + "/" 717 // the most recent submit-queue PR(s) 718 if job.Number == currentPR || job.Number == lastPR { 719 key += "single" 720 } else if nonBlockingJobNames[job.Job] { 721 key += "nonblocking" 722 } else if requireRetestJobNames[job.Job] { 723 key += "requiredretest" 724 } 725 726 ft, err := time.Parse(time.RFC3339Nano, job.Finished) 727 if err != nil { 728 glog.Errorf("Error parsing job finish time %s: %v", job.Finished, err) 729 continue 730 } 731 732 if _, ok := ciLatest[key]; !ok { 733 ciLatest[key] = make(map[string]time.Time) 734 ciStatus[key] = make(map[string]jobStatus) 735 } 736 latest, ok := ciLatest[key][job.Job] 737 738 // TODO: flake cache? 739 if !ok || latest.Before(ft) { 740 ciLatest[key][job.Job] = ft 741 ciStatus[key][job.Job] = jobStatus{ 742 State: job.State, 743 BuildID: job.BuildID, 744 URL: job.URL, 745 } 746 } 747 } 748 749 sq.Lock() 750 sq.ciStatus = ciStatus 751 sq.Unlock() 752 753 time.Sleep(time.Minute) 754 } 755 } 756 757 func (sq *SubmitQueue) e2eStable(aboutToMerge bool) bool { 758 wentStable := false 759 wentUnstable := false 760 761 sq.e2e.LoadNonBlockingStatus() 762 stable := !sq.emergencyMergeStop() 763 764 sq.Lock() 765 last := sq.lastE2EStable 766 if last && !stable { 767 wentUnstable = true 768 } else if !last && stable { 769 wentStable = true 770 } 771 sq.lastE2EStable = stable 772 sq.Unlock() 773 774 reason := "" 775 avatar := "" 776 if wentStable { 777 reason = e2eRecover 778 avatar = "success.png" 779 } else if wentUnstable { 780 reason = e2eFailure 781 avatar = "error.png" 782 } 783 if reason != "" { 784 submitStatus := submitStatus{ 785 Time: sq.clock.Now(), 786 statusPullRequest: statusPullRequest{ 787 Title: reason, 788 AvatarURL: avatar, 789 }, 790 Reason: reason, 791 } 792 sq.Lock() 793 sq.statusHistory = append(sq.statusHistory, submitStatus) 794 sq.Unlock() 795 } 796 return stable 797 } 798 799 // This serves little purpose other than to show updates every minute in the 800 // web UI. Stable() will get called as needed against individual PRs as well. 801 func (sq *SubmitQueue) updateGoogleE2ELoop() { 802 for { 803 _ = sq.e2eStable(false) 804 time.Sleep(1 * time.Minute) 805 } 806 } 807 808 func objToStatusPullRequest(obj *github.MungeObject) *statusPullRequest { 809 if obj == nil { 810 return &statusPullRequest{} 811 } 812 res := statusPullRequest{ 813 Number: *obj.Issue.Number, 814 URL: *obj.Issue.HTMLURL, 815 Title: *obj.Issue.Title, 816 Login: *obj.Issue.User.Login, 817 AvatarURL: *obj.Issue.User.AvatarURL, 818 } 819 pr, ok := obj.GetPR() 820 if !ok { 821 return &res 822 } 823 if pr.Additions != nil { 824 res.Additions = *pr.Additions 825 } 826 if pr.Deletions != nil { 827 res.Deletions = *pr.Deletions 828 } 829 if pr.Base != nil && pr.Base.Ref != nil { 830 res.BaseRef = *pr.Base.Ref 831 } 832 833 labelPriority := labelPriority(obj) 834 if labelPriority <= lastHighPriorityLabel { 835 res.ExtraInfo = append(res.ExtraInfo, labelPriorities[labelPriority]) 836 } 837 838 milestone, ok := obj.Annotations["milestone"] 839 if !ok { 840 milestone, _ = obj.ReleaseMilestone() 841 obj.Annotations["milestone"] = milestone 842 } 843 if milestone != "" { 844 res.ExtraInfo = append(res.ExtraInfo, milestone) 845 } 846 847 if labelPriority > lastHighPriorityLabel && labelPriority < len(labelPriorities) { 848 res.ExtraInfo = append(res.ExtraInfo, labelPriorities[labelPriority]) 849 } 850 851 return &res 852 } 853 854 func reasonToState(reason string) string { 855 switch reason { 856 case merged, mergedByHand, mergedSkippedRetest, mergedBatch: 857 return "success" 858 case e2eFailure, ghE2EQueued, ghE2EWaitingStart, ghE2ERunning: 859 return "success" 860 case unknown: 861 return "failure" 862 default: 863 return "pending" 864 } 865 } 866 867 // SetMergeStatus will set the status given a particular PR. This function should 868 // be used instead of manipulating the prStatus directly as sq.Lock() must be 869 // called when manipulating that structure 870 // `obj` is the active github object 871 // `reason` is the new 'status' for this object 872 func (sq *SubmitQueue) SetMergeStatus(obj *github.MungeObject, reason string) { 873 glog.V(4).Infof("SubmitQueue not merging %d because %q", *obj.Issue.Number, reason) 874 submitStatus := submitStatus{ 875 Time: sq.clock.Now(), 876 statusPullRequest: *objToStatusPullRequest(obj), 877 Reason: reason, 878 } 879 880 status, ok := obj.GetStatus(sqContext) 881 if !ok || status == nil || *status.Description != reason { 882 state := reasonToState(reason) 883 sq.opts.Lock() 884 contextURL := sq.ContextURL 885 sq.opts.Unlock() 886 url := fmt.Sprintf("%s/#/prs?prDisplay=%d&historyDisplay=%d", contextURL, *obj.Issue.Number, *obj.Issue.Number) 887 _ = obj.SetStatus(state, url, reason, sqContext) 888 } 889 890 sq.Lock() 891 defer sq.Unlock() 892 893 // If we are currently retesting E2E the normal munge loop might find 894 // that the ci tests are not green. That's normal and expected and we 895 // should just ignore that status update entirely. 896 if sq.githubE2ERunning != nil && *sq.githubE2ERunning.Issue.Number == *obj.Issue.Number && strings.HasPrefix(reason, ciFailure) { 897 return 898 } 899 900 if sq.onQueue(obj) { 901 sq.statusHistory = append(sq.statusHistory, submitStatus) 902 if len(sq.statusHistory) > 128 { 903 sq.statusHistory = sq.statusHistory[1:] 904 } 905 } 906 sq.prStatus[strconv.Itoa(*obj.Issue.Number)] = submitStatus 907 sq.cleanupOldE2E(obj, reason) 908 } 909 910 // setContextFailedStatus calls SetMergeStatus after determining a particular github status 911 // which is failed. 912 func (sq *SubmitQueue) setContextFailedStatus(obj *github.MungeObject, contexts []string) { 913 for i, context := range contexts { 914 contextSlice := contexts[i : i+1] 915 success, ok := obj.IsStatusSuccess(contextSlice) 916 if ok && success { 917 continue 918 } 919 failMsg := fmt.Sprintf(ciFailureFmt, context) 920 sq.SetMergeStatus(obj, failMsg) 921 return 922 } 923 glog.Errorf("Inside setContextFailedStatus() but none of the status's failed! %d: %v", obj.Number(), contexts) 924 sq.SetMergeStatus(obj, ciFailure) 925 } 926 927 // sq.Lock() MUST be held! 928 func (sq *SubmitQueue) getE2EQueueStatus() []*statusPullRequest { 929 queue := []*statusPullRequest{} 930 keys := sq.orderedE2EQueue() 931 for _, k := range keys { 932 obj := sq.githubE2EQueue[k] 933 request := objToStatusPullRequest(obj) 934 queue = append(queue, request) 935 } 936 return queue 937 } 938 939 func (sq *SubmitQueue) marshal(data interface{}) []byte { 940 b, err := json.Marshal(data) 941 if err != nil { 942 glog.Errorf("Unable to Marshal data: %#v: %v", data, err) 943 return nil 944 } 945 return b 946 } 947 948 func (sq *SubmitQueue) getQueueHistory() []byte { 949 sq.Lock() 950 defer sq.Unlock() 951 return sq.marshal(sq.statusHistory) 952 } 953 954 // GetQueueStatus returns a json representation of the state of the submit 955 // queue. This can be used to generate web pages about the submit queue. 956 func (sq *SubmitQueue) getQueueStatus() []byte { 957 status := submitQueueStatus{PRStatus: map[string]submitStatus{}} 958 sq.Lock() 959 defer sq.Unlock() 960 961 for key, value := range sq.prStatus { 962 status.PRStatus[key] = value 963 } 964 return sq.marshal(status) 965 } 966 967 func (sq *SubmitQueue) getGithubE2EStatus() []byte { 968 sq.Lock() 969 defer sq.Unlock() 970 status := e2eQueueStatus{ 971 E2EQueue: sq.getE2EQueueStatus(), 972 E2ERunning: objToStatusPullRequest(sq.githubE2ERunning), 973 BatchStatus: &sq.batchStatus, 974 } 975 return sq.marshal(status) 976 } 977 978 func noMergeMessage(label string) string { 979 return "Will not auto merge because " + label + " is present" 980 } 981 982 func noAdditionalLabelMessage(label string) string { 983 return "Will not auto merge because " + label + " is missing" 984 } 985 986 const ( 987 unknown = "unknown failure" 988 noCLA = "PR is missing CLA label; needs one of " + cncfClaYesLabel + " or " + claHumanLabel 989 noLGTM = "PR does not have " + lgtmLabel + " label." 990 noApproved = "PR does not have " + approvedLabel + " label." 991 lgtmEarly = "The PR was changed after the " + lgtmLabel + " label was added." 992 unmergeable = "PR is unable to be automatically merged. Needs rebase." 993 undeterminedMergability = "Unable to determine is PR is mergeable. Will try again later." 994 ciFailure = "Required Github CI test is not green" 995 ciFailureFmt = ciFailure + ": %s" 996 e2eFailure = "The e2e tests are failing. The entire submit queue is blocked." 997 e2eRecover = "The e2e tests started passing. The submit queue is unblocked." 998 merged = "MERGED!" 999 mergedSkippedRetest = "MERGED! (skipped retest because of label)" 1000 mergedBatch = "MERGED! (batch)" 1001 mergedByHand = "MERGED! (by hand outside of submit queue)" 1002 ghE2EQueued = "Queued to run github e2e tests a second time." 1003 ghE2EWaitingStart = "Requested and waiting for github e2e test to start running a second time." 1004 ghE2ERunning = "Running github e2e tests a second time." 1005 ghE2EFailed = "Second github e2e run failed." 1006 unmergeableMilestone = "Milestone is for a future release and cannot be merged" 1007 headCommitChanged = "This PR has changed since we ran the tests" 1008 ghReviewStateUnclear = "Cannot get gh reviews status" 1009 ghReviewApproved = "This pr has no Github review \"approved\"." 1010 ghReviewChangesRequested = "Reviewer(s) requested changes through github review process." 1011 ) 1012 1013 // validForMergeExt is the base logic about what PR can be automatically merged. 1014 // PRs must pass this logic to be placed on the queue and they must pass this 1015 // logic a second time to be retested/merged after they get to the top of 1016 // the queue. 1017 // 1018 // checkStatus is true if the PR should only merge if the appropriate Github status 1019 // checks are passing. 1020 // 1021 // If you update the logic PLEASE PLEASE PLEASE update serveMergeInfo() as well. 1022 func (sq *SubmitQueue) validForMergeExt(obj *github.MungeObject, checkStatus bool) bool { 1023 // Can't merge an issue! 1024 if !obj.IsPR() { 1025 return false 1026 } 1027 1028 // Can't merge something already merged. 1029 if m, ok := obj.IsMerged(); !ok { 1030 glog.Errorf("%d: unknown err", *obj.Issue.Number) 1031 sq.SetMergeStatus(obj, unknown) 1032 return false 1033 } else if m { 1034 sq.SetMergeStatus(obj, mergedByHand) 1035 return false 1036 } 1037 1038 // Lock to get options since we may be running on a goroutine besides the main one. 1039 sq.opts.Lock() 1040 gateCLA := sq.GateCLA 1041 gateApproved := sq.GateApproved 1042 doNotMergeMilestones := sq.DoNotMergeMilestones 1043 mergeContexts := mungeopts.RequiredContexts.Merge 1044 retestContexts := mungeopts.RequiredContexts.Retest 1045 additionalLabels := sq.AdditionalRequiredLabels 1046 sq.opts.Unlock() 1047 1048 milestone := obj.Issue.Milestone 1049 title := "" 1050 // Net set means the empty milestone, "" 1051 if milestone != nil && milestone.Title != nil { 1052 title = *milestone.Title 1053 } 1054 for _, blocked := range doNotMergeMilestones { 1055 if title == blocked || (title == "" && blocked == "NO-MILESTONE") { 1056 sq.SetMergeStatus(obj, unmergeableMilestone) 1057 return false 1058 } 1059 } 1060 1061 // Must pass CLA checks 1062 if gateCLA { 1063 if !obj.HasLabel(claHumanLabel) && !obj.HasLabel(cncfClaYesLabel) { 1064 sq.SetMergeStatus(obj, noCLA) 1065 return false 1066 } 1067 } 1068 1069 // Obviously must be mergeable 1070 if mergeable, ok := obj.IsMergeable(); !ok { 1071 sq.SetMergeStatus(obj, undeterminedMergability) 1072 return false 1073 } else if !mergeable { 1074 sq.SetMergeStatus(obj, unmergeable) 1075 return false 1076 } 1077 1078 // Validate the status information for this PR 1079 if checkStatus { 1080 if len(mergeContexts) > 0 { 1081 if success, ok := obj.IsStatusSuccess(mergeContexts); !ok || !success { 1082 sq.setContextFailedStatus(obj, mergeContexts) 1083 return false 1084 } 1085 } 1086 if len(retestContexts) > 0 { 1087 if success, ok := obj.IsStatusSuccess(retestContexts); !ok || !success { 1088 sq.setContextFailedStatus(obj, retestContexts) 1089 return false 1090 } 1091 } 1092 } 1093 1094 if sq.GateGHReviewApproved || sq.GateGHReviewChangesRequested { 1095 if approvedReview, changesRequestedReview, ok := obj.CollectGHReviewStatus(); !ok { 1096 sq.SetMergeStatus(obj, ghReviewStateUnclear) 1097 return false 1098 } else if len(approvedReview) == 0 && sq.GateGHReviewApproved { 1099 sq.SetMergeStatus(obj, ghReviewApproved) 1100 return false 1101 } else if len(changesRequestedReview) > 0 && sq.GateGHReviewChangesRequested { 1102 sq.SetMergeStatus(obj, ghReviewChangesRequested) 1103 return false 1104 } 1105 } 1106 1107 if !obj.HasLabel(lgtmLabel) { 1108 sq.SetMergeStatus(obj, noLGTM) 1109 return false 1110 } 1111 1112 // PR cannot change since LGTM was added 1113 if after, ok := obj.ModifiedAfterLabeled(lgtmLabel); !ok { 1114 sq.SetMergeStatus(obj, unknown) 1115 return false 1116 } else if after { 1117 sq.SetMergeStatus(obj, lgtmEarly) 1118 return false 1119 } 1120 1121 if gateApproved { 1122 if !obj.HasLabel(approvedLabel) { 1123 sq.SetMergeStatus(obj, noApproved) 1124 return false 1125 } 1126 } 1127 1128 // PR cannot have any labels which prevent merging. 1129 for _, label := range []string{ 1130 cherrypickUnapprovedLabel, 1131 blockedPathsLabel, 1132 deprecatedReleaseNoteLabelNeeded, 1133 releaseNoteLabelNeeded, 1134 doNotMergeLabel, 1135 wipLabel, 1136 holdLabel, 1137 } { 1138 if obj.HasLabel(label) { 1139 sq.SetMergeStatus(obj, noMergeMessage(label)) 1140 return false 1141 } 1142 } 1143 1144 for _, label := range additionalLabels { 1145 if !obj.HasLabel(label) { 1146 sq.SetMergeStatus(obj, noAdditionalLabelMessage(label)) 1147 return false 1148 } 1149 } 1150 1151 return true 1152 } 1153 1154 func (sq *SubmitQueue) validForMerge(obj *github.MungeObject) bool { 1155 return sq.validForMergeExt(obj, true) 1156 } 1157 1158 // Munge is the workhorse the will actually make updates to the PR 1159 func (sq *SubmitQueue) Munge(obj *github.MungeObject) { 1160 if !sq.validForMerge(obj) { 1161 return 1162 } 1163 1164 added := false 1165 sq.Lock() 1166 if _, ok := sq.githubE2EQueue[*obj.Issue.Number]; !ok { 1167 atomic.AddInt32(&sq.prsAdded, 1) 1168 added = true 1169 } 1170 // Add this most-recent object in place of the existing object. It will 1171 // have more up2date information. Even though we explicitly refresh the 1172 // PR information before do anything with it, this allow things like the 1173 // queue order to change dynamically as labels are added/removed. 1174 sq.githubE2EQueue[*obj.Issue.Number] = obj 1175 sq.Unlock() 1176 if added { 1177 sq.SetMergeStatus(obj, ghE2EQueued) 1178 } 1179 1180 return 1181 } 1182 1183 func (sq *SubmitQueue) deleteQueueItem(obj *github.MungeObject) { 1184 if sq.onQueue(obj) { 1185 atomic.AddInt32(&sq.prsRemoved, 1) 1186 } 1187 delete(sq.githubE2EQueue, *obj.Issue.Number) 1188 } 1189 1190 // If the PR was put in the github e2e queue previously, but now we don't 1191 // think it should be in the e2e queue, remove it. MUST be called with sq.Lock() 1192 // held. 1193 func (sq *SubmitQueue) cleanupOldE2E(obj *github.MungeObject, reason string) { 1194 switch { 1195 case reason == e2eFailure: 1196 case reason == ghE2EQueued: 1197 case reason == ghE2EWaitingStart: 1198 case reason == ghE2ERunning: 1199 // Do nothing 1200 case strings.HasPrefix(reason, ciFailure): 1201 // ciFailure is intersting. If the PR is being actively retested and then the 1202 // time based loop finds the same PR it will try to set ciFailure. We should in fact 1203 // not ever call this function in this case, but if we do call here, log it. 1204 if sq.githubE2ERunning != nil && *sq.githubE2ERunning.Issue.Number == *obj.Issue.Number { 1205 glog.Errorf("Trying to clean up %d due to ciFailure while it is being tested", *obj.Issue.Number) 1206 return 1207 } 1208 fallthrough 1209 default: 1210 if sq.githubE2ERunning != nil && *sq.githubE2ERunning.Issue.Number == *obj.Issue.Number { 1211 sq.githubE2ERunning = nil 1212 } 1213 sq.deleteQueueItem(obj) 1214 } 1215 1216 } 1217 1218 func labelPriority(obj *github.MungeObject) int { 1219 for i, label := range labelPriorities { 1220 if obj.HasLabel(label) { 1221 return i 1222 } 1223 } 1224 return len(labelPriorities) 1225 } 1226 1227 func compareHighPriorityLabels(a *github.MungeObject, b *github.MungeObject) int { 1228 aPrio := labelPriority(a) 1229 bPrio := labelPriority(b) 1230 1231 if aPrio > lastHighPriorityLabel && bPrio > lastHighPriorityLabel { 1232 return 0 1233 } 1234 return aPrio - bPrio 1235 } 1236 1237 func compareLowPriorityLabels(a *github.MungeObject, b *github.MungeObject) int { 1238 aPrio := labelPriority(a) 1239 bPrio := labelPriority(b) 1240 1241 return aPrio - bPrio 1242 } 1243 1244 type queueSorter struct { 1245 queue []*github.MungeObject 1246 labelTimeCache *mungerutil.LabelTimeCache 1247 } 1248 1249 func (s queueSorter) Len() int { return len(s.queue) } 1250 func (s queueSorter) Swap(i, j int) { s.queue[i], s.queue[j] = s.queue[j], s.queue[i] } 1251 1252 // If you update the function PLEASE PLEASE PLEASE also update servePriorityInfo() 1253 func (s queueSorter) Less(i, j int) bool { 1254 a := s.queue[i] 1255 b := s.queue[j] 1256 1257 if c := compareHighPriorityLabels(a, b); c < 0 { 1258 return true 1259 } else if c > 0 { 1260 return false 1261 } 1262 1263 aDue, _ := a.ReleaseMilestoneDue() 1264 bDue, _ := b.ReleaseMilestoneDue() 1265 1266 if aDue.Before(bDue) { 1267 return true 1268 } else if aDue.After(bDue) { 1269 return false 1270 } 1271 1272 if c := compareLowPriorityLabels(a, b); c < 0 { 1273 return true 1274 } else if c > 0 { 1275 return false 1276 } 1277 1278 aTime, aOK := s.labelTimeCache.FirstLabelTime(a) 1279 bTime, bOK := s.labelTimeCache.FirstLabelTime(b) 1280 1281 // Shouldn't really happen since these have been LGTMed to be 1282 // in the queue at all. But just in case, . 1283 if !aOK && bOK { 1284 return false 1285 } else if aOK && !bOK { 1286 return true 1287 } else if !aOK && !bOK { 1288 return false 1289 } 1290 1291 return aTime.Before(bTime) 1292 } 1293 1294 // onQueue just tells if a PR is already on the queue. 1295 // sq.Lock() must be held 1296 func (sq *SubmitQueue) onQueue(obj *github.MungeObject) bool { 1297 for _, queueObj := range sq.githubE2EQueue { 1298 if *queueObj.Issue.Number == *obj.Issue.Number { 1299 return true 1300 } 1301 1302 } 1303 return false 1304 } 1305 1306 // sq.Lock() better held!!! 1307 func (sq *SubmitQueue) orderedE2EQueue() []int { 1308 prs := []*github.MungeObject{} 1309 for _, obj := range sq.githubE2EQueue { 1310 prs = append(prs, obj) 1311 } 1312 sort.Sort(queueSorter{prs, sq.lgtmTimeCache}) 1313 1314 var ordered []int 1315 for _, obj := range prs { 1316 ordered = append(ordered, *obj.Issue.Number) 1317 } 1318 return ordered 1319 } 1320 1321 // handleGithubE2EAndMerge waits for PRs that are ready to re-run the github 1322 // e2e tests, runs the test, and then merges if everything was successful. 1323 func (sq *SubmitQueue) handleGithubE2EAndMerge() { 1324 for { 1325 sq.Lock() 1326 l := len(sq.githubE2EQueue) 1327 sq.Unlock() 1328 // Wait until something is ready to be processed 1329 if l == 0 { 1330 time.Sleep(sq.githubE2EPollTime) 1331 continue 1332 } 1333 1334 obj := sq.selectPullRequest() 1335 if obj == nil { 1336 continue 1337 } 1338 1339 // only critical fixes can be merged if postsubmits are failing 1340 if !sq.e2eStable(false) && !obj.HasLabel(criticalFixLabel) { 1341 time.Sleep(sq.githubE2EPollTime) 1342 continue 1343 } 1344 1345 // re-test and maybe merge 1346 remove := sq.doGithubE2EAndMerge(obj) 1347 if remove { 1348 // remove it from the map after we finish testing 1349 sq.Lock() 1350 if sq.githubE2ERunning != nil { 1351 sq.githubE2ELastPRNum = *sq.githubE2ERunning.Issue.Number 1352 } 1353 sq.githubE2ERunning = nil 1354 sq.deleteQueueItem(obj) 1355 sq.Unlock() 1356 } 1357 } 1358 } 1359 1360 func (sq *SubmitQueue) mergePullRequest(obj *github.MungeObject, msg, extra string) bool { 1361 isMaster, _ := obj.IsForBranch("master") 1362 if isMaster { 1363 sq.opts.Lock() 1364 if sq.MergeToMasterMessage != "" { 1365 extra = extra + ". " + sq.MergeToMasterMessage 1366 } 1367 sq.opts.Unlock() 1368 } 1369 ok := obj.MergePR("submit-queue" + extra) 1370 if !ok { 1371 return ok 1372 } 1373 sq.SetMergeStatus(obj, msg) 1374 sq.updateMergeRate() 1375 return true 1376 } 1377 1378 func (sq *SubmitQueue) selectPullRequest() *github.MungeObject { 1379 if sq.interruptedObj != nil { 1380 return sq.interruptedObj.obj 1381 } 1382 sq.Lock() 1383 defer sq.Unlock() 1384 if len(sq.githubE2EQueue) == 0 { 1385 return nil 1386 } 1387 keys := sq.orderedE2EQueue() 1388 obj := sq.githubE2EQueue[keys[0]] 1389 if sq.githubE2ERunning != nil { 1390 sq.githubE2ELastPRNum = *sq.githubE2ERunning.Issue.Number 1391 } 1392 sq.githubE2ERunning = obj 1393 1394 return obj 1395 } 1396 1397 func (interruptedObj *submitQueueInterruptedObject) hasSHAChanged() bool { 1398 headSHA, baseRef, gotHeadSHA := interruptedObj.obj.GetHeadAndBase() 1399 if !gotHeadSHA { 1400 return true 1401 } 1402 1403 baseSHA, gotBaseSHA := interruptedObj.obj.GetSHAFromRef(baseRef) 1404 if !gotBaseSHA { 1405 return true 1406 } 1407 1408 return interruptedObj.interruptedMergeBaseSHA != baseSHA || 1409 interruptedObj.interruptedMergeHeadSHA != headSHA 1410 } 1411 1412 func newInterruptedObject(obj *github.MungeObject) *submitQueueInterruptedObject { 1413 if headSHA, baseRef, gotHeadSHA := obj.GetHeadAndBase(); !gotHeadSHA { 1414 return nil 1415 } else if baseSHA, gotBaseSHA := obj.GetSHAFromRef(baseRef); !gotBaseSHA { 1416 return nil 1417 } else { 1418 return &submitQueueInterruptedObject{obj, headSHA, baseSHA} 1419 } 1420 } 1421 1422 // Returns true if we can discard the PR from the queue, false if we must keep it for later. 1423 // If you modify this, consider modifying doBatchMerge too. 1424 func (sq *SubmitQueue) doGithubE2EAndMerge(obj *github.MungeObject) bool { 1425 interruptedObj := sq.interruptedObj 1426 sq.interruptedObj = nil 1427 1428 ok := obj.Refresh() 1429 if !ok { 1430 glog.Errorf("%d: unknown err", *obj.Issue.Number) 1431 sq.SetMergeStatus(obj, unknown) 1432 return true 1433 } 1434 1435 if !sq.validForMerge(obj) { 1436 return true 1437 } 1438 1439 if obj.HasLabel(retestNotRequiredLabel) || obj.HasLabel(retestNotRequiredDocsOnlyLabel) { 1440 atomic.AddInt32(&sq.instantMerges, 1) 1441 sq.mergePullRequest(obj, mergedSkippedRetest, "") 1442 return true 1443 } 1444 1445 sha, _, ok := obj.GetHeadAndBase() 1446 if !ok { 1447 glog.Errorf("%d: Unable to get SHA", *obj.Issue.Number) 1448 sq.SetMergeStatus(obj, unknown) 1449 return true 1450 } 1451 if interruptedObj != nil { 1452 if interruptedObj.hasSHAChanged() { 1453 // This PR will have to be rested. 1454 // Make sure we don't have higher priority first. 1455 return false 1456 } 1457 glog.Infof("Skipping retest since head and base sha match previous attempt!") 1458 atomic.AddInt32(&sq.retestsAvoided, 1) 1459 } else { 1460 if sq.retestPR(obj) { 1461 return true 1462 } 1463 1464 ok := obj.Refresh() 1465 if !ok { 1466 sq.SetMergeStatus(obj, unknown) 1467 return true 1468 } 1469 } 1470 1471 sq.mergeLock.Lock() 1472 defer sq.mergeLock.Unlock() 1473 1474 // We shouldn't merge if it's not valid anymore 1475 if !sq.validForMerge(obj) { 1476 glog.Errorf("%d: Not mergeable anymore. Do not merge.", *obj.Issue.Number) 1477 return true 1478 } 1479 1480 if newSha, _, ok := obj.GetHeadAndBase(); !ok { 1481 glog.Errorf("%d: Unable to get SHA", *obj.Issue.Number) 1482 sq.SetMergeStatus(obj, unknown) 1483 return true 1484 } else if newSha != sha { 1485 glog.Errorf("%d: Changed while running the test. Do not merge.", *obj.Issue.Number) 1486 sq.SetMergeStatus(obj, headCommitChanged) 1487 return false 1488 } 1489 1490 if !sq.e2eStable(true) && !obj.HasLabel(criticalFixLabel) { 1491 if sq.validForMerge(obj) { 1492 sq.interruptedObj = newInterruptedObject(obj) 1493 } 1494 sq.SetMergeStatus(obj, e2eFailure) 1495 return true 1496 } 1497 1498 sq.mergePullRequest(obj, merged, "") 1499 return true 1500 } 1501 1502 // Returns true if merge status changes, and false otherwise. 1503 func (sq *SubmitQueue) retestPR(obj *github.MungeObject) bool { 1504 sq.opts.Lock() 1505 retestContexts := mungeopts.RequiredContexts.Retest 1506 sq.opts.Unlock() 1507 1508 if len(retestContexts) == 0 { 1509 return false 1510 } 1511 1512 if err := obj.WriteComment(newRetestBody); err != nil { 1513 glog.Errorf("%d: unknown err: %v", *obj.Issue.Number, err) 1514 sq.SetMergeStatus(obj, unknown) 1515 return true 1516 } 1517 1518 // Wait for the retest to start 1519 sq.SetMergeStatus(obj, ghE2EWaitingStart) 1520 atomic.AddInt32(&sq.prsTested, 1) 1521 sq.opts.Lock() 1522 prMaxWaitTime := mungeopts.PRMaxWaitTime 1523 sq.opts.Unlock() 1524 done := obj.WaitForPending(retestContexts, prMaxWaitTime) 1525 if !done { 1526 sq.SetMergeStatus(obj, fmt.Sprintf("Timed out waiting for PR %d to start testing", obj.Number())) 1527 return true 1528 } 1529 1530 // Wait for the status to go back to something other than pending 1531 sq.SetMergeStatus(obj, ghE2ERunning) 1532 done = obj.WaitForNotPending(retestContexts, prMaxWaitTime) 1533 if !done { 1534 sq.SetMergeStatus(obj, fmt.Sprintf("Timed out waiting for PR %d to finish testing", obj.Number())) 1535 return true 1536 } 1537 1538 // Check if the thing we care about is success 1539 if success, ok := obj.IsStatusSuccess(retestContexts); !success || !ok { 1540 sq.SetMergeStatus(obj, ghE2EFailed) 1541 return true 1542 } 1543 1544 // no action taken. 1545 return false 1546 } 1547 1548 func (sq *SubmitQueue) serve(data []byte, res http.ResponseWriter, req *http.Request) { 1549 if data == nil { 1550 res.Header().Set("Content-type", "text/plain") 1551 res.WriteHeader(http.StatusInternalServerError) 1552 } else { 1553 res.Header().Set("Content-type", "application/json") 1554 res.WriteHeader(http.StatusOK) 1555 res.Write(data) 1556 } 1557 } 1558 1559 func (sq *SubmitQueue) serveHistory(res http.ResponseWriter, req *http.Request) { 1560 data := sq.getQueueHistory() 1561 sq.serve(data, res, req) 1562 } 1563 1564 func (sq *SubmitQueue) servePRs(res http.ResponseWriter, req *http.Request) { 1565 data := sq.getQueueStatus() 1566 sq.serve(data, res, req) 1567 } 1568 1569 func (sq *SubmitQueue) serveGithubE2EStatus(res http.ResponseWriter, req *http.Request) { 1570 data := sq.getGithubE2EStatus() 1571 sq.serve(data, res, req) 1572 } 1573 1574 func (sq *SubmitQueue) serveCIStatus(res http.ResponseWriter, req *http.Request) { 1575 sq.Lock() 1576 data := sq.marshal(sq.ciStatus) 1577 sq.Unlock() 1578 sq.serve(data, res, req) 1579 } 1580 1581 func (sq *SubmitQueue) serveHealth(res http.ResponseWriter, req *http.Request) { 1582 sq.Lock() 1583 data := sq.marshal(sq.health) 1584 sq.Unlock() 1585 sq.serve(data, res, req) 1586 } 1587 1588 func (sq *SubmitQueue) serveSQStats(res http.ResponseWriter, req *http.Request) { 1589 data := submitQueueStats{ 1590 Added: int(atomic.LoadInt32(&sq.prsAdded)), 1591 FlakesIgnored: int(atomic.LoadInt32(&sq.flakesIgnored)), 1592 Initialized: atomic.LoadInt32(&sq.loopStarts) > 1, 1593 InstantMerges: int(atomic.LoadInt32(&sq.instantMerges)), 1594 BatchMerges: int(atomic.LoadInt32(&sq.batchMerges)), 1595 LastMergeTime: sq.lastMergeTime, 1596 MergeRate: sq.calcMergeRateWithTail(), 1597 MergesSinceRestart: int(atomic.LoadInt32(&sq.totalMerges)), 1598 Removed: int(atomic.LoadInt32(&sq.prsRemoved)), 1599 RetestsAvoided: int(atomic.LoadInt32(&sq.retestsAvoided)), 1600 StartTime: sq.startTime, 1601 Tested: int(atomic.LoadInt32(&sq.prsTested)), 1602 } 1603 sq.serve(sq.marshal(data), res, req) 1604 } 1605 1606 func (sq *SubmitQueue) serveFlakes(res http.ResponseWriter, req *http.Request) { 1607 data := sq.e2e.Flakes() 1608 sq.serve(mungerutil.PrettyMarshal(data), res, req) 1609 } 1610 1611 func (sq *SubmitQueue) serveMetadata(res http.ResponseWriter, req *http.Request) { 1612 sq.Lock() 1613 data := sq.marshal(sq.Metadata) 1614 sq.Unlock() 1615 sq.serve(data, res, req) 1616 } 1617 1618 func (sq *SubmitQueue) serveBatch(res http.ResponseWriter, req *http.Request) { 1619 sq.serve(sq.marshal(sq.batchStatus), res, req) 1620 } 1621 1622 func (sq *SubmitQueue) serveMergeInfo(res http.ResponseWriter, req *http.Request) { 1623 // Lock to get options since we are not running in the main goroutine. 1624 sq.opts.Lock() 1625 doNotMergeMilestones := sq.DoNotMergeMilestones 1626 additionalLabels := sq.AdditionalRequiredLabels 1627 gateApproved := sq.GateApproved 1628 gateCLA := sq.GateCLA 1629 mergeContexts := mungeopts.RequiredContexts.Merge 1630 retestContexts := mungeopts.RequiredContexts.Retest 1631 sq.opts.Unlock() 1632 1633 res.Header().Set("Content-type", "text/plain") 1634 res.WriteHeader(http.StatusOK) 1635 var out bytes.Buffer 1636 out.WriteString("PRs must meet the following set of conditions to be considered for automatic merging by the submit queue.") 1637 out.WriteString("<ol>") 1638 if gateCLA { 1639 out.WriteString(fmt.Sprintf("<li>The PR must have the label %q or %q </li>", cncfClaYesLabel, claHumanLabel)) 1640 } 1641 out.WriteString("<li>The PR must be mergeable. aka cannot need a rebase</li>") 1642 if len(mergeContexts) > 0 || len(retestContexts) > 0 { 1643 out.WriteString("<li>All of the following github statuses must be green") 1644 out.WriteString("<ul>") 1645 for _, context := range mergeContexts { 1646 out.WriteString(fmt.Sprintf("<li>%s</li>", context)) 1647 } 1648 for _, context := range retestContexts { 1649 out.WriteString(fmt.Sprintf("<li>%s</li>", context)) 1650 } 1651 out.WriteString("</ul>") 1652 } 1653 out.WriteString(fmt.Sprintf("<li>The PR cannot have any of the following milestones: %q</li>", doNotMergeMilestones)) 1654 out.WriteString(fmt.Sprintf(`<li>The PR must have the %q label</li>`, lgtmLabel)) 1655 out.WriteString(fmt.Sprintf("<li>The PR must not have been updated since the %q label was applied</li>", lgtmLabel)) 1656 if gateApproved { 1657 out.WriteString(fmt.Sprintf(`<li>The PR must have the %q label</li>`, approvedLabel)) 1658 } 1659 if len(additionalLabels) > 0 { 1660 out.WriteString(fmt.Sprintf(`<li>The PR must have the following labels: %q</li>`, additionalLabels)) 1661 } 1662 out.WriteString(`<li>The PR must not have the any labels starting with "do-not-merge"</li>`) 1663 out.WriteString(`</ol><br>`) 1664 out.WriteString("The PR can then be queued to re-test before merge. Once it reaches the top of the queue all of the above conditions must be true but so must the following:") 1665 out.WriteString("<ol>") 1666 if len(retestContexts) > 0 { 1667 out.WriteString("<li>All of the following tests must pass a second time") 1668 out.WriteString("<ul>") 1669 for _, context := range retestContexts { 1670 out.WriteString(fmt.Sprintf("<li>%s</li>", context)) 1671 } 1672 out.WriteString("</ul>") 1673 out.WriteString(fmt.Sprintf("Unless the %q or %q label is present</li>", retestNotRequiredLabel, retestNotRequiredDocsOnlyLabel)) 1674 } 1675 out.WriteString("</ol>") 1676 out.WriteString("And then the PR will be merged!!") 1677 res.Write(out.Bytes()) 1678 } 1679 1680 func writeLabel(label string, res http.ResponseWriter) { 1681 out := fmt.Sprintf(` <li>%q label 1682 <ul> 1683 <li>A PR with %q will come next</li> 1684 </ul> 1685 </li> 1686 `, label, label) 1687 res.Write([]byte(out)) 1688 } 1689 1690 func (sq *SubmitQueue) servePriorityInfo(res http.ResponseWriter, req *http.Request) { 1691 res.Header().Set("Content-type", "text/plain") 1692 res.WriteHeader(http.StatusOK) 1693 res.Write([]byte(`The merge queue is sorted by the following. If there is a tie in any test the next test will be used. 1694 <ol> 1695 <li>'` + criticalFixLabel + `' label 1696 <ul> 1697 <li>A PR with '` + criticalFixLabel + `' will come first</li> 1698 <li>A PR with '` + criticalFixLabel + `' will merge even if the e2e tests are blocked</li> 1699 </ul> 1700 </li> 1701 `)) 1702 for i := 1; i <= lastHighPriorityLabel; i++ { 1703 writeLabel(labelPriorities[i], res) 1704 } 1705 res.Write([]byte(` <li>Release milestone due date 1706 <ul> 1707 <li>Release milestones are of the form vX.Y where X and Y are integers</li> 1708 <li>The release milestore must have a due date set to affect queue order</li> 1709 <li>Other milestones are ignored</li> 1710 </ul> 1711 </li> 1712 `)) 1713 for i := lastHighPriorityLabel + 1; i < len(labelPriorities); i++ { 1714 writeLabel(labelPriorities[i], res) 1715 } 1716 res.Write([]byte(` <li>First time at which the LGTM label was applied. 1717 <ul> 1718 <li>This means all PRs start at the bottom of the queue (within their priority and milestone bands, of course) and progress towards the top.</li> 1719 </ul> 1720 </li> 1721 </ol> `)) 1722 } 1723 1724 func (sq *SubmitQueue) getHealthSVG() []byte { 1725 sq.Lock() 1726 defer sq.Unlock() 1727 blocked := false 1728 blockingJobs := make([]string, 0) 1729 blocked = !sq.health.MergePossibleNow 1730 status := "running" 1731 color := "brightgreen" 1732 if blocked { 1733 status = "blocked" 1734 color = "red" 1735 for job, status := range sq.e2e.GetBuildStatus() { 1736 if status.Status == "Not Stable" { 1737 job = strings.Replace(job, "kubernetes-", "", -1) 1738 blockingJobs = append(blockingJobs, job) 1739 } 1740 } 1741 sort.Strings(blockingJobs) 1742 if len(blockingJobs) > 3 { 1743 blockingJobs = append(blockingJobs[:3], "...") 1744 } 1745 if len(blockingJobs) > 0 { 1746 status += " by " + strings.Join(blockingJobs, ", ") 1747 } 1748 } 1749 return shield.Make("queue", status, color) 1750 } 1751 1752 func (sq *SubmitQueue) serveHealthSVG(res http.ResponseWriter, req *http.Request) { 1753 res.Header().Set("Content-type", "image/svg+xml") 1754 res.Header().Set("Cache-Control", "max-age=60") 1755 res.WriteHeader(http.StatusOK) 1756 res.Write(sq.getHealthSVG()) 1757 } 1758 1759 func (sq *SubmitQueue) isStaleIssueComment(obj *github.MungeObject, comment *githubapi.IssueComment) bool { 1760 if !obj.IsRobot(comment.User) { 1761 return false 1762 } 1763 if *comment.Body != newRetestBody { 1764 return false 1765 } 1766 stale := commentBeforeLastCI(obj, comment, mungeopts.RequiredContexts.Retest) 1767 if stale { 1768 glog.V(6).Infof("Found stale SubmitQueue safe to merge comment") 1769 } 1770 return stale 1771 } 1772 1773 // StaleIssueComments returns a slice of stale issue comments. 1774 func (sq *SubmitQueue) StaleIssueComments(obj *github.MungeObject, comments []*githubapi.IssueComment) []*githubapi.IssueComment { 1775 return forEachCommentTest(obj, comments, sq.isStaleIssueComment) 1776 }