github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/servermaster/jobop/backoff.go (about) 1 // Copyright 2022 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package jobop 15 16 import ( 17 "time" 18 19 "github.com/cenkalti/backoff/v4" 20 "github.com/pingcap/log" 21 "github.com/pingcap/tiflow/engine/pkg/clock" 22 "go.uber.org/zap" 23 ) 24 25 type backoffEventType int32 26 27 const ( 28 backoffOnline backoffEventType = iota + 1 29 backoffOffline 30 ) 31 32 type backoffEvent struct { 33 tp backoffEventType 34 ts time.Time 35 } 36 37 // NewJobBackoff creates a new job backoff 38 func NewJobBackoff(jobID string, clocker clock.Clock, config *BackoffConfig) *JobBackoff { 39 errBackoff := backoff.NewExponentialBackOff() 40 errBackoff.InitialInterval = config.InitialInterval 41 errBackoff.MaxInterval = config.MaxInterval 42 errBackoff.Multiplier = config.Multiplier 43 // MaxElapsedTime=0 means the backoff never stops, since there is other ways 44 // to stop backoff, including continuously failure check, cancel job. 45 errBackoff.MaxElapsedTime = 0 46 errBackoff.Reset() 47 48 return &JobBackoff{ 49 jobID: jobID, 50 clocker: clocker, 51 config: config, 52 errBackoff: errBackoff, 53 } 54 } 55 56 // JobBackoff is a job backoff manager, it recoreds job online and offline events 57 // and determines whether a job can be re-created based on backoff mechanism. 58 // The backoff stragegy is as following 59 // - Each time a fail event arrives, the backoff time will be move forward by 60 // nextBackoff. 61 // - If a job is success for more than `resetInterval`, the backoff history will 62 // be cleared, and backoff time will be re-calculated. 63 type JobBackoff struct { 64 jobID string 65 clocker clock.Clock 66 config *BackoffConfig 67 68 events []backoffEvent 69 errBackoff *backoff.ExponentialBackOff 70 backoffInterval time.Duration 71 } 72 73 // Terminate returns whether job should be terminated. 74 // It happens when job fails continuously for more than max try times. 75 func (b *JobBackoff) Terminate() bool { 76 if len(b.events) < b.config.MaxTryTime { 77 return false 78 } 79 failCount := 0 80 for _, event := range b.events { 81 if event.tp == backoffOffline { 82 failCount++ 83 } 84 } 85 return failCount >= b.config.MaxTryTime 86 } 87 88 // Allow returns whether new request(create job) is allowd 89 func (b *JobBackoff) Allow() bool { 90 var lastErrorTime time.Time 91 for i := len(b.events) - 1; i >= 0; i-- { 92 event := b.events[i] 93 if event.tp == backoffOffline { 94 lastErrorTime = event.ts 95 break 96 } 97 } 98 return b.clocker.Since(lastErrorTime) >= b.backoffInterval 99 } 100 101 // Success is called when a success event happens 102 func (b *JobBackoff) Success() { 103 event := backoffEvent{ 104 tp: backoffOnline, 105 ts: b.clocker.Now(), 106 } 107 b.addEvent(event) 108 } 109 110 // Fail is called when a failure event happens 111 func (b *JobBackoff) Fail() { 112 event := backoffEvent{ 113 tp: backoffOffline, 114 ts: b.clocker.Now(), 115 } 116 b.addEvent(event) 117 b.nextBackoff() 118 } 119 120 // addEvent appends new backoff event into backoffer 121 func (b *JobBackoff) addEvent(event backoffEvent) { 122 // The last event is online and it is earlier than `resetInterval`, 123 // reset the backoff 124 if len(b.events) > 0 { 125 lastEvent := b.events[len(b.events)-1] 126 if lastEvent.tp == backoffOnline && 127 b.clocker.Since(lastEvent.ts) >= b.config.ResetInterval { 128 b.events = make([]backoffEvent, 0) 129 b.resetErrBackoff() 130 } 131 } 132 b.events = append(b.events, event) 133 } 134 135 func (b *JobBackoff) resetErrBackoff() { 136 b.errBackoff.Reset() 137 b.backoffInterval = 0 138 } 139 140 func (b *JobBackoff) nextBackoff() { 141 oldInterval := b.backoffInterval 142 b.backoffInterval = b.errBackoff.NextBackOff() 143 log.Info("job backoff interval is changed", 144 zap.String("job-id", b.jobID), 145 zap.Duration("old-interval", oldInterval), 146 zap.Duration("new-interval", b.backoffInterval), 147 ) 148 }