github.com/cloudwego/kitex@v0.9.0/pkg/retry/backup_retryer.go (about) 1 /* 2 * Copyright 2021 CloudWeGo Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package retry 18 19 import ( 20 "context" 21 "errors" 22 "fmt" 23 "runtime" 24 "strconv" 25 "strings" 26 "sync" 27 "sync/atomic" 28 "time" 29 30 "github.com/cloudwego/kitex/pkg/circuitbreak" 31 "github.com/cloudwego/kitex/pkg/gofunc" 32 "github.com/cloudwego/kitex/pkg/kerrors" 33 "github.com/cloudwego/kitex/pkg/klog" 34 "github.com/cloudwego/kitex/pkg/rpcinfo" 35 "github.com/cloudwego/kitex/pkg/utils" 36 ) 37 38 var errUnexpectedFinish = errors.New("backup request: all retries finished unexpectedly, " + 39 "please submit an issue to https://github.com/cloudwego/kitex/issues") 40 41 func newBackupRetryer(policy Policy, cbC *cbContainer) (Retryer, error) { 42 br := &backupRetryer{cbContainer: cbC} 43 if err := br.UpdatePolicy(policy); err != nil { 44 return nil, fmt.Errorf("newBackupRetryer failed, err=%w", err) 45 } 46 return br, nil 47 } 48 49 type backupRetryer struct { 50 enable bool 51 policy *BackupPolicy 52 cbContainer *cbContainer 53 retryDelay time.Duration 54 sync.RWMutex 55 errMsg string 56 } 57 58 type resultWrapper struct { 59 ri rpcinfo.RPCInfo 60 err error 61 } 62 63 // ShouldRetry implements the Retryer interface. 64 func (r *backupRetryer) ShouldRetry(ctx context.Context, err error, callTimes int, req interface{}, cbKey string) (string, bool) { 65 r.RLock() 66 defer r.RUnlock() 67 if !r.enable { 68 return "", false 69 } 70 if stop, msg := circuitBreakerStop(ctx, r.policy.StopPolicy, r.cbContainer, req, cbKey); stop { 71 return msg, false 72 } 73 return "", true 74 } 75 76 // AllowRetry implements the Retryer interface. 77 func (r *backupRetryer) AllowRetry(ctx context.Context) (string, bool) { 78 r.RLock() 79 defer r.RUnlock() 80 if !r.enable || r.policy.StopPolicy.MaxRetryTimes == 0 { 81 return "", false 82 } 83 if stop, msg := chainStop(ctx, r.policy.StopPolicy); stop { 84 return msg, false 85 } 86 return "", true 87 } 88 89 // Do implement the Retryer interface. 90 func (r *backupRetryer) Do(ctx context.Context, rpcCall RPCCallFunc, firstRI rpcinfo.RPCInfo, req interface{}) (lastRI rpcinfo.RPCInfo, recycleRI bool, err error) { 91 r.RLock() 92 retryTimes := r.policy.StopPolicy.MaxRetryTimes 93 retryDelay := r.retryDelay 94 r.RUnlock() 95 var callTimes int32 = 0 96 var callCosts utils.StringBuilder 97 callCosts.RawStringBuilder().Grow(32) 98 var recordCostDoing int32 = 0 99 var abort int32 = 0 100 finishedCount := 0 101 // notice: buff num of chan is very important here, it cannot less than call times, or the below chan receive will block 102 done := make(chan *resultWrapper, retryTimes+1) 103 cbKey, _ := r.cbContainer.cbCtl.GetKey(ctx, req) 104 timer := time.NewTimer(retryDelay) 105 defer func() { 106 if panicInfo := recover(); panicInfo != nil { 107 err = panicToErr(ctx, panicInfo, firstRI) 108 } 109 timer.Stop() 110 }() 111 // include first call, max loop is retryTimes + 1 112 doCall := true 113 for i := 0; ; { 114 if doCall { 115 doCall = false 116 i++ 117 gofunc.GoFunc(ctx, func() { 118 if atomic.LoadInt32(&abort) == 1 { 119 return 120 } 121 var ( 122 e error 123 cRI rpcinfo.RPCInfo 124 ) 125 defer func() { 126 if panicInfo := recover(); panicInfo != nil { 127 e = panicToErr(ctx, panicInfo, firstRI) 128 } 129 done <- &resultWrapper{cRI, e} 130 }() 131 ct := atomic.AddInt32(&callTimes, 1) 132 callStart := time.Now() 133 if r.cbContainer.enablePercentageLimit { 134 // record stat before call since requests may be slow, making the limiter more accurate 135 recordRetryStat(cbKey, r.cbContainer.cbPanel, ct) 136 } 137 cRI, _, e = rpcCall(ctx, r) 138 recordCost(ct, callStart, &recordCostDoing, &callCosts, &abort, e) 139 if !r.cbContainer.enablePercentageLimit && r.cbContainer.cbStat { 140 circuitbreak.RecordStat(ctx, req, nil, e, cbKey, r.cbContainer.cbCtl, r.cbContainer.cbPanel) 141 } 142 }) 143 } 144 select { 145 case <-timer.C: 146 if _, ok := r.ShouldRetry(ctx, nil, i, req, cbKey); ok && i <= retryTimes { 147 doCall = true 148 timer.Reset(retryDelay) 149 } 150 case res := <-done: 151 if res.err != nil && errors.Is(res.err, kerrors.ErrRPCFinish) { 152 // There will be only one request (goroutine) pass the `checkRPCState`, others will skip decoding 153 // and return `ErrRPCFinish`, to avoid concurrent write to response and save the cost of decoding. 154 // We can safely ignore this error and wait for the response of the passed goroutine. 155 if finishedCount++; finishedCount >= retryTimes+1 { 156 // But if all requests return this error, it must be a bug, preventive panic to avoid dead loop 157 panic(errUnexpectedFinish) 158 } 159 continue 160 } 161 atomic.StoreInt32(&abort, 1) 162 recordRetryInfo(res.ri, atomic.LoadInt32(&callTimes), callCosts.String()) 163 return res.ri, false, res.err 164 } 165 } 166 } 167 168 // Prepare implements the Retryer interface. 169 func (r *backupRetryer) Prepare(ctx context.Context, prevRI, retryRI rpcinfo.RPCInfo) { 170 handleRetryInstance(r.policy.RetrySameNode, prevRI, retryRI) 171 } 172 173 // UpdatePolicy implements the Retryer interface. 174 func (r *backupRetryer) UpdatePolicy(rp Policy) (err error) { 175 if !rp.Enable { 176 r.Lock() 177 r.enable = rp.Enable 178 r.Unlock() 179 return nil 180 } 181 var errMsg string 182 if rp.BackupPolicy == nil || rp.Type != BackupType { 183 errMsg = "BackupPolicy is nil or retry type not match, cannot do update in backupRetryer" 184 err = errors.New(errMsg) 185 } 186 if errMsg == "" && (rp.BackupPolicy.RetryDelayMS == 0 || rp.BackupPolicy.StopPolicy.MaxRetryTimes < 0 || 187 rp.BackupPolicy.StopPolicy.MaxRetryTimes > maxBackupRetryTimes) { 188 errMsg = "invalid backup request delay duration or retryTimes" 189 err = errors.New(errMsg) 190 } 191 if errMsg == "" { 192 if e := checkCBErrorRate(&rp.BackupPolicy.StopPolicy.CBPolicy); e != nil { 193 rp.BackupPolicy.StopPolicy.CBPolicy.ErrorRate = defaultCBErrRate 194 errMsg = fmt.Sprintf("backupRetryer %s, use default %0.2f", e.Error(), defaultCBErrRate) 195 klog.Warnf(errMsg) 196 } 197 } 198 199 r.Lock() 200 defer r.Unlock() 201 r.enable = rp.Enable 202 if err != nil { 203 r.errMsg = errMsg 204 return err 205 } 206 r.policy = rp.BackupPolicy 207 r.retryDelay = time.Duration(rp.BackupPolicy.RetryDelayMS) * time.Millisecond 208 return nil 209 } 210 211 // AppendErrMsgIfNeeded implements the Retryer interface. 212 func (r *backupRetryer) AppendErrMsgIfNeeded(err error, ri rpcinfo.RPCInfo, msg string) { 213 if kerrors.IsTimeoutError(err) { 214 // Add additional reason to the error message when timeout occurs but the backup request is not sent. 215 appendErrMsg(err, msg) 216 } 217 } 218 219 // Dump implements the Retryer interface. 220 func (r *backupRetryer) Dump() map[string]interface{} { 221 r.RLock() 222 defer r.RUnlock() 223 if r.errMsg != "" { 224 return map[string]interface{}{ 225 "enable": r.enable, 226 "backupRequest": r.policy, 227 "errMsg": r.errMsg, 228 } 229 } 230 return map[string]interface{}{"enable": r.enable, "backupRequest": r.policy} 231 } 232 233 // Type implements the Retryer interface. 234 func (r *backupRetryer) Type() Type { 235 return BackupType 236 } 237 238 // record request cost, it may execute concurrent 239 func recordCost(ct int32, start time.Time, recordCostDoing *int32, sb *utils.StringBuilder, abort *int32, err error) { 240 if atomic.LoadInt32(abort) == 1 { 241 return 242 } 243 for !atomic.CompareAndSwapInt32(recordCostDoing, 0, 1) { 244 runtime.Gosched() 245 } 246 sb.WithLocked(func(b *strings.Builder) error { 247 if b.Len() > 0 { 248 b.WriteByte(',') 249 } 250 b.WriteString(strconv.Itoa(int(ct))) 251 b.WriteByte('-') 252 b.WriteString(strconv.FormatInt(time.Since(start).Microseconds(), 10)) 253 if err != nil && errors.Is(err, kerrors.ErrRPCFinish) { 254 // ErrRPCFinish means previous call returns first but is decoding. 255 // Add ignore to distinguish. 256 b.WriteString("(ignore)") 257 } 258 return nil 259 }) 260 atomic.StoreInt32(recordCostDoing, 0) 261 }