github.com/cloudwego/kitex@v0.9.0/pkg/retry/backup_retryer.go

     1  /*
     2   * Copyright 2021 CloudWeGo Authors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package retry
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"runtime"
    24  	"strconv"
    25  	"strings"
    26  	"sync"
    27  	"sync/atomic"
    28  	"time"
    29  
    30  	"github.com/cloudwego/kitex/pkg/circuitbreak"
    31  	"github.com/cloudwego/kitex/pkg/gofunc"
    32  	"github.com/cloudwego/kitex/pkg/kerrors"
    33  	"github.com/cloudwego/kitex/pkg/klog"
    34  	"github.com/cloudwego/kitex/pkg/rpcinfo"
    35  	"github.com/cloudwego/kitex/pkg/utils"
    36  )
    37  
    38  var errUnexpectedFinish = errors.New("backup request: all retries finished unexpectedly, " +
    39  	"please submit an issue to https://github.com/cloudwego/kitex/issues")
    40  
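        // newBackupRetryer builds a Retryer that sends backup requests according to the given
        // policy. Users normally enable it through client options rather than constructing it
        // directly; a minimal sketch, assuming the NewBackupPolicy helper in this package, the
        // client.WithBackupRequest option, and a generated client package (placeholder names):
        //
        //	bp := retry.NewBackupPolicy(10) // fire a backup request if no response within 10ms
        //	bp.WithMaxRetryTimes(1)         // allow at most one backup request per call
        //	cli, err := echoservice.NewClient("echo", client.WithBackupRequest(bp))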
    41  func newBackupRetryer(policy Policy, cbC *cbContainer) (Retryer, error) {
    42  	br := &backupRetryer{cbContainer: cbC}
    43  	if err := br.UpdatePolicy(policy); err != nil {
    44  		return nil, fmt.Errorf("newBackupRetryer failed, err=%w", err)
    45  	}
    46  	return br, nil
    47  }
    48  
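        // backupRetryer sends backup requests when the original request has not returned
        // within retryDelay and takes whichever response finishes first.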
    49  type backupRetryer struct {
    50  	enable      bool
    51  	policy      *BackupPolicy
    52  	cbContainer *cbContainer
    53  	retryDelay  time.Duration
    54  	sync.RWMutex
    55  	errMsg string
    56  }
    57  
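        // resultWrapper carries the RPCInfo and error of one finished call back to Do.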
    58  type resultWrapper struct {
    59  	ri  rpcinfo.RPCInfo
    60  	err error
    61  }
    62  
    63  // ShouldRetry implements the Retryer interface.
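        // It is consulted before each backup request is fired (see Do) and refuses to retry
        // when the circuit breaker configured in StopPolicy rejects further requests.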
    64  func (r *backupRetryer) ShouldRetry(ctx context.Context, err error, callTimes int, req interface{}, cbKey string) (string, bool) {
    65  	r.RLock()
    66  	defer r.RUnlock()
    67  	if !r.enable {
    68  		return "", false
    69  	}
    70  	if stop, msg := circuitBreakerStop(ctx, r.policy.StopPolicy, r.cbContainer, req, cbKey); stop {
    71  		return msg, false
    72  	}
    73  	return "", true
    74  }
    75  
    76  // AllowRetry implements the Retryer interface.
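        // It reports whether any backup request may be sent for this RPC: retries are refused
        // when the retryer is disabled, when MaxRetryTimes is 0, or when chained retries are
        // stopped (see chainStop).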
    77  func (r *backupRetryer) AllowRetry(ctx context.Context) (string, bool) {
    78  	r.RLock()
    79  	defer r.RUnlock()
    80  	if !r.enable || r.policy.StopPolicy.MaxRetryTimes == 0 {
    81  		return "", false
    82  	}
    83  	if stop, msg := chainStop(ctx, r.policy.StopPolicy); stop {
    84  		return msg, false
    85  	}
    86  	return "", true
    87  }
    88  
    89  // Do implements the Retryer interface.
    90  func (r *backupRetryer) Do(ctx context.Context, rpcCall RPCCallFunc, firstRI rpcinfo.RPCInfo, req interface{}) (lastRI rpcinfo.RPCInfo, recycleRI bool, err error) {
    91  	r.RLock()
    92  	retryTimes := r.policy.StopPolicy.MaxRetryTimes
    93  	retryDelay := r.retryDelay
    94  	r.RUnlock()
    95  	var callTimes int32 = 0
    96  	var callCosts utils.StringBuilder
    97  	callCosts.RawStringBuilder().Grow(32)
    98  	var recordCostDoing int32 = 0
    99  	var abort int32 = 0
   100  	finishedCount := 0
   101  	// Note: the channel's buffer size matters here; it must be no less than the total number of calls, otherwise the channel receive below may block.
   102  	done := make(chan *resultWrapper, retryTimes+1)
   103  	cbKey, _ := r.cbContainer.cbCtl.GetKey(ctx, req)
   104  	timer := time.NewTimer(retryDelay)
   105  	defer func() {
   106  		if panicInfo := recover(); panicInfo != nil {
   107  			err = panicToErr(ctx, panicInfo, firstRI)
   108  		}
   109  		timer.Stop()
   110  	}()
   111  	// Including the first call, the loop runs at most retryTimes + 1 times.
   112  	doCall := true
   113  	for i := 0; ; {
   114  		if doCall {
   115  			doCall = false
   116  			i++
   117  			gofunc.GoFunc(ctx, func() {
   118  				if atomic.LoadInt32(&abort) == 1 {
   119  					return
   120  				}
   121  				var (
   122  					e   error
   123  					cRI rpcinfo.RPCInfo
   124  				)
   125  				defer func() {
   126  					if panicInfo := recover(); panicInfo != nil {
   127  						e = panicToErr(ctx, panicInfo, firstRI)
   128  					}
   129  					done <- &resultWrapper{cRI, e}
   130  				}()
   131  				ct := atomic.AddInt32(&callTimes, 1)
   132  				callStart := time.Now()
   133  				if r.cbContainer.enablePercentageLimit {
   134  					// record stat before call since requests may be slow, making the limiter more accurate
   135  					recordRetryStat(cbKey, r.cbContainer.cbPanel, ct)
   136  				}
   137  				cRI, _, e = rpcCall(ctx, r)
   138  				recordCost(ct, callStart, &recordCostDoing, &callCosts, &abort, e)
   139  				if !r.cbContainer.enablePercentageLimit && r.cbContainer.cbStat {
   140  					circuitbreak.RecordStat(ctx, req, nil, e, cbKey, r.cbContainer.cbCtl, r.cbContainer.cbPanel)
   141  				}
   142  			})
   143  		}
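        		// Wait for whichever happens first:
        		//  - the backup timer fires: schedule one more call if ShouldRetry allows it and
        		//    the retry budget (retryTimes) is not exhausted;
        		//  - a call finishes: return its result, unless it ended with ErrRPCFinish, in
        		//    which case keep waiting for the call that won the race.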
   144  		select {
   145  		case <-timer.C:
   146  			if _, ok := r.ShouldRetry(ctx, nil, i, req, cbKey); ok && i <= retryTimes {
   147  				doCall = true
   148  				timer.Reset(retryDelay)
   149  			}
   150  		case res := <-done:
   151  			if res.err != nil && errors.Is(res.err, kerrors.ErrRPCFinish) {
   152  				// Only one request (goroutine) passes `checkRPCState`; the others skip decoding and
   153  				// return `ErrRPCFinish` to avoid concurrent writes to the response and to save the cost
   154  				// of decoding. We can safely ignore this error and wait for the goroutine that passed.
   155  				if finishedCount++; finishedCount >= retryTimes+1 {
   156  					// But if all requests return this error, it must be a bug; panic preventively to avoid an infinite loop.
   157  					panic(errUnexpectedFinish)
   158  				}
   159  				continue
   160  			}
   161  			atomic.StoreInt32(&abort, 1)
   162  			recordRetryInfo(res.ri, atomic.LoadInt32(&callTimes), callCosts.String())
   163  			return res.ri, false, res.err
   164  		}
   165  	}
   166  }
   167  
   168  // Prepare implements the Retryer interface.
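        // It applies the RetrySameNode option, deciding whether the retry call is sent to the
        // same instance as the previous call.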
   169  func (r *backupRetryer) Prepare(ctx context.Context, prevRI, retryRI rpcinfo.RPCInfo) {
   170  	handleRetryInstance(r.policy.RetrySameNode, prevRI, retryRI)
   171  }
   172  
   173  // UpdatePolicy implements the Retryer interface.
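        // A valid update must carry Type == BackupType and a BackupPolicy with a non-zero
        // RetryDelayMS and MaxRetryTimes within [0, maxBackupRetryTimes]. A minimal sketch,
        // assuming the NewBackupPolicy constructor defined elsewhere in this package:
        //
        //	p := Policy{Enable: true, Type: BackupType, BackupPolicy: NewBackupPolicy(5)} // 5ms delay
        //	err := retryer.UpdatePolicy(p)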
   174  func (r *backupRetryer) UpdatePolicy(rp Policy) (err error) {
   175  	if !rp.Enable {
   176  		r.Lock()
   177  		r.enable = rp.Enable
   178  		r.Unlock()
   179  		return nil
   180  	}
   181  	var errMsg string
   182  	if rp.BackupPolicy == nil || rp.Type != BackupType {
   183  		errMsg = "BackupPolicy is nil or retry type not match, cannot do update in backupRetryer"
   184  		err = errors.New(errMsg)
   185  	}
   186  	if errMsg == "" && (rp.BackupPolicy.RetryDelayMS == 0 || rp.BackupPolicy.StopPolicy.MaxRetryTimes < 0 ||
   187  		rp.BackupPolicy.StopPolicy.MaxRetryTimes > maxBackupRetryTimes) {
   188  		errMsg = "invalid backup request delay duration or retryTimes"
   189  		err = errors.New(errMsg)
   190  	}
   191  	if errMsg == "" {
   192  		if e := checkCBErrorRate(&rp.BackupPolicy.StopPolicy.CBPolicy); e != nil {
   193  			rp.BackupPolicy.StopPolicy.CBPolicy.ErrorRate = defaultCBErrRate
   194  			errMsg = fmt.Sprintf("backupRetryer %s, use default %0.2f", e.Error(), defaultCBErrRate)
   195  			klog.Warnf(errMsg)
   196  		}
   197  	}
   198  
   199  	r.Lock()
   200  	defer r.Unlock()
   201  	r.enable = rp.Enable
   202  	if err != nil {
   203  		r.errMsg = errMsg
   204  		return err
   205  	}
   206  	r.policy = rp.BackupPolicy
   207  	r.retryDelay = time.Duration(rp.BackupPolicy.RetryDelayMS) * time.Millisecond
   208  	return nil
   209  }
   210  
   211  // AppendErrMsgIfNeeded implements the Retryer interface.
   212  func (r *backupRetryer) AppendErrMsgIfNeeded(err error, ri rpcinfo.RPCInfo, msg string) {
   213  	if kerrors.IsTimeoutError(err) {
   214  		// Add an additional reason to the error message when a timeout occurs but the backup request was not sent.
   215  		appendErrMsg(err, msg)
   216  	}
   217  }
   218  
   219  // Dump implements the Retryer interface.
   220  func (r *backupRetryer) Dump() map[string]interface{} {
   221  	r.RLock()
   222  	defer r.RUnlock()
   223  	if r.errMsg != "" {
   224  		return map[string]interface{}{
   225  			"enable":        r.enable,
   226  			"backupRequest": r.policy,
   227  			"errMsg":        r.errMsg,
   228  		}
   229  	}
   230  	return map[string]interface{}{"enable": r.enable, "backupRequest": r.policy}
   231  }
   232  
   233  // Type implements the Retryer interface.
   234  func (r *backupRetryer) Type() Type {
   235  	return BackupType
   236  }
   237  
   238  // recordCost records the cost of a single call; it may be executed concurrently.
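        // The recorded string has the form "1-321,2-199(ignore)", i.e. comma-separated
        // "<callTimes>-<costInMicroseconds>" entries.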
   239  func recordCost(ct int32, start time.Time, recordCostDoing *int32, sb *utils.StringBuilder, abort *int32, err error) {
   240  	if atomic.LoadInt32(abort) == 1 {
   241  		return
   242  	}
   243  	for !atomic.CompareAndSwapInt32(recordCostDoing, 0, 1) {
   244  		runtime.Gosched()
   245  	}
   246  	sb.WithLocked(func(b *strings.Builder) error {
   247  		if b.Len() > 0 {
   248  			b.WriteByte(',')
   249  		}
   250  		b.WriteString(strconv.Itoa(int(ct)))
   251  		b.WriteByte('-')
   252  		b.WriteString(strconv.FormatInt(time.Since(start).Microseconds(), 10))
   253  		if err != nil && errors.Is(err, kerrors.ErrRPCFinish) {
   254  			// ErrRPCFinish means another call has already returned and is decoding the response.
   255  			// Append "(ignore)" to distinguish this entry.
   256  			b.WriteString("(ignore)")
   257  		}
   258  		return nil
   259  	})
   260  	atomic.StoreInt32(recordCostDoing, 0)
   261  }