github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/milevadb-server/einsteindb/backoff.go (about) 1 // Copyright 2020 WHTCORPS INC, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package einsteindb 15 16 import ( 17 "context" 18 "fmt" 19 "math" 20 "strings" 21 "sync/atomic" 22 "time" 23 24 "github.com/opentracing/opentracing-go" 25 "github.com/prometheus/client_golang/prometheus" 26 "github.com/whtcorpsinc/errors" 27 "github.com/whtcorpsinc/log" 28 "github.com/whtcorpsinc/milevadb/ekv" 29 "github.com/whtcorpsinc/milevadb/metrics" 30 "github.com/whtcorpsinc/milevadb/soliton/execdetails" 31 "github.com/whtcorpsinc/milevadb/soliton/fastrand" 32 "github.com/whtcorpsinc/milevadb/soliton/logutil" 33 "go.uber.org/zap" 34 "go.uber.org/zap/zapembedded" 35 ) 36 37 const ( 38 // NoJitter makes the backoff sequence strict exponential. 39 NoJitter = 1 + iota 40 // FullJitter applies random factors to strict exponential. 41 FullJitter 42 // EqualJitter is also randomized, but prevents very short sleeps. 43 EqualJitter 44 // DecorrJitter increases the maximum jitter based on the last random value. 45 DecorrJitter 46 ) 47 48 var ( 49 einsteindbBackoffHistogramRPC = metrics.EinsteinDBBackoffHistogram.WithLabelValues("einsteindbRPC") 50 einsteindbBackoffHistogramLock = metrics.EinsteinDBBackoffHistogram.WithLabelValues("txnLock") 51 einsteindbBackoffHistogramLockFast = metrics.EinsteinDBBackoffHistogram.WithLabelValues("einsteindbLockFast") 52 einsteindbBackoffHistogramFIDel = metrics.EinsteinDBBackoffHistogram.WithLabelValues("FIDelRPC") 53 einsteindbBackoffHistogramRegionMiss = metrics.EinsteinDBBackoffHistogram.WithLabelValues("regionMiss") 54 einsteindbBackoffHistogramServerBusy = metrics.EinsteinDBBackoffHistogram.WithLabelValues("serverBusy") 55 einsteindbBackoffHistogramStaleCmd = metrics.EinsteinDBBackoffHistogram.WithLabelValues("staleCommand") 56 einsteindbBackoffHistogramEmpty = metrics.EinsteinDBBackoffHistogram.WithLabelValues("") 57 ) 58 59 func (t backoffType) metric() prometheus.Observer { 60 switch t { 61 case boEinsteinDBRPC: 62 return einsteindbBackoffHistogramRPC 63 case BoTxnLock: 64 return einsteindbBackoffHistogramLock 65 case boTxnLockFast: 66 return einsteindbBackoffHistogramLockFast 67 case BoFIDelRPC: 68 return einsteindbBackoffHistogramFIDel 69 case BoRegionMiss: 70 return einsteindbBackoffHistogramRegionMiss 71 case boServerBusy: 72 return einsteindbBackoffHistogramServerBusy 73 case boStaleCmd: 74 return einsteindbBackoffHistogramStaleCmd 75 } 76 return einsteindbBackoffHistogramEmpty 77 } 78 79 // NewBackoffFn creates a backoff func which implements exponential backoff with 80 // optional jitters. 81 // See http://www.awsarchitectureblog.com/2020/03/backoff.html 82 func NewBackoffFn(base, cap, jitter int) func(ctx context.Context, maxSleepMs int) int { 83 if base < 2 { 84 // Top prevent panic in 'rand.Intn'. 85 base = 2 86 } 87 attempts := 0 88 lastSleep := base 89 return func(ctx context.Context, maxSleepMs int) int { 90 var sleep int 91 switch jitter { 92 case NoJitter: 93 sleep = expo(base, cap, attempts) 94 case FullJitter: 95 v := expo(base, cap, attempts) 96 sleep = int(fastrand.Uint32N(uint32(v))) 97 case EqualJitter: 98 v := expo(base, cap, attempts) 99 sleep = v/2 + int(fastrand.Uint32N(uint32(v/2))) 100 case DecorrJitter: 101 sleep = int(math.Min(float64(cap), float64(base+int(fastrand.Uint32N(uint32(lastSleep*3-base)))))) 102 } 103 logutil.BgLogger().Debug("backoff", 104 zap.Int("base", base), 105 zap.Int("sleep", sleep)) 106 107 realSleep := sleep 108 // when set maxSleepMs >= 0 in `einsteindb.BackoffWithMaxSleep` will force sleep maxSleepMs milliseconds. 109 if maxSleepMs >= 0 && realSleep > maxSleepMs { 110 realSleep = maxSleepMs 111 } 112 select { 113 case <-time.After(time.Duration(realSleep) * time.Millisecond): 114 attempts++ 115 lastSleep = sleep 116 return realSleep 117 case <-ctx.Done(): 118 return 0 119 } 120 } 121 } 122 123 func expo(base, cap, n int) int { 124 return int(math.Min(float64(cap), float64(base)*math.Pow(2.0, float64(n)))) 125 } 126 127 type backoffType int 128 129 // Back off types. 130 const ( 131 boEinsteinDBRPC backoffType = iota 132 BoTxnLock 133 boTxnLockFast 134 BoFIDelRPC 135 BoRegionMiss 136 boServerBusy 137 boTxnNotFound 138 boStaleCmd 139 boMaxTsNotSynced 140 ) 141 142 func (t backoffType) createFn(vars *ekv.Variables) func(context.Context, int) int { 143 if vars.Hook != nil { 144 vars.Hook(t.String(), vars) 145 } 146 switch t { 147 case boEinsteinDBRPC: 148 return NewBackoffFn(100, 2000, EqualJitter) 149 case BoTxnLock: 150 return NewBackoffFn(200, 3000, EqualJitter) 151 case boTxnLockFast: 152 return NewBackoffFn(vars.BackoffLockFast, 3000, EqualJitter) 153 case BoFIDelRPC: 154 return NewBackoffFn(500, 3000, EqualJitter) 155 case BoRegionMiss: 156 // change base time to 2ms, because it may recover soon. 157 return NewBackoffFn(2, 500, NoJitter) 158 case boTxnNotFound: 159 return NewBackoffFn(2, 500, NoJitter) 160 case boServerBusy: 161 return NewBackoffFn(2000, 10000, EqualJitter) 162 case boStaleCmd: 163 return NewBackoffFn(2, 1000, NoJitter) 164 case boMaxTsNotSynced: 165 return NewBackoffFn(2, 500, NoJitter) 166 } 167 return nil 168 } 169 170 func (t backoffType) String() string { 171 switch t { 172 case boEinsteinDBRPC: 173 return "einsteindbRPC" 174 case BoTxnLock: 175 return "txnLock" 176 case boTxnLockFast: 177 return "txnLockFast" 178 case BoFIDelRPC: 179 return "FIDelRPC" 180 case BoRegionMiss: 181 return "regionMiss" 182 case boServerBusy: 183 return "serverBusy" 184 case boStaleCmd: 185 return "staleCommand" 186 case boTxnNotFound: 187 return "txnNotFound" 188 case boMaxTsNotSynced: 189 return "maxTsNotSynced" 190 } 191 return "" 192 } 193 194 func (t backoffType) TError() error { 195 switch t { 196 case boEinsteinDBRPC: 197 return ErrEinsteinDBServerTimeout 198 case BoTxnLock, boTxnLockFast, boTxnNotFound: 199 return ErrResolveLockTimeout 200 case BoFIDelRPC: 201 return ErrFIDelServerTimeout 202 case BoRegionMiss: 203 return ErrRegionUnavailable 204 case boServerBusy: 205 return ErrEinsteinDBServerBusy 206 case boStaleCmd: 207 return ErrEinsteinDBStaleCommand 208 case boMaxTsNotSynced: 209 return ErrEinsteinDBMaxTimestampNotSynced 210 } 211 return ErrUnknown 212 } 213 214 // Maximum total sleep time(in ms) for ekv/cop commands. 215 const ( 216 GetMemberInfoBackoff = 5000 217 copBuildTaskMaxBackoff = 5000 218 tsoMaxBackoff = 15000 219 scannerNextMaxBackoff = 20000 220 batchGetMaxBackoff = 20000 221 copNextMaxBackoff = 20000 222 getMaxBackoff = 20000 223 cleanupMaxBackoff = 20000 224 GcOneRegionMaxBackoff = 20000 225 GcResolveLockMaxBackoff = 100000 226 deleteRangeOneRegionMaxBackoff = 100000 227 rawekvMaxBackoff = 20000 228 splitRegionBackoff = 20000 229 maxSplitRegionsBackoff = 120000 230 scatterRegionBackoff = 20000 231 waitScatterRegionFinishBackoff = 120000 232 locateRegionMaxBackoff = 20000 233 pessimisticLockMaxBackoff = 20000 234 pessimisticRollbackMaxBackoff = 20000 235 ) 236 237 var ( 238 // CommitMaxBackoff is max sleep time of the 'commit' command 239 CommitMaxBackoff = uint64(41000) 240 241 // PrewriteMaxBackoff is max sleep time of the `pre-write` command. 242 PrewriteMaxBackoff = 20000 243 ) 244 245 // Backoffer is a utility for retrying queries. 246 type Backoffer struct { 247 ctx context.Context 248 249 fn map[backoffType]func(context.Context, int) int 250 maxSleep int 251 totalSleep int 252 errors []error 253 types []fmt.Stringer 254 vars *ekv.Variables 255 noop bool 256 257 backoffSleepMS map[backoffType]int 258 backoffTimes map[backoffType]int 259 } 260 261 type txnStartCtxKeyType struct{} 262 263 // txnStartKey is a key for transaction start_ts info in context.Context. 264 var txnStartKey = txnStartCtxKeyType{} 265 266 // NewBackoffer (Deprecated) creates a Backoffer with maximum sleep time(in ms). 267 func NewBackoffer(ctx context.Context, maxSleep int) *Backoffer { 268 return &Backoffer{ 269 ctx: ctx, 270 maxSleep: maxSleep, 271 vars: ekv.DefaultVars, 272 } 273 } 274 275 // NewBackofferWithVars creates a Backoffer with maximum sleep time(in ms) and ekv.Variables. 276 func NewBackofferWithVars(ctx context.Context, maxSleep int, vars *ekv.Variables) *Backoffer { 277 return NewBackoffer(ctx, maxSleep).withVars(vars) 278 } 279 280 // NewNoopBackoff create a Backoffer do nothing just return error directly 281 func NewNoopBackoff(ctx context.Context) *Backoffer { 282 return &Backoffer{ctx: ctx, noop: true} 283 } 284 285 // withVars sets the ekv.Variables to the Backoffer and return it. 286 func (b *Backoffer) withVars(vars *ekv.Variables) *Backoffer { 287 if vars != nil { 288 b.vars = vars 289 } 290 // maxSleep is the max sleep time in millisecond. 291 // When it is multiplied by BackOffWeight, it should not be greater than MaxInt32. 292 if math.MaxInt32/b.vars.BackOffWeight >= b.maxSleep { 293 b.maxSleep *= b.vars.BackOffWeight 294 } 295 return b 296 } 297 298 // Backoff sleeps a while base on the backoffType and records the error message. 299 // It returns a retryable error if total sleep time exceeds maxSleep. 300 func (b *Backoffer) Backoff(typ backoffType, err error) error { 301 if span := opentracing.SpanFromContext(b.ctx); span != nil && span.Tracer() != nil { 302 span1 := span.Tracer().StartSpan(fmt.Sprintf("einsteindb.backoff.%s", typ), opentracing.ChildOf(span.Context())) 303 defer span1.Finish() 304 opentracing.ContextWithSpan(b.ctx, span1) 305 } 306 return b.BackoffWithMaxSleep(typ, -1, err) 307 } 308 309 // BackoffWithMaxSleep sleeps a while base on the backoffType and records the error message 310 // and never sleep more than maxSleepMs for each sleep. 311 func (b *Backoffer) BackoffWithMaxSleep(typ backoffType, maxSleepMs int, err error) error { 312 if strings.Contains(err.Error(), mismatchClusterID) { 313 logutil.BgLogger().Fatal("critical error", zap.Error(err)) 314 } 315 select { 316 case <-b.ctx.Done(): 317 return errors.Trace(err) 318 default: 319 } 320 321 b.errors = append(b.errors, errors.Errorf("%s at %s", err.Error(), time.Now().Format(time.RFC3339Nano))) 322 b.types = append(b.types, typ) 323 if b.noop || (b.maxSleep > 0 && b.totalSleep >= b.maxSleep) { 324 errMsg := fmt.Sprintf("%s backoffer.maxSleep %dms is exceeded, errors:", typ.String(), b.maxSleep) 325 for i, err := range b.errors { 326 // Print only last 3 errors for non-DEBUG log levels. 327 if log.GetLevel() == zapembedded.DebugLevel || i >= len(b.errors)-3 { 328 errMsg += "\n" + err.Error() 329 } 330 } 331 logutil.BgLogger().Warn(errMsg) 332 // Use the first backoff type to generate a MyALLEGROSQL error. 333 return b.types[0].(backoffType).TError() 334 } 335 336 // Lazy initialize. 337 if b.fn == nil { 338 b.fn = make(map[backoffType]func(context.Context, int) int) 339 } 340 f, ok := b.fn[typ] 341 if !ok { 342 f = typ.createFn(b.vars) 343 b.fn[typ] = f 344 } 345 346 realSleep := f(b.ctx, maxSleepMs) 347 typ.metric().Observe(float64(realSleep) / 1000) 348 b.totalSleep += realSleep 349 if b.backoffSleepMS == nil { 350 b.backoffSleepMS = make(map[backoffType]int) 351 } 352 b.backoffSleepMS[typ] += realSleep 353 if b.backoffTimes == nil { 354 b.backoffTimes = make(map[backoffType]int) 355 } 356 b.backoffTimes[typ]++ 357 358 stmtInterDirc := b.ctx.Value(execdetails.StmtInterDircDetailKey) 359 if stmtInterDirc != nil { 360 detail := stmtInterDirc.(*execdetails.StmtInterDircDetails) 361 atomic.AddInt64(&detail.BackoffDuration, int64(realSleep)*int64(time.Millisecond)) 362 atomic.AddInt64(&detail.BackoffCount, 1) 363 } 364 365 if b.vars != nil && b.vars.Killed != nil { 366 if atomic.LoadUint32(b.vars.Killed) == 1 { 367 return ErrQueryInterrupted 368 } 369 } 370 371 var startTs interface{} 372 if ts := b.ctx.Value(txnStartKey); ts != nil { 373 startTs = ts 374 } 375 logutil.Logger(b.ctx).Debug("retry later", 376 zap.Error(err), 377 zap.Int("totalSleep", b.totalSleep), 378 zap.Int("maxSleep", b.maxSleep), 379 zap.Stringer("type", typ), 380 zap.Reflect("txnStartTS", startTs)) 381 return nil 382 } 383 384 func (b *Backoffer) String() string { 385 if b.totalSleep == 0 { 386 return "" 387 } 388 return fmt.Sprintf(" backoff(%dms %v)", b.totalSleep, b.types) 389 } 390 391 // Clone creates a new Backoffer which keeps current Backoffer's sleep time and errors, and shares 392 // current Backoffer's context. 393 func (b *Backoffer) Clone() *Backoffer { 394 return &Backoffer{ 395 ctx: b.ctx, 396 maxSleep: b.maxSleep, 397 totalSleep: b.totalSleep, 398 errors: b.errors, 399 vars: b.vars, 400 } 401 } 402 403 // Fork creates a new Backoffer which keeps current Backoffer's sleep time and errors, and holds 404 // a child context of current Backoffer's context. 405 func (b *Backoffer) Fork() (*Backoffer, context.CancelFunc) { 406 ctx, cancel := context.WithCancel(b.ctx) 407 return &Backoffer{ 408 ctx: ctx, 409 maxSleep: b.maxSleep, 410 totalSleep: b.totalSleep, 411 errors: b.errors, 412 vars: b.vars, 413 }, cancel 414 }