github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/servermaster/jobop/backoff.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package jobop
    15  
    16  import (
    17  	"time"
    18  
    19  	"github.com/cenkalti/backoff/v4"
    20  	"github.com/pingcap/log"
    21  	"github.com/pingcap/tiflow/engine/pkg/clock"
    22  	"go.uber.org/zap"
    23  )
    24  
    25  type backoffEventType int32
    26  
    27  const (
    28  	backoffOnline backoffEventType = iota + 1
    29  	backoffOffline
    30  )
    31  
    32  type backoffEvent struct {
    33  	tp backoffEventType
    34  	ts time.Time
    35  }
    36  
    37  // NewJobBackoff creates a new job backoff
    38  func NewJobBackoff(jobID string, clocker clock.Clock, config *BackoffConfig) *JobBackoff {
    39  	errBackoff := backoff.NewExponentialBackOff()
    40  	errBackoff.InitialInterval = config.InitialInterval
    41  	errBackoff.MaxInterval = config.MaxInterval
    42  	errBackoff.Multiplier = config.Multiplier
    43  	// MaxElapsedTime=0 means the backoff never stops, since there is other ways
    44  	// to stop backoff, including continuously failure check, cancel job.
    45  	errBackoff.MaxElapsedTime = 0
    46  	errBackoff.Reset()
    47  
    48  	return &JobBackoff{
    49  		jobID:      jobID,
    50  		clocker:    clocker,
    51  		config:     config,
    52  		errBackoff: errBackoff,
    53  	}
    54  }
    55  
    56  // JobBackoff is a job backoff manager, it recoreds job online and offline events
    57  // and determines whether a job can be re-created based on backoff mechanism.
    58  // The backoff stragegy is as following
    59  //   - Each time a fail event arrives, the backoff time will be move forward by
    60  //     nextBackoff.
    61  //   - If a job is success for more than `resetInterval`, the backoff history will
    62  //     be cleared, and backoff time will be re-calculated.
    63  type JobBackoff struct {
    64  	jobID   string
    65  	clocker clock.Clock
    66  	config  *BackoffConfig
    67  
    68  	events          []backoffEvent
    69  	errBackoff      *backoff.ExponentialBackOff
    70  	backoffInterval time.Duration
    71  }
    72  
    73  // Terminate returns whether job should be terminated.
    74  // It happens when job fails continuously for more than max try times.
    75  func (b *JobBackoff) Terminate() bool {
    76  	if len(b.events) < b.config.MaxTryTime {
    77  		return false
    78  	}
    79  	failCount := 0
    80  	for _, event := range b.events {
    81  		if event.tp == backoffOffline {
    82  			failCount++
    83  		}
    84  	}
    85  	return failCount >= b.config.MaxTryTime
    86  }
    87  
    88  // Allow returns whether new request(create job) is allowd
    89  func (b *JobBackoff) Allow() bool {
    90  	var lastErrorTime time.Time
    91  	for i := len(b.events) - 1; i >= 0; i-- {
    92  		event := b.events[i]
    93  		if event.tp == backoffOffline {
    94  			lastErrorTime = event.ts
    95  			break
    96  		}
    97  	}
    98  	return b.clocker.Since(lastErrorTime) >= b.backoffInterval
    99  }
   100  
   101  // Success is called when a success event happens
   102  func (b *JobBackoff) Success() {
   103  	event := backoffEvent{
   104  		tp: backoffOnline,
   105  		ts: b.clocker.Now(),
   106  	}
   107  	b.addEvent(event)
   108  }
   109  
   110  // Fail is called when a failure event happens
   111  func (b *JobBackoff) Fail() {
   112  	event := backoffEvent{
   113  		tp: backoffOffline,
   114  		ts: b.clocker.Now(),
   115  	}
   116  	b.addEvent(event)
   117  	b.nextBackoff()
   118  }
   119  
   120  // addEvent appends new backoff event into backoffer
   121  func (b *JobBackoff) addEvent(event backoffEvent) {
   122  	// The last event is online and it is earlier than `resetInterval`,
   123  	// reset the backoff
   124  	if len(b.events) > 0 {
   125  		lastEvent := b.events[len(b.events)-1]
   126  		if lastEvent.tp == backoffOnline &&
   127  			b.clocker.Since(lastEvent.ts) >= b.config.ResetInterval {
   128  			b.events = make([]backoffEvent, 0)
   129  			b.resetErrBackoff()
   130  		}
   131  	}
   132  	b.events = append(b.events, event)
   133  }
   134  
   135  func (b *JobBackoff) resetErrBackoff() {
   136  	b.errBackoff.Reset()
   137  	b.backoffInterval = 0
   138  }
   139  
   140  func (b *JobBackoff) nextBackoff() {
   141  	oldInterval := b.backoffInterval
   142  	b.backoffInterval = b.errBackoff.NextBackOff()
   143  	log.Info("job backoff interval is changed",
   144  		zap.String("job-id", b.jobID),
   145  		zap.Duration("old-interval", oldInterval),
   146  		zap.Duration("new-interval", b.backoffInterval),
   147  	)
   148  }