github.com/aporeto-inc/trireme-lib@v10.358.0+incompatible/monitor/internal/k8s/event_retry_handler.go (about)

     1  package k8smonitor
     2  
     3  import (
     4  	"context"
     5  	"time"
     6  
     7  	"go.aporeto.io/enforcerd/internal/extractors/containermetadata"
     8  	"go.uber.org/zap"
     9  )
    10  
    11  var (
    12  	retryWaittimeUnit = time.Second
    13  	retryTimeout      = time.Second * 30
    14  )
    15  
    16  type startEventRetryFunc func(containermetadata.CommonKubernetesContainerMetadata, uint)
    17  
    18  func newStartEventRetryFunc(mainCtx context.Context, extractor containermetadata.CommonContainerMetadataExtractor, startEvent startEventFunc) startEventRetryFunc {
    19  	return func(kmd containermetadata.CommonKubernetesContainerMetadata, retry uint) {
    20  		// we only care about pod sandboxes for restarts
    21  		// make sure that we stick to that
    22  		if kmd.Kind() != containermetadata.PodSandbox {
    23  			zap.L().Debug(
    24  				"K8sMonitor: startEventRetry: this is not a pod sandbox. Aborting retry...",
    25  				zap.Uint("retry", retry),
    26  				zap.String("kind", kmd.Kind().String()),
    27  				zap.String("id", kmd.ID()),
    28  			)
    29  			return
    30  		}
    31  
    32  		// wait before we retry
    33  		waitTime := calculateWaitTime(retry)
    34  		zap.L().Debug(
    35  			"K8sMonitor: startEventRetry: waiting before retry...",
    36  			zap.Uint("retry", retry),
    37  			zap.Duration("waitTime", waitTime),
    38  			zap.String("id", kmd.ID()),
    39  		)
    40  		select {
    41  		case <-mainCtx.Done():
    42  			// no point in continuing if the main context is done
    43  			return
    44  		case <-time.After(waitTime):
    45  		}
    46  
    47  		// check if the sandbox still exists, otherwise we can abort the retries
    48  		if !extractor.Has(containermetadata.NewRuncArguments(containermetadata.StartAction, kmd.ID())) {
    49  			zap.L().Debug(
    50  				"K8sMonitor: startEventRetry: container for start event does not exist any longer. Aborting...",
    51  				zap.Uint("retry", retry),
    52  				zap.String("id", kmd.ID()),
    53  			)
    54  			return
    55  		}
    56  
    57  		// now create a new context and retry
    58  		// the recursion occurs within the startEvent
    59  		ctx, cancel := context.WithTimeout(mainCtx, retryTimeout)
    60  		defer cancel()
    61  		if err := startEvent(ctx, kmd, retry); err != nil {
    62  			zap.L().Error(
    63  				"K8sMonitor: startEventRetry: failed to process start event on retry",
    64  				zap.Uint("retry", retry),
    65  				zap.Error(err),
    66  				zap.String("id", kmd.ID()),
    67  				zap.String("podUID", kmd.PodUID()),
    68  				zap.String("podName", kmd.PodName()),
    69  				zap.String("podNamespace", kmd.PodNamespace()),
    70  			)
    71  		}
    72  	}
    73  }
    74  
    75  // calculateWaitTime calculates a fibonacci style backoff wait time based on the number of retry
    76  // It uses `retryWaittimeUnit` as the base unit for the wait time
    77  func calculateWaitTime(retry uint) time.Duration {
    78  	var n uint
    79  	switch retry {
    80  	case 0:
    81  		n = 0
    82  	case 1:
    83  		n = 1
    84  	case 2:
    85  		n = 1
    86  	case 3:
    87  		n = 2
    88  	case 4:
    89  		n = 3
    90  	case 5:
    91  		n = 5
    92  	case 6:
    93  		n = 8
    94  	case 7:
    95  		n = 13
    96  	case 8:
    97  		n = 21
    98  	case 9:
    99  		n = 34
   100  	default:
   101  		n = 55
   102  	}
   103  	return retryWaittimeUnit * time.Duration(n)
   104  }