github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/cmd/controller/state/castai_controller.go (about)

     1  package state
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"sync/atomic"
     8  	"time"
     9  
    10  	castaipb "github.com/castai/kvisor/api/v1/runtime"
    11  	"github.com/castai/kvisor/cmd/controller/kube"
    12  	"github.com/castai/kvisor/pkg/castai"
    13  	"github.com/castai/kvisor/pkg/logging"
    14  	"golang.org/x/sync/errgroup"
    15  )
    16  
    17  type CastaiConfig struct {
    18  	RemoteConfigSyncDuration time.Duration `validate:"required"`
    19  }
    20  
    21  func NewCastaiController(log *logging.Logger, cfg CastaiConfig, appProtoConfig *castaipb.ControllerConfig, kubeClient *kube.Client, castaiClient *castai.Client) *CastaiController {
    22  	if cfg.RemoteConfigSyncDuration == 0 {
    23  		cfg.RemoteConfigSyncDuration = 5 * time.Minute
    24  	}
    25  	return &CastaiController{
    26  		id:                             "castai",
    27  		enabled:                        castaiClient != nil,
    28  		log:                            log.WithField("component", "castai_ctrl"),
    29  		kubeClient:                     kubeClient,
    30  		cfg:                            cfg,
    31  		appProtoConfig:                 appProtoConfig,
    32  		castaiClient:                   castaiClient,
    33  		remoteConfigFetchErrors:        &atomic.Int64{},
    34  		remoteConfigInitialSyncTimeout: 1 * time.Minute,
    35  		remoteConfigRetryWaitDuration:  20 * time.Second,
    36  		removeConfigMaxFailures:        10,
    37  		streamReconnectWaitDuration:    2 * time.Second,
    38  	}
    39  }
    40  
    41  type CastaiController struct {
    42  	id             string
    43  	enabled        bool
    44  	log            *logging.Logger
    45  	kubeClient     *kube.Client
    46  	cfg            CastaiConfig
    47  	castaiClient   *castai.Client
    48  	appProtoConfig *castaipb.ControllerConfig
    49  
    50  	remoteConfigFetchErrors        *atomic.Int64
    51  	removeConfigMaxFailures        int64
    52  	streamReconnectWaitDuration    time.Duration
    53  	remoteConfigRetryWaitDuration  time.Duration
    54  	remoteConfigInitialSyncTimeout time.Duration
    55  }
    56  
    57  func (c *CastaiController) Enabled() bool {
    58  	return c.enabled
    59  }
    60  
    61  func (c *CastaiController) Run(ctx context.Context) error {
    62  	c.log.Info("running")
    63  	defer c.log.Infof("stopping")
    64  
    65  	ctxCtx, cancel := context.WithTimeout(ctx, c.remoteConfigInitialSyncTimeout)
    66  	defer cancel()
    67  
    68  	if err := c.fetchInitialRemoteConfig(ctxCtx); err != nil {
    69  		return fmt.Errorf("fetching initial config: %w", err)
    70  	}
    71  
    72  	errg, ctx := errgroup.WithContext(ctx)
    73  	errg.Go(func() error {
    74  		return c.runRemoteConfigSyncLoop(ctx)
    75  	})
    76  
    77  	return errg.Wait()
    78  }
    79  
    80  func (c *CastaiController) fetchConfig(ctx context.Context, req *castaipb.GetConfigurationRequest) (*castaipb.Configuration, error) {
    81  	resp, err := c.castaiClient.GRPC.GetConfiguration(ctx, req)
    82  	if err != nil {
    83  		return nil, err
    84  	}
    85  	if resp.Config == nil {
    86  		resp.Config = &castaipb.Configuration{}
    87  	}
    88  	return resp.Config, nil
    89  }
    90  
    91  func (c *CastaiController) fetchInitialRemoteConfig(ctx context.Context) error {
    92  	for {
    93  		select {
    94  		case <-ctx.Done():
    95  			return ctx.Err()
    96  		default:
    97  		}
    98  
    99  		cfg, err := c.fetchConfig(ctx, &castaipb.GetConfigurationRequest{
   100  			CurrentConfig: &castaipb.GetConfigurationRequest_Controller{
   101  				Controller: c.appProtoConfig,
   102  			},
   103  		})
   104  		if err != nil {
   105  			c.log.Errorf("fetching initial config: %v", err)
   106  			sleep(ctx, c.remoteConfigRetryWaitDuration)
   107  			continue
   108  		}
   109  		c.updateRemoteConfig(cfg)
   110  		c.log.Info("initial config synced")
   111  		return nil
   112  	}
   113  }
   114  
   115  func (c *CastaiController) runRemoteConfigSyncLoop(ctx context.Context) error {
   116  	ticker := time.NewTicker(c.cfg.RemoteConfigSyncDuration)
   117  	defer ticker.Stop()
   118  
   119  	for {
   120  		select {
   121  		case <-ctx.Done():
   122  			return ctx.Err()
   123  		case <-ticker.C:
   124  			cfg, err := c.fetchConfig(ctx, &castaipb.GetConfigurationRequest{})
   125  			if err != nil {
   126  				if errors.Is(err, context.Canceled) {
   127  					return err
   128  				}
   129  				c.log.Errorf("fetching config: %v", err)
   130  				fetchErrors := c.remoteConfigFetchErrors.Add(1)
   131  				if fetchErrors >= c.removeConfigMaxFailures {
   132  					return fmt.Errorf("maximum %d remote config fetch errors reached", fetchErrors)
   133  				}
   134  				continue
   135  			}
   136  			c.remoteConfigFetchErrors.Store(0)
   137  			c.updateRemoteConfig(cfg)
   138  		}
   139  	}
   140  }
   141  
   142  func (c *CastaiController) updateRemoteConfig(cfg *castaipb.Configuration) {
   143  }