github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/cmd/controller/app/app.go

package app

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"net"
	"net/http"
	_ "net/http/pprof" //nolint:gosec // TODO: Fix this, should not use default pprof.
	"time"

	kubepb "github.com/castai/kvisor/api/v1/kube"
	castaipb "github.com/castai/kvisor/api/v1/runtime"
	"github.com/castai/kvisor/cmd/controller/kube"
	"github.com/castai/kvisor/cmd/controller/state"
	"github.com/castai/kvisor/cmd/controller/state/delta"
	"github.com/castai/kvisor/cmd/controller/state/imagescan"
	"github.com/castai/kvisor/cmd/controller/state/kubebench"
	"github.com/castai/kvisor/cmd/controller/state/kubelinter"
	"github.com/castai/kvisor/pkg/blobscache"
	"github.com/castai/kvisor/pkg/castai"
	"github.com/castai/kvisor/pkg/logging"
	"github.com/go-playground/validator/v10"
	"github.com/grafana/pyroscope-go"
	"github.com/labstack/echo/v4"
	"github.com/labstack/echo/v4/middleware"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"github.com/samber/lo"
	"golang.org/x/sync/errgroup"
	"golang.org/x/time/rate"
	"google.golang.org/grpc"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
)

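// Config is the top-level configuration for the kvisor controller.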
type Config struct {
	// Logging configuration.
	LogLevel        string
	LogRateInterval time.Duration
	LogRateBurst    int

	// Built binary version.
	Version      string
	ChartVersion string

	// Current running pod metadata.
	PodNamespace string `validate:"required"`
	PodName      string `validate:"required"`

	// HTTPListenPort is the port on which the internal HTTP server listens.
	HTTPListenPort        int `validate:"required"`
	MetricsHTTPListenPort int
	KubeServerListenPort  int `validate:"required"`

	// PyroscopeAddr is an optional Pyroscope server address to send profiles to.
	PyroscopeAddr string

	CastaiController state.CastaiConfig
	CastaiEnv        castai.Config
	ImageScan        imagescan.Config
	Linter           kubelinter.Config
	KubeBench        kubebench.Config
	Delta            delta.Config
	JobsCleanup      state.JobsCleanupConfig
	AgentConfig      AgentConfig
}

type AgentConfig struct {
	Enabled bool
}

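// Proto converts the configuration into its protobuf representation.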
func (c Config) Proto() *castaipb.ControllerConfig {
	return &castaipb.ControllerConfig{
		LogLevel:              c.LogLevel,
		LogRateInterval:       c.LogRateInterval.String(),
		LogRateBurst:          int32(c.LogRateBurst),
		Version:               c.Version,
		ChartVersion:          c.ChartVersion,
		PodNamespace:          c.PodNamespace,
		PodName:               c.PodName,
		HttpListenPort:        int32(c.HTTPListenPort),
		MetricsHttpListenPort: int32(c.MetricsHTTPListenPort),
		PyroscopeAddr:         c.PyroscopeAddr,
		CastaiController: &castaipb.CastaiControllerConfig{
			RemoteConfigSyncDuration: c.CastaiController.RemoteConfigSyncDuration.String(),
		},
		CastaiEnv: &castaipb.CastaiConfig{
			ClusterId:   c.CastaiEnv.ClusterID,
			ApiGrpcAddr: c.CastaiEnv.APIGrpcAddr,
			Insecure:    c.CastaiEnv.Insecure,
		},
		ImageScan: &castaipb.ImageScanConfig{
			Enabled:                   c.ImageScan.Enabled,
			CastaiSecretRefName:       c.ImageScan.CastaiSecretRefName,
			ScanInterval:              c.ImageScan.ScanInterval.String(),
			ScanTimeout:               c.ImageScan.ScanTimeout.String(),
			MaxConcurrentScans:        c.ImageScan.MaxConcurrentScans,
			ScanJobImagePullPolicy:    c.ImageScan.ScanJobImagePullPolicy,
			Mode:                      c.ImageScan.Mode,
			CpuRequest:                c.ImageScan.CPURequest,
			CpuLimit:                  c.ImageScan.CPULimit,
			MemoryRequest:             c.ImageScan.MemoryRequest,
			MemoryLimit:               c.ImageScan.MemoryLimit,
			ProfileEnabled:            c.ImageScan.ProfileEnabled,
			PhlareEnabled:             c.ImageScan.PhlareEnabled,
			PrivateRegistryPullSecret: c.ImageScan.PrivateRegistryPullSecret,
			ServiceAccount:            c.ImageScan.ServiceAccount,
			InitDelay:                 c.ImageScan.InitDelay.String(),
			ImageScanBlobsCacheUrl:    c.ImageScan.ImageScanBlobsCacheURL,
		},
		Linter: &castaipb.LinterConfig{
			Enabled:      c.Linter.Enabled,
			ScanInterval: c.Linter.ScanInterval.String(),
			InitDelay:    c.Linter.InitDelay.String(),
		},
		KubeBench: &castaipb.KubeBenchConfig{
			Enabled:            c.KubeBench.Enabled,
			Force:              c.KubeBench.Force,
			ScanInterval:       c.KubeBench.ScanInterval.String(),
			JobImagePullPolicy: c.KubeBench.JobImagePullPolicy,
			CloudProvider:      c.KubeBench.CloudProvider,
			JobNamespace:       c.KubeBench.JobNamespace,
		},
		Delta: &castaipb.DeltaConfig{
			Enabled:        c.Delta.Enabled,
			Interval:       c.Delta.Interval.String(),
			InitialDeltay:  c.Delta.InitialDeltay.String(),
			SendTimeout:    c.Delta.SendTimeout.String(),
			UseCompression: c.Delta.UseCompression,
		},
		JobsCleanup: &castaipb.JobsCleanupConfig{
			CleanupInterval: c.JobsCleanup.CleanupInterval.String(),
			CleanupJobAge:   c.JobsCleanup.CleanupJobAge.String(),
			Namespace:       c.JobsCleanup.Namespace,
		},
		AgentConfig: &castaipb.ControllerAgentConfig{
			Enabled: c.AgentConfig.Enabled,
		},
	}
}

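// New validates the provided configuration and returns an App. It panics if
// the configuration is invalid.
//
// Typical usage (a sketch, assuming a populated Config and Kubernetes clientset):
//
//	a := app.New(cfg, clientset)
//	if err := a.Run(ctx); err != nil {
//		log.Fatal(err)
//	}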
func New(cfg Config, clientset kubernetes.Interface) *App {
	if err := validator.New().Struct(cfg); err != nil {
		panic(fmt.Errorf("invalid config: %w", err).Error())
	}
	return &App{cfg: cfg, kubeClient: clientset}
}

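// App wires together and runs all kvisor controller components.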
type App struct {
	cfg Config

	kubeClient kubernetes.Interface
}

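// Run starts all enabled controllers and servers and blocks until the given
// context is cancelled, then waits for the started goroutines to shut down.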
func (a *App) Run(ctx context.Context) error {
	cfg := a.cfg
	clientset := a.kubeClient

	logCfg := &logging.Config{
		AddSource: true,
		Level:     logging.MustParseLevel(cfg.LogLevel),
		RateLimiter: logging.RateLimiterConfig{
			Limit:  rate.Every(cfg.LogRateInterval),
			Burst:  cfg.LogRateBurst,
			Inform: true,
		},
	}
	var castaiClient *castai.Client
	if a.cfg.CastaiEnv.Valid() {
		var err error
		castaiClient, err = castai.NewClient(fmt.Sprintf("kvisor-controller/%s", cfg.Version), cfg.CastaiEnv)
		if err != nil {
			return fmt.Errorf("setting up castai api client: %w", err)
		}
		defer castaiClient.Close()
		castaiLogsExporter := castai.NewLogsExporter(castaiClient)
		go castaiLogsExporter.Run(ctx) //nolint:errcheck
		logCfg.Export = logging.ExportConfig{
			ExportFunc: castaiLogsExporter.ExportFunc(),
			MinLevel:   slog.LevelInfo,
		}
	}
	log := logging.New(logCfg)

	log.Infof("running kvisor-controller, cluster_id=%s, grpc_addr=%s, version=%s", cfg.CastaiEnv.ClusterID, cfg.CastaiEnv.APIGrpcAddr, cfg.Version)

	if cfg.PyroscopeAddr != "" {
		withPyroscope(cfg.PyroscopeAddr)
	}

	// Set up the kubernetes client and watcher.
	informersFactory := informers.NewSharedInformerFactory(clientset, 0)
	k8sVersion, err := kube.GetVersion(clientset)
	if err != nil {
		return err
	}
	kubeClient := kube.NewClient(log, cfg.PodName, cfg.PodNamespace, k8sVersion, clientset)
	kubeClient.RegisterHandlers(informersFactory)

	errg, ctx := errgroup.WithContext(ctx)
	errg.Go(func() error {
		return kubeClient.Run(ctx)
	})

	// CAST AI specific logic.
	if castaiClient != nil {
		errg.Go(func() error {
			castaiCtrl := state.NewCastaiController(log, cfg.CastaiController, cfg.Proto(), kubeClient, castaiClient)
			return castaiCtrl.Run(ctx)
		})

		errg.Go(func() error {
			jobsCleanupCtrl := state.NewJobsCleanupController(log, clientset, cfg.JobsCleanup)
			return jobsCleanupCtrl.Run(ctx)
		})

		if cfg.Delta.Enabled {
			deltaCtrl := delta.NewController(log, cfg.Delta, castaiClient.GRPC, kubeClient)
			kubeClient.RegisterKubernetesChangeListener(deltaCtrl)
			errg.Go(func() error {
				return deltaCtrl.Run(ctx)
			})
		}

		if cfg.ImageScan.Enabled {
			imageScanner := imagescan.NewImageScanner(clientset, cfg.ImageScan, cfg.PodNamespace)
			imageScanCtrl := imagescan.NewController(log, cfg.ImageScan, imageScanner, castaiClient.GRPC, kubeClient)
			kubeClient.RegisterKubernetesChangeListener(imageScanCtrl)
			errg.Go(func() error {
				return imageScanCtrl.Run(ctx)
			})
		}

		if cfg.Linter.Enabled {
			linter, err := kubelinter.New(lo.Keys(kubelinter.LinterRuleMap))
			if err != nil {
				return err
			}
			linterCtrl := kubelinter.NewController(log, a.cfg.Linter, linter, castaiClient.GRPC)
			kubeClient.RegisterKubernetesChangeListener(linterCtrl)
			errg.Go(func() error {
				return linterCtrl.Run(ctx)
			})
		}

		if cfg.KubeBench.Enabled {
			logsReader := kube.NewPodLogReader(clientset)
			kubeBenchCtrl := kubebench.NewController(log, clientset, a.cfg.KubeBench, castaiClient.GRPC, logsReader, kubeClient, []string{})
			kubeClient.RegisterKubernetesChangeListener(kubeBenchCtrl)
			errg.Go(func() error {
				return kubeBenchCtrl.Run(ctx)
			})
		}
	}

	errg.Go(func() error {
		return a.runHTTPServer(ctx, log)
	})

	errg.Go(func() error {
		return a.runKubeServer(ctx, log, kubeClient)
	})

	if cfg.MetricsHTTPListenPort != 0 {
		errg.Go(func() error {
			return a.runMetricsHTTPServer(ctx, log)
		})
	}

	// Kubernetes informers should start only after update and delete handlers are added.
	informersFactory.Start(ctx.Done())
	informersFactory.WaitForCacheSync(ctx.Done())
	// Pod informers are registered later since they depend on deployments, daemon sets, etc.
	kubeClient.RegisterPodsHandlers(informersFactory)
	informersFactory.Start(ctx.Done())
	informersFactory.WaitForCacheSync(ctx.Done())

	<-ctx.Done()
	return waitWithTimeout(errg, 60*time.Second)
}

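// waitWithTimeout waits for all errgroup goroutines to finish, giving up
// after the given timeout.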
func waitWithTimeout(errg *errgroup.Group, timeout time.Duration) error {
	errc := make(chan error, 1)
	go func() {
		errc <- errg.Wait()
	}()
	select {
	case <-time.After(timeout):
		return errors.New("timeout waiting for shutdown")
	case err := <-errc:
		return err
	}
}

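// runHTTPServer serves the health check endpoint and the image blobs cache
// API until the context is cancelled.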
func (a *App) runHTTPServer(ctx context.Context, log *logging.Logger) error {
	e := echo.New()
	e.HideBanner = true
	e.Debug = false

	e.Use(middleware.Recover())
	e.GET("/healthz", func(c echo.Context) error {
		type res struct {
			Msg string `json:"msg"`
		}
		return c.JSON(http.StatusOK, res{Msg: "Ok"})
	})

	blobsCacheSrv := blobscache.NewServer(log)
	blobsCacheSrv.RegisterHandlers(e)

	srv := http.Server{
		Addr:         fmt.Sprintf(":%d", a.cfg.HTTPListenPort),
		Handler:      e,
		ReadTimeout:  10 * time.Second,
		WriteTimeout: 1 * time.Minute,
	}
	go func() {
		<-ctx.Done()
		log.Info("shutting down http server")
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		if err := srv.Shutdown(ctx); err != nil {
			log.Error(err.Error())
		}
	}()
	log.Infof("running http server, port=%d", a.cfg.HTTPListenPort)
	if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
		return err
	}
	return nil
}

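// runKubeServer exposes cluster state over gRPC until the context is cancelled.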
func (a *App) runKubeServer(ctx context.Context, log *logging.Logger, client *kube.Client) error {
	lis, err := net.Listen("tcp", fmt.Sprintf(":%d", a.cfg.KubeServerListenPort))
	if err != nil {
		return err
	}

	s := grpc.NewServer()
	kubepb.RegisterKubeAPIServer(s, kube.NewServer(client))

	go func() {
		<-ctx.Done()
		log.Info("shutting down kube grpc server")
		s.GracefulStop()
	}()
	log.Infof("running kube server, port=%d", a.cfg.KubeServerListenPort)
	if err := s.Serve(lis); err != nil && !errors.Is(err, grpc.ErrServerStopped) {
		return err
	}
	return nil
}

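// runMetricsHTTPServer serves Prometheus metrics and pprof debug endpoints
// until the context is cancelled.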
func (a *App) runMetricsHTTPServer(ctx context.Context, log *logging.Logger) error {
	e := echo.New()
	e.HideBanner = true
	e.Debug = false

	e.Use(middleware.Recover())
	e.GET("/metrics", echo.WrapHandler(promhttp.Handler()))
	e.GET("/debug/pprof/*item", echo.WrapHandler(http.DefaultServeMux))
	srv := http.Server{
		Addr:         fmt.Sprintf(":%d", a.cfg.MetricsHTTPListenPort),
		Handler:      e,
		ReadTimeout:  10 * time.Second,
		WriteTimeout: 1 * time.Minute,
	}
	go func() {
		<-ctx.Done()
		log.Info("shutting down metrics http server")
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		if err := srv.Shutdown(ctx); err != nil {
			log.Error(err.Error())
		}
	}()
	log.Infof("running metrics server, port=%d", a.cfg.MetricsHTTPListenPort)
	if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
		return err
	}
	return nil
}

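// withPyroscope starts continuous profiling and sends profiles to the given
// Pyroscope server address. It panics if the profiler cannot be started.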
func withPyroscope(addr string) {
	if _, err := pyroscope.Start(pyroscope.Config{
		ApplicationName: "kvisor-controller",
		ServerAddress:   addr,
		ProfileTypes: []pyroscope.ProfileType{
			pyroscope.ProfileCPU,
			pyroscope.ProfileAllocObjects,
			pyroscope.ProfileAllocSpace,
			pyroscope.ProfileInuseObjects,
			pyroscope.ProfileInuseSpace,
			pyroscope.ProfileGoroutines,
		},
	}); err != nil {
		panic(err)
	}
}