github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/cmd/controller/app/app.go

package app

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"net"
	"net/http"
	_ "net/http/pprof" //nolint:gosec // TODO: Fix this, should not use default pprof.
	"time"

	kubepb "github.com/castai/kvisor/api/v1/kube"
	castaipb "github.com/castai/kvisor/api/v1/runtime"
	"github.com/castai/kvisor/cmd/controller/kube"
	"github.com/castai/kvisor/cmd/controller/state"
	"github.com/castai/kvisor/cmd/controller/state/delta"
	"github.com/castai/kvisor/cmd/controller/state/imagescan"
	"github.com/castai/kvisor/cmd/controller/state/kubebench"
	"github.com/castai/kvisor/cmd/controller/state/kubelinter"
	"github.com/castai/kvisor/pkg/blobscache"
	"github.com/castai/kvisor/pkg/castai"
	"github.com/castai/kvisor/pkg/logging"
	"github.com/go-playground/validator/v10"
	"github.com/grafana/pyroscope-go"
	"github.com/labstack/echo/v4"
	"github.com/labstack/echo/v4/middleware"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"github.com/samber/lo"
	"golang.org/x/sync/errgroup"
	"golang.org/x/time/rate"
	"google.golang.org/grpc"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
)

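// Config is the top-level configuration for the kvisor controller.
// Fields tagged `validate:"required"` are enforced by New before the app starts.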
type Config struct {
	// Logging configuration.
	LogLevel        string
	LogRateInterval time.Duration
	LogRateBurst    int

	// Built binary version.
	Version      string
	ChartVersion string

	// Current running pod metadata.
	PodNamespace string `validate:"required"`
	PodName      string `validate:"required"`

	// HTTPListenPort is the internal HTTP server's listen port.
	HTTPListenPort        int `validate:"required"`
	MetricsHTTPListenPort int
	KubeServerListenPort  int `validate:"required"`

	// PyroscopeAddr is an optional Pyroscope address to send profiles to.
	PyroscopeAddr string

	CastaiController state.CastaiConfig
	CastaiEnv        castai.Config
	ImageScan        imagescan.Config
	Linter           kubelinter.Config
	KubeBench        kubebench.Config
	Delta            delta.Config
	JobsCleanup      state.JobsCleanupConfig
	AgentConfig      AgentConfig
}

type AgentConfig struct {
	Enabled bool
}

// Proto converts the config into its protobuf representation.
func (c Config) Proto() *castaipb.ControllerConfig {
	return &castaipb.ControllerConfig{
		LogLevel:              c.LogLevel,
		LogRateInterval:       c.LogRateInterval.String(),
		LogRateBurst:          int32(c.LogRateBurst),
		Version:               c.Version,
		ChartVersion:          c.ChartVersion,
		PodNamespace:          c.PodNamespace,
		PodName:               c.PodName,
		HttpListenPort:        int32(c.HTTPListenPort),
		MetricsHttpListenPort: int32(c.MetricsHTTPListenPort),
		PyroscopeAddr:         c.PyroscopeAddr,
		CastaiController: &castaipb.CastaiControllerConfig{
			RemoteConfigSyncDuration: c.CastaiController.RemoteConfigSyncDuration.String(),
		},
		CastaiEnv: &castaipb.CastaiConfig{
			ClusterId:   c.CastaiEnv.ClusterID,
			ApiGrpcAddr: c.CastaiEnv.APIGrpcAddr,
			Insecure:    c.CastaiEnv.Insecure,
		},
		ImageScan: &castaipb.ImageScanConfig{
			Enabled:                   c.ImageScan.Enabled,
			CastaiSecretRefName:       c.ImageScan.CastaiSecretRefName,
			ScanInterval:              c.ImageScan.ScanInterval.String(),
			ScanTimeout:               c.ImageScan.ScanTimeout.String(),
			MaxConcurrentScans:        c.ImageScan.MaxConcurrentScans,
			ScanJobImagePullPolicy:    c.ImageScan.ScanJobImagePullPolicy,
			Mode:                      c.ImageScan.Mode,
			CpuRequest:                c.ImageScan.CPURequest,
			CpuLimit:                  c.ImageScan.CPULimit,
			MemoryRequest:             c.ImageScan.MemoryRequest,
			MemoryLimit:               c.ImageScan.MemoryLimit,
			ProfileEnabled:            c.ImageScan.ProfileEnabled,
			PhlareEnabled:             c.ImageScan.PhlareEnabled,
			PrivateRegistryPullSecret: c.ImageScan.PrivateRegistryPullSecret,
			ServiceAccount:            c.ImageScan.ServiceAccount,
			InitDelay:                 c.ImageScan.InitDelay.String(),
			ImageScanBlobsCacheUrl:    c.ImageScan.ImageScanBlobsCacheURL,
		},
		Linter: &castaipb.LinterConfig{
			Enabled:      c.Linter.Enabled,
			ScanInterval: c.Linter.ScanInterval.String(),
			InitDelay:    c.Linter.InitDelay.String(),
		},
		KubeBench: &castaipb.KubeBenchConfig{
			Enabled:            c.KubeBench.Enabled,
			Force:              c.KubeBench.Force,
			ScanInterval:       c.KubeBench.ScanInterval.String(),
			JobImagePullPolicy: c.KubeBench.JobImagePullPolicy,
			CloudProvider:      c.KubeBench.CloudProvider,
			JobNamespace:       c.KubeBench.JobNamespace,
		},
		Delta: &castaipb.DeltaConfig{
			Enabled:        c.Delta.Enabled,
			Interval:       c.Delta.Interval.String(),
			InitialDeltay:  c.Delta.InitialDeltay.String(),
			SendTimeout:    c.Delta.SendTimeout.String(),
			UseCompression: c.Delta.UseCompression,
		},
		JobsCleanup: &castaipb.JobsCleanupConfig{
			CleanupInterval: c.JobsCleanup.CleanupInterval.String(),
			CleanupJobAge:   c.JobsCleanup.CleanupJobAge.String(),
			Namespace:       c.JobsCleanup.Namespace,
		},
		AgentConfig: &castaipb.ControllerAgentConfig{
			Enabled: c.AgentConfig.Enabled,
		},
	}
}

// New validates the config and returns an App ready to Run.
// It panics if required config fields are missing.
func New(cfg Config, clientset kubernetes.Interface) *App {
	if err := validator.New().Struct(cfg); err != nil {
		panic(fmt.Errorf("invalid config: %w", err).Error())
	}
	return &App{cfg: cfg, kubeClient: clientset}
}

type App struct {
	cfg Config

	kubeClient kubernetes.Interface
}

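// Run wires up and starts all controller components (logging, CAST AI
// controllers, kube watchers and the HTTP/gRPC servers) and blocks until
// ctx is cancelled or any component returns an error.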
func (a *App) Run(ctx context.Context) error {
	cfg := a.cfg
	clientset := a.kubeClient

	logCfg := &logging.Config{
		AddSource: true,
		Level:     logging.MustParseLevel(cfg.LogLevel),
		RateLimiter: logging.RateLimiterConfig{
			Limit:  rate.Every(cfg.LogRateInterval),
			Burst:  cfg.LogRateBurst,
			Inform: true,
		},
	}
	var castaiClient *castai.Client
	if a.cfg.CastaiEnv.Valid() {
		var err error
		castaiClient, err = castai.NewClient(fmt.Sprintf("kvisor-controller/%s", cfg.Version), cfg.CastaiEnv)
		if err != nil {
			return fmt.Errorf("setting up castai api client: %w", err)
		}
		defer castaiClient.Close()
		castaiLogsExporter := castai.NewLogsExporter(castaiClient)
		go castaiLogsExporter.Run(ctx) //nolint:errcheck
		// Export logs at info level and above to CAST AI.
		logCfg.Export = logging.ExportConfig{
			ExportFunc: castaiLogsExporter.ExportFunc(),
			MinLevel:   slog.LevelInfo,
		}
	}
	log := logging.New(logCfg)

	log.Infof("running kvisor-controller, cluster_id=%s, grpc_addr=%s, version=%s", cfg.CastaiEnv.ClusterID, cfg.CastaiEnv.APIGrpcAddr, cfg.Version)

	if cfg.PyroscopeAddr != "" {
		withPyroscope(cfg.PyroscopeAddr)
	}

	// Setup kubernetes client and watcher.
	informersFactory := informers.NewSharedInformerFactory(clientset, 0)
	k8sVersion, err := kube.GetVersion(clientset)
	if err != nil {
		return err
	}
	kubeClient := kube.NewClient(log, cfg.PodName, cfg.PodNamespace, k8sVersion, clientset)
	kubeClient.RegisterHandlers(informersFactory)

	errg, ctx := errgroup.WithContext(ctx)
	errg.Go(func() error {
		return kubeClient.Run(ctx)
	})

	// CAST AI specific logic.
	if castaiClient != nil {
		errg.Go(func() error {
			castaiCtrl := state.NewCastaiController(log, cfg.CastaiController, cfg.Proto(), kubeClient, castaiClient)
			return castaiCtrl.Run(ctx)
		})

		errg.Go(func() error {
			jobsCleanupCtrl := state.NewJobsCleanupController(log, clientset, cfg.JobsCleanup)
			return jobsCleanupCtrl.Run(ctx)
		})

		if cfg.Delta.Enabled {
			deltaCtrl := delta.NewController(log, cfg.Delta, castaiClient.GRPC, kubeClient)
			kubeClient.RegisterKubernetesChangeListener(deltaCtrl)
			errg.Go(func() error {
				return deltaCtrl.Run(ctx)
			})
		}

		if cfg.ImageScan.Enabled {
			imageScanner := imagescan.NewImageScanner(clientset, cfg.ImageScan, cfg.PodNamespace)
			imageScanCtrl := imagescan.NewController(log, cfg.ImageScan, imageScanner, castaiClient.GRPC, kubeClient)
			kubeClient.RegisterKubernetesChangeListener(imageScanCtrl)
			errg.Go(func() error {
				return imageScanCtrl.Run(ctx)
			})
		}

		if cfg.Linter.Enabled {
			linter, err := kubelinter.New(lo.Keys(kubelinter.LinterRuleMap))
			if err != nil {
				return err
			}
			linterCtrl := kubelinter.NewController(log, a.cfg.Linter, linter, castaiClient.GRPC)
			kubeClient.RegisterKubernetesChangeListener(linterCtrl)
			errg.Go(func() error {
				return linterCtrl.Run(ctx)
			})
		}

		if cfg.KubeBench.Enabled {
			logsReader := kube.NewPodLogReader(clientset)
			kubeBenchCtrl := kubebench.NewController(log, clientset, a.cfg.KubeBench, castaiClient.GRPC, logsReader, kubeClient, []string{})
			kubeClient.RegisterKubernetesChangeListener(kubeBenchCtrl)
			errg.Go(func() error {
				return kubeBenchCtrl.Run(ctx)
			})
		}
	}

	errg.Go(func() error {
		return a.runHTTPServer(ctx, log)
	})

	errg.Go(func() error {
		return a.runKubeServer(ctx, log, kubeClient)
	})

	if cfg.MetricsHTTPListenPort != 0 {
		errg.Go(func() error {
			return a.runMetricsHTTPServer(ctx, log)
		})
	}

	// Kubernetes informers should start only after update and delete handlers are registered.
	informersFactory.Start(ctx.Done())
	informersFactory.WaitForCacheSync(ctx.Done())
	// Pod informers are registered later since they depend on deployments, daemon sets, etc.,
	// hence the second Start and WaitForCacheSync.
	kubeClient.RegisterPodsHandlers(informersFactory)
	informersFactory.Start(ctx.Done())
	informersFactory.WaitForCacheSync(ctx.Done())

	<-ctx.Done()
	return waitWithTimeout(errg, 60*time.Second)
}

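// waitWithTimeout waits for all goroutines in the errgroup to finish and
// returns the group's error, or an error if shutdown does not complete
// within the given timeout.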
func waitWithTimeout(errg *errgroup.Group, timeout time.Duration) error {
	errc := make(chan error, 1)
	go func() {
		errc <- errg.Wait()
	}()
	select {
	case <-time.After(timeout):
		return errors.New("timeout waiting for shutdown")
	case err := <-errc:
		return err
	}
}

// runHTTPServer starts the internal HTTP server which exposes the health
// check endpoint and the image blobs cache API.
func (a *App) runHTTPServer(ctx context.Context, log *logging.Logger) error {
	e := echo.New()
	e.HideBanner = true
	e.Debug = false

	e.Use(middleware.Recover())
	e.GET("/healthz", func(c echo.Context) error {
		type res struct {
			Msg string `json:"msg"`
		}
		return c.JSON(http.StatusOK, res{Msg: "Ok"})
	})

	blobsCacheSrv := blobscache.NewServer(log)
	blobsCacheSrv.RegisterHandlers(e)

	srv := http.Server{
		Addr:         fmt.Sprintf(":%d", a.cfg.HTTPListenPort),
		Handler:      e,
		ReadTimeout:  10 * time.Second,
		WriteTimeout: 1 * time.Minute,
	}
	go func() {
		<-ctx.Done()
		log.Info("shutting down http server")
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		if err := srv.Shutdown(ctx); err != nil {
			log.Error(err.Error())
		}
	}()
	log.Infof("running http server, port=%d", a.cfg.HTTPListenPort)
	if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
		return err
	}
	return nil
}

// runKubeServer starts the gRPC server which exposes the kubernetes state
// API (kubepb.KubeAPI).
func (a *App) runKubeServer(ctx context.Context, log *logging.Logger, client *kube.Client) error {
	lis, err := net.Listen("tcp", fmt.Sprintf(":%d", a.cfg.KubeServerListenPort))
	if err != nil {
		return err
	}

	s := grpc.NewServer()
	kubepb.RegisterKubeAPIServer(s, kube.NewServer(client))

	go func() {
		<-ctx.Done()
		log.Info("shutting down kube grpc server")
		s.GracefulStop()
	}()
	log.Infof("running kube server, port=%d", a.cfg.KubeServerListenPort)
	if err := s.Serve(lis); err != nil && !errors.Is(err, grpc.ErrServerStopped) {
		return err
	}
	return nil
}

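// runMetricsHTTPServer starts a separate HTTP server which exposes
// Prometheus metrics and the pprof profiling endpoints.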
func (a *App) runMetricsHTTPServer(ctx context.Context, log *logging.Logger) error {
	e := echo.New()
	e.HideBanner = true
	e.Debug = false

	e.Use(middleware.Recover())
	e.GET("/metrics", echo.WrapHandler(promhttp.Handler()))
	e.GET("/debug/pprof/*item", echo.WrapHandler(http.DefaultServeMux))
	srv := http.Server{
		Addr:         fmt.Sprintf(":%d", a.cfg.MetricsHTTPListenPort),
		Handler:      e,
		ReadTimeout:  10 * time.Second,
		WriteTimeout: 1 * time.Minute,
	}
	go func() {
		<-ctx.Done()
		log.Info("shutting down metrics http server")
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		if err := srv.Shutdown(ctx); err != nil {
			log.Error(err.Error())
		}
	}()
	log.Infof("running metrics server, port=%d", a.cfg.MetricsHTTPListenPort)
	if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
		return err
	}
	return nil
}

// withPyroscope starts continuous profiling and sends profiles to the given
// Pyroscope server address. It panics if profiling cannot be started.
func withPyroscope(addr string) {
	if _, err := pyroscope.Start(pyroscope.Config{
		ApplicationName: "kvisor-controller",
		ServerAddress:   addr,
		ProfileTypes: []pyroscope.ProfileType{
			pyroscope.ProfileCPU,
			pyroscope.ProfileAllocObjects,
			pyroscope.ProfileAllocSpace,
			pyroscope.ProfileInuseObjects,
			pyroscope.ProfileInuseSpace,
			pyroscope.ProfileGoroutines,
		},
	}); err != nil {
		panic(err)
	}
}
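
// Example wiring from a caller (illustrative sketch only; the actual main
// package may differ). A caller builds a Config, creates a Kubernetes
// clientset, then constructs and runs the app:
//
//	clientset, err := kubernetes.NewForConfig(restConfig)
//	if err != nil {
//		return err
//	}
//	return app.New(cfg, clientset).Run(ctx)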