github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/cmd/agent/daemon/app/app.go (about) 1 package app 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "log/slog" 8 "net/http" 9 "net/http/pprof" 10 "os" 11 "runtime" 12 "time" 13 14 "github.com/ClickHouse/clickhouse-go/v2" 15 kubepb "github.com/castai/kvisor/api/v1/kube" 16 castaipb "github.com/castai/kvisor/api/v1/runtime" 17 "github.com/castai/kvisor/cmd/agent/daemon/conntrack" 18 "github.com/castai/kvisor/cmd/agent/daemon/enrichment" 19 "github.com/castai/kvisor/cmd/agent/daemon/netstats" 20 "github.com/castai/kvisor/cmd/agent/daemon/state" 21 "github.com/castai/kvisor/pkg/castai" 22 "github.com/castai/kvisor/pkg/cgroup" 23 "github.com/castai/kvisor/pkg/containers" 24 "github.com/castai/kvisor/pkg/ebpftracer" 25 "github.com/castai/kvisor/pkg/ebpftracer/events" 26 "github.com/castai/kvisor/pkg/ebpftracer/signature" 27 "github.com/castai/kvisor/pkg/ebpftracer/types" 28 "github.com/castai/kvisor/pkg/logging" 29 "github.com/castai/kvisor/pkg/proc" 30 "github.com/go-playground/validator/v10" 31 "github.com/grafana/pyroscope-go" 32 "github.com/prometheus/client_golang/prometheus/promhttp" 33 "golang.org/x/sync/errgroup" 34 "golang.org/x/time/rate" 35 "google.golang.org/grpc" 36 "google.golang.org/grpc/credentials/insecure" 37 ) 38 39 type Config struct { 40 LogLevel string 41 LogRateInterval time.Duration 42 LogRateBurst int 43 SendLogsLevel string 44 Version string 45 BTFPath string 46 PyroscopeAddr string 47 ContainerdSockPath string 48 HostCgroupsDir string 49 MetricsHTTPListenPort int 50 State state.Config 51 EBPFEventsPerCPUBuffer int `validate:"required"` 52 EBPFEventsOutputChanSize int `validate:"required"` 53 EBPFEventsStdioExporterEnabled bool 54 MutedNamespaces []string 55 SignatureEngineConfig signature.SignatureEngineConfig 56 Castai castai.Config 57 EnricherConfig EnricherConfig 58 Netflow NetflowConfig 59 Clickhouse ClickhouseConfig 60 KubeAPIServiceAddr string 61 ExportersQueueSize int `validate:"required"` 62 } 63 64 func (c Config) Proto() *castaipb.AgentConfig { 65 return &castaipb.AgentConfig{ 66 LogLevel: c.LogLevel, 67 LogRateInterval: c.LogRateInterval.String(), 68 LogRateBurst: int32(c.LogRateBurst), 69 SendLogsLevel: c.SendLogsLevel, 70 Version: c.Version, 71 BtfPath: c.BTFPath, 72 PyroscopeAddr: c.PyroscopeAddr, 73 ContainerdSockPath: c.ContainerdSockPath, 74 HostCgroupsDir: c.HostCgroupsDir, 75 MetricsHttpListenPort: int32(c.MetricsHTTPListenPort), 76 State: &castaipb.AgentStateControllerConfig{ 77 ContainerStatsScrapeInterval: c.State.ContainerStatsScrapeInterval.String(), 78 }, 79 EbpfEventsPerCpuBuffer: int32(c.EBPFEventsPerCPUBuffer), 80 EbpfEventsOutputChanSize: int32(c.EBPFEventsOutputChanSize), 81 MutedNamespaces: c.MutedNamespaces, 82 SignatureEngineConfig: &castaipb.SignatureEngineConfig{ 83 InputChanSize: int32(c.SignatureEngineConfig.InputChanSize), 84 OutputChanSize: int32(c.SignatureEngineConfig.OutputChanSize), 85 TtyDetectedSignatureEnabled: c.SignatureEngineConfig.DefaultSignatureConfig.TTYDetectedSignatureEnabled, 86 Socks5DetectedSignatureEnabled: c.SignatureEngineConfig.DefaultSignatureConfig.SOCKS5DetectedSignatureEnabled, 87 Socks5DetectedSignatureConfig: &castaipb.SOCKS5DetectedSignatureConfig{ 88 CacheSize: c.SignatureEngineConfig.DefaultSignatureConfig.SOCKS5DetectedSignatureConfig.CacheSize, 89 }, 90 }, 91 CastaiEnv: &castaipb.CastaiConfig{ 92 ClusterId: c.Castai.ClusterID, 93 ApiGrpcAddr: c.Castai.APIGrpcAddr, 94 Insecure: c.Castai.Insecure, 95 }, 96 EnricherConfig: &castaipb.EnricherConfig{ 97 EnableFileHashEnricher: c.EnricherConfig.EnableFileHashEnricher, 98 }, 99 Netflow: &castaipb.NetflowConfig{ 100 Enabled: c.Netflow.Enabled, 101 SampleSubmitIntervalSeconds: c.Netflow.SampleSubmitIntervalSeconds, 102 }, 103 } 104 } 105 106 type EnricherConfig struct { 107 EnableFileHashEnricher bool 108 } 109 110 type NetflowConfig struct { 111 Enabled bool 112 SampleSubmitIntervalSeconds uint64 113 OutputChanSize int 114 } 115 116 type ClickhouseConfig struct { 117 Addr string 118 Database string 119 Username string 120 Password string 121 } 122 123 func New(cfg *Config) *App { 124 if err := validator.New().Struct(cfg); err != nil { 125 panic(fmt.Errorf("invalid config: %w", err).Error()) 126 } 127 return &App{cfg: cfg} 128 } 129 130 type App struct { 131 cfg *Config 132 } 133 134 func (a *App) Run(ctx context.Context) error { 135 cfg := a.cfg 136 logCfg := &logging.Config{ 137 Level: logging.MustParseLevel(a.cfg.LogLevel), 138 AddSource: true, 139 RateLimiter: logging.RateLimiterConfig{ 140 Limit: rate.Every(a.cfg.LogRateInterval), 141 Burst: a.cfg.LogRateBurst, 142 Inform: true, 143 }, 144 } 145 var log *logging.Logger 146 var exporters *state.Exporters 147 148 // Castai specific spetup if config is valid. 149 if cfg.Castai.Valid() { 150 castaiClient, err := castai.NewClient(fmt.Sprintf("kvisor-agent/%s", cfg.Version), cfg.Castai) 151 if err != nil { 152 return fmt.Errorf("setting up castai api client: %w", err) 153 } 154 if err := a.syncRemoteConfig(ctx, castaiClient); err != nil { 155 return fmt.Errorf("sync remote config: %w", err) 156 } 157 if a.cfg.SendLogsLevel != "" && a.cfg.Castai.Valid() { 158 castaiLogsExporter := castai.NewLogsExporter(castaiClient) 159 go castaiLogsExporter.Run(ctx) //nolint:errcheck 160 161 logCfg.Export = logging.ExportConfig{ 162 ExportFunc: castaiLogsExporter.ExportFunc(), 163 MinLevel: logging.MustParseLevel(a.cfg.SendLogsLevel), 164 } 165 log = logging.New(logCfg) 166 } 167 exporters = state.NewExporters(log) 168 exporters.Events = append(exporters.Events, state.NewCastaiEventsExporter(log, castaiClient, a.cfg.ExportersQueueSize)) 169 exporters.ContainerStats = append(exporters.ContainerStats, state.NewCastaiContainerStatsExporter(log, castaiClient, a.cfg.ExportersQueueSize)) 170 } else { 171 log = logging.New(logCfg) 172 exporters = state.NewExporters(log) 173 } 174 175 kubeAPIServiceConn, err := grpc.Dial( 176 cfg.KubeAPIServiceAddr, 177 grpc.WithTransportCredentials(insecure.NewCredentials()), 178 ) 179 if err != nil { 180 return fmt.Errorf("kube api service grpc server dial: %w", err) 181 } 182 defer kubeAPIServiceConn.Close() 183 kubeAPIServerClient := kubepb.NewKubeAPIClient(kubeAPIServiceConn) 184 185 if cfg.Clickhouse.Addr != "" { 186 storageConn, err := clickhouse.Open(&clickhouse.Options{ 187 Addr: []string{cfg.Clickhouse.Addr}, 188 Auth: clickhouse.Auth{ 189 Database: cfg.Clickhouse.Database, 190 Username: cfg.Clickhouse.Username, 191 Password: cfg.Clickhouse.Password, 192 }, 193 Settings: clickhouse.Settings{ 194 "allow_experimental_object_type": "1", 195 }, 196 MaxOpenConns: 20, 197 }) 198 if err != nil { 199 return err 200 } 201 defer storageConn.Close() 202 203 clickhouseNetflowExporter := state.NewClickhouseNetflowExporter(log, storageConn, a.cfg.ExportersQueueSize) 204 exporters.Netflow = append(exporters.Netflow, clickhouseNetflowExporter) 205 } 206 207 if cfg.EBPFEventsStdioExporterEnabled { 208 exporters.Events = append(exporters.Events, state.NewStdioEventsExporter(log)) 209 } 210 211 if exporters.Empty() { 212 return errors.New("no configured exporters") 213 } 214 215 log.Infof("running kvisor agent, version=%s", a.cfg.Version) 216 defer log.Infof("stopping kvisor agent, version=%s", a.cfg.Version) 217 218 if addr := a.cfg.PyroscopeAddr; addr != "" { 219 withPyroscope(addr) 220 } 221 222 cgroupClient, err := cgroup.NewClient(log, a.cfg.HostCgroupsDir) 223 if err != nil { 224 return err 225 } 226 containersClient, err := containers.NewClient(log, cgroupClient, a.cfg.ContainerdSockPath) 227 if err != nil { 228 return err 229 } 230 ct, err := conntrack.NewClient(log) 231 if err != nil { 232 return fmt.Errorf("conntrack: %w", err) 233 } 234 defer ct.Close() 235 236 activeSignatures, err := signature.DefaultSignatures(log, a.cfg.SignatureEngineConfig.DefaultSignatureConfig) 237 if err != nil { 238 return fmt.Errorf("error while configuring signatures: %w", err) 239 } 240 241 signatureEngine := signature.NewEngine(activeSignatures, log, a.cfg.SignatureEngineConfig) 242 243 procHandler := proc.New() 244 mountNamespacePIDStore, err := getInitializedMountNamespaceStore(procHandler) 245 if err != nil { 246 return fmt.Errorf("mount namespace PID store: %w", err) 247 } 248 249 enrichmentService := enrichment.NewService(log, enrichment.Config{ 250 WorkerCount: runtime.NumCPU(), 251 EventEnrichers: getActiveEnrichers(a.cfg.EnricherConfig, log, mountNamespacePIDStore), 252 }) 253 254 pidNSID, err := procHandler.GetCurrentPIDNSID() 255 if err != nil { 256 return fmt.Errorf("proc handler: %w", err) 257 } 258 259 tracer := ebpftracer.New(log, ebpftracer.Config{ 260 BTFPath: a.cfg.BTFPath, 261 EventsPerCPUBuffer: a.cfg.EBPFEventsPerCPUBuffer, 262 EventsOutputChanSize: a.cfg.EBPFEventsOutputChanSize, 263 DefaultCgroupsVersion: cgroupClient.DefaultCgroupVersion().String(), 264 ContainerClient: containersClient, 265 CgroupClient: cgroupClient, 266 MountNamespacePIDStore: mountNamespacePIDStore, 267 HomePIDNS: pidNSID, 268 NetflowOutputChanSize: a.cfg.Netflow.OutputChanSize, 269 NetflowSampleSubmitIntervalSeconds: a.cfg.Netflow.SampleSubmitIntervalSeconds, 270 SignatureEngine: signatureEngine, 271 }) 272 if err := tracer.Load(); err != nil { 273 return fmt.Errorf("loading tracer: %w", err) 274 } 275 defer tracer.Close() 276 277 policy := &ebpftracer.Policy{ 278 SystemEvents: []events.ID{ 279 events.SignalCgroupMkdir, 280 events.SignalCgroupRmdir, 281 }, 282 Events: []*ebpftracer.EventPolicy{}, 283 } 284 285 if len(exporters.Events) > 0 { 286 policy.SignatureEvents = signatureEngine.TargetEvents() 287 policy.Events = append(policy.Events, []*ebpftracer.EventPolicy{ 288 {ID: events.SchedProcessExec}, 289 { 290 ID: events.SockSetState, 291 PreFilterGenerator: ebpftracer.PreRateLimit(ebpftracer.RateLimitPolicy{ 292 Rate: 5, 293 Burst: 1, 294 }), 295 }, 296 { 297 ID: events.NetPacketDNSBase, 298 FilterGenerator: ebpftracer.FilterAnd( 299 ebpftracer.FilterEmptyDnsAnswers(log), 300 ebpftracer.DeduplicateDnsEvents(log, 100, 60*time.Second), 301 ), 302 }, 303 {ID: events.TrackSyscallStats}, 304 { 305 ID: events.FileModification, 306 PreFilterGenerator: ebpftracer.PreRateLimit(ebpftracer.RateLimitPolicy{ 307 Interval: 15 * time.Second, 308 }), 309 }, 310 {ID: events.ProcessOomKilled}, // OOM events should not happen too often and we want to know about all of them 311 {ID: events.MagicWrite}, 312 }...) 313 } 314 if len(exporters.Netflow) > 0 { 315 policy.Events = append(policy.Events, &ebpftracer.EventPolicy{ 316 ID: events.NetFlowBase, 317 }) 318 } 319 // TODO: Allow to change policy on the fly. We should be able to change it from remote config. 320 if err := tracer.ApplyPolicy(policy); err != nil { 321 return fmt.Errorf("apply policy: %w", err) 322 } 323 324 netStatsReader := netstats.NewReader(proc.Path) 325 326 ctrl := state.NewController( 327 log, 328 a.cfg.State, 329 exporters, 330 containersClient, 331 netStatsReader, 332 ct, 333 tracer, 334 signatureEngine, 335 enrichmentService, 336 kubeAPIServerClient, 337 ) 338 339 errg, ctx := errgroup.WithContext(ctx) 340 errg.Go(func() error { 341 return a.runHTTPServer(ctx, log) 342 }) 343 344 errg.Go(func() error { 345 return exporters.Run(ctx) 346 }) 347 348 errg.Go(func() error { 349 return signatureEngine.Run(ctx) 350 }) 351 352 errg.Go(func() error { 353 return ctrl.Run(ctx) 354 }) 355 356 errg.Go(func() error { 357 return enrichmentService.Run(ctx) 358 }) 359 360 // Tracer should not run in err group because it can block event if context is canceled 361 // during event read. 362 tracererr := make(chan error, 1) 363 go func() { 364 tracererr <- tracer.Run(ctx) 365 }() 366 367 for _, namespace := range a.cfg.MutedNamespaces { 368 err := ctrl.MuteNamespace(namespace) 369 if err != nil { 370 log.Warnf("error while muting namespace: %v", err) 371 } 372 } 373 374 select { 375 case err := <-tracererr: 376 return err 377 case <-ctx.Done(): 378 return waitWithTimeout(errg, 10*time.Second) 379 } 380 } 381 382 func (a *App) syncRemoteConfig(ctx context.Context, client *castai.Client) error { 383 for { 384 select { 385 case <-ctx.Done(): 386 return ctx.Err() 387 default: 388 } 389 _, err := client.GRPC.GetConfiguration(ctx, &castaipb.GetConfigurationRequest{ 390 CurrentConfig: &castaipb.GetConfigurationRequest_Agent{ 391 Agent: a.cfg.Proto(), 392 }, 393 }) 394 if err != nil { 395 slog.Error(fmt.Sprintf("fetching initial config: %v", err)) 396 time.Sleep(5 * time.Second) 397 continue 398 } 399 slog.Info("initial config synced") 400 return nil 401 } 402 } 403 404 func getActiveEnrichers(cfg EnricherConfig, log *logging.Logger, mountNamespacePIDStore *types.PIDsPerNamespace) []enrichment.EventEnricher { 405 var result []enrichment.EventEnricher 406 407 if cfg.EnableFileHashEnricher { 408 result = append(result, enrichment.EnrichWithFileHash(log, mountNamespacePIDStore, proc.GetFS())) 409 } 410 411 return result 412 } 413 414 func getInitializedMountNamespaceStore(procHandler *proc.Proc) (*types.PIDsPerNamespace, error) { 415 mountNamespacePIDStore, err := types.NewPIDsPerNamespaceCache(2048, 5) 416 if err != nil { 417 return nil, err 418 } 419 420 processes, err := procHandler.LoadMountNSOldestProcesses() 421 if err != nil { 422 return nil, err 423 } 424 425 for ns, pid := range processes { 426 mountNamespacePIDStore.ForceAddToBucket(ns, pid) 427 } 428 429 return mountNamespacePIDStore, nil 430 } 431 432 func (a *App) runHTTPServer(ctx context.Context, log *logging.Logger) error { 433 log.Info("running http server") 434 defer log.Info("stopping http server") 435 436 mux := http.NewServeMux() 437 mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { 438 _, _ = w.Write([]byte("ok")) 439 }) 440 mux.Handle("/metrics", promhttp.Handler()) 441 mux.HandleFunc("/debug/pprof/", pprof.Index) 442 mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) 443 mux.HandleFunc("/debug/pprof/profile", pprof.Profile) 444 mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol) 445 mux.HandleFunc("/debug/pprof/trace", pprof.Trace) 446 srv := http.Server{ 447 Addr: fmt.Sprintf(":%d", a.cfg.MetricsHTTPListenPort), 448 Handler: mux, 449 ReadTimeout: 10 * time.Second, 450 WriteTimeout: 1 * time.Minute, 451 } 452 453 go func() { 454 <-ctx.Done() 455 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 456 defer cancel() 457 _ = srv.Shutdown(ctx) 458 }() 459 460 if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { 461 return fmt.Errorf("http serve: %w", err) 462 } 463 464 return nil 465 } 466 467 func waitWithTimeout(errg *errgroup.Group, timeout time.Duration) error { 468 errc := make(chan error, 1) 469 go func() { 470 errc <- errg.Wait() 471 }() 472 select { 473 case <-time.After(timeout): 474 return errors.New("timeout waiting for shutdown") // TODO(anjmao): Getting this error on tilt. 475 case err := <-errc: 476 return err 477 } 478 } 479 480 func withPyroscope(addr string) { 481 if _, err := pyroscope.Start(pyroscope.Config{ 482 ApplicationName: "kvisor-agent", 483 ServerAddress: addr, 484 Tags: map[string]string{ 485 "pod": os.Getenv("POD_NAME"), 486 }, 487 ProfileTypes: []pyroscope.ProfileType{ 488 pyroscope.ProfileCPU, 489 pyroscope.ProfileAllocObjects, 490 pyroscope.ProfileAllocSpace, 491 pyroscope.ProfileInuseObjects, 492 pyroscope.ProfileInuseSpace, 493 pyroscope.ProfileGoroutines, 494 }, 495 }); err != nil { 496 panic(err) 497 } 498 }