github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/cmd/agent/daemon/app/app.go (about)

     1  package app
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"log/slog"
     8  	"net/http"
     9  	"net/http/pprof"
    10  	"os"
    11  	"runtime"
    12  	"time"
    13  
    14  	"github.com/ClickHouse/clickhouse-go/v2"
    15  	kubepb "github.com/castai/kvisor/api/v1/kube"
    16  	castaipb "github.com/castai/kvisor/api/v1/runtime"
    17  	"github.com/castai/kvisor/cmd/agent/daemon/conntrack"
    18  	"github.com/castai/kvisor/cmd/agent/daemon/enrichment"
    19  	"github.com/castai/kvisor/cmd/agent/daemon/netstats"
    20  	"github.com/castai/kvisor/cmd/agent/daemon/state"
    21  	"github.com/castai/kvisor/pkg/castai"
    22  	"github.com/castai/kvisor/pkg/cgroup"
    23  	"github.com/castai/kvisor/pkg/containers"
    24  	"github.com/castai/kvisor/pkg/ebpftracer"
    25  	"github.com/castai/kvisor/pkg/ebpftracer/events"
    26  	"github.com/castai/kvisor/pkg/ebpftracer/signature"
    27  	"github.com/castai/kvisor/pkg/ebpftracer/types"
    28  	"github.com/castai/kvisor/pkg/logging"
    29  	"github.com/castai/kvisor/pkg/proc"
    30  	"github.com/go-playground/validator/v10"
    31  	"github.com/grafana/pyroscope-go"
    32  	"github.com/prometheus/client_golang/prometheus/promhttp"
    33  	"golang.org/x/sync/errgroup"
    34  	"golang.org/x/time/rate"
    35  	"google.golang.org/grpc"
    36  	"google.golang.org/grpc/credentials/insecure"
    37  )
    38  
// Config holds all settings needed to construct and run the agent daemon App.
// Fields tagged `validate:"required"` are checked by New and must be non-zero.
type Config struct {
	// LogLevel is parsed with logging.MustParseLevel to set the logger level.
	LogLevel                       string
	// LogRateInterval and LogRateBurst configure the logger's rate limiter
	// (the interval is converted with rate.Every).
	LogRateInterval                time.Duration
	LogRateBurst                   int
	// SendLogsLevel is the minimum level of logs exported through the Castai
	// logs exporter. An empty value disables log exporting.
	SendLogsLevel                  string
	// Version is logged on start/stop and included in the Castai client name.
	Version                        string
	// BTFPath is passed through to the eBPF tracer configuration.
	BTFPath                        string
	// PyroscopeAddr, when non-empty, enables Pyroscope profiling against this server address.
	PyroscopeAddr                  string
	// ContainerdSockPath is passed to the containers client.
	ContainerdSockPath             string
	// HostCgroupsDir is passed to the cgroup client.
	HostCgroupsDir                 string
	// MetricsHTTPListenPort is the port of the HTTP server exposing /healthz, /metrics and pprof.
	MetricsHTTPListenPort          int
	// State is passed to the state controller.
	State                          state.Config
	// EBPFEventsPerCPUBuffer and EBPFEventsOutputChanSize are passed to the eBPF tracer.
	EBPFEventsPerCPUBuffer         int `validate:"required"`
	EBPFEventsOutputChanSize       int `validate:"required"`
	// EBPFEventsStdioExporterEnabled adds a stdio events exporter.
	EBPFEventsStdioExporterEnabled bool
	// MutedNamespaces lists namespaces muted on the state controller at startup.
	MutedNamespaces                []string
	// SignatureEngineConfig configures the signature engine and its default signatures.
	SignatureEngineConfig          signature.SignatureEngineConfig
	// Castai holds the Castai API client settings; Castai exporters are only
	// set up when Castai.Valid() reports true.
	Castai                         castai.Config
	// EnricherConfig selects which event enrichers are active.
	EnricherConfig                 EnricherConfig
	// Netflow holds netflow related settings.
	Netflow                        NetflowConfig
	// Clickhouse holds the optional ClickHouse netflow storage settings.
	Clickhouse                     ClickhouseConfig
	// KubeAPIServiceAddr is the gRPC address dialed to reach the kube API service.
	KubeAPIServiceAddr             string
	// ExportersQueueSize is passed to the Castai and ClickHouse exporters as their queue size.
	ExportersQueueSize             int `validate:"required"`
}
    63  
    64  func (c Config) Proto() *castaipb.AgentConfig {
    65  	return &castaipb.AgentConfig{
    66  		LogLevel:              c.LogLevel,
    67  		LogRateInterval:       c.LogRateInterval.String(),
    68  		LogRateBurst:          int32(c.LogRateBurst),
    69  		SendLogsLevel:         c.SendLogsLevel,
    70  		Version:               c.Version,
    71  		BtfPath:               c.BTFPath,
    72  		PyroscopeAddr:         c.PyroscopeAddr,
    73  		ContainerdSockPath:    c.ContainerdSockPath,
    74  		HostCgroupsDir:        c.HostCgroupsDir,
    75  		MetricsHttpListenPort: int32(c.MetricsHTTPListenPort),
    76  		State: &castaipb.AgentStateControllerConfig{
    77  			ContainerStatsScrapeInterval: c.State.ContainerStatsScrapeInterval.String(),
    78  		},
    79  		EbpfEventsPerCpuBuffer:   int32(c.EBPFEventsPerCPUBuffer),
    80  		EbpfEventsOutputChanSize: int32(c.EBPFEventsOutputChanSize),
    81  		MutedNamespaces:          c.MutedNamespaces,
    82  		SignatureEngineConfig: &castaipb.SignatureEngineConfig{
    83  			InputChanSize:                  int32(c.SignatureEngineConfig.InputChanSize),
    84  			OutputChanSize:                 int32(c.SignatureEngineConfig.OutputChanSize),
    85  			TtyDetectedSignatureEnabled:    c.SignatureEngineConfig.DefaultSignatureConfig.TTYDetectedSignatureEnabled,
    86  			Socks5DetectedSignatureEnabled: c.SignatureEngineConfig.DefaultSignatureConfig.SOCKS5DetectedSignatureEnabled,
    87  			Socks5DetectedSignatureConfig: &castaipb.SOCKS5DetectedSignatureConfig{
    88  				CacheSize: c.SignatureEngineConfig.DefaultSignatureConfig.SOCKS5DetectedSignatureConfig.CacheSize,
    89  			},
    90  		},
    91  		CastaiEnv: &castaipb.CastaiConfig{
    92  			ClusterId:   c.Castai.ClusterID,
    93  			ApiGrpcAddr: c.Castai.APIGrpcAddr,
    94  			Insecure:    c.Castai.Insecure,
    95  		},
    96  		EnricherConfig: &castaipb.EnricherConfig{
    97  			EnableFileHashEnricher: c.EnricherConfig.EnableFileHashEnricher,
    98  		},
    99  		Netflow: &castaipb.NetflowConfig{
   100  			Enabled:                     c.Netflow.Enabled,
   101  			SampleSubmitIntervalSeconds: c.Netflow.SampleSubmitIntervalSeconds,
   102  		},
   103  	}
   104  }
   105  
// EnricherConfig selects which event enrichers are activated.
type EnricherConfig struct {
	// EnableFileHashEnricher turns on the file hash enricher
	// (enrichment.EnrichWithFileHash).
	EnableFileHashEnricher bool
}
   109  
// NetflowConfig holds netflow related settings.
type NetflowConfig struct {
	// Enabled is reported to the Castai API through Config.Proto.
	// NOTE(review): App.Run in this file does not consult this flag; netflow
	// events are enabled when at least one netflow exporter is configured — confirm intent.
	Enabled                     bool
	// SampleSubmitIntervalSeconds is passed to the eBPF tracer configuration.
	SampleSubmitIntervalSeconds uint64
	// OutputChanSize is passed to the eBPF tracer as the netflow output channel size.
	OutputChanSize              int
}
   115  
// ClickhouseConfig holds connection settings for the optional ClickHouse
// netflow exporter. The exporter is only set up when Addr is non-empty.
type ClickhouseConfig struct {
	// Addr is the single ClickHouse server address to connect to.
	Addr     string
	// Database, Username and Password are used as the connection auth settings.
	Database string
	Username string
	Password string
}
   122  
   123  func New(cfg *Config) *App {
   124  	if err := validator.New().Struct(cfg); err != nil {
   125  		panic(fmt.Errorf("invalid config: %w", err).Error())
   126  	}
   127  	return &App{cfg: cfg}
   128  }
   129  
// App is the agent daemon application. Create it with New and start it with Run.
type App struct {
	// cfg is the validated configuration the application was created with.
	cfg *Config
}
   133  
   134  func (a *App) Run(ctx context.Context) error {
   135  	cfg := a.cfg
   136  	logCfg := &logging.Config{
   137  		Level:     logging.MustParseLevel(a.cfg.LogLevel),
   138  		AddSource: true,
   139  		RateLimiter: logging.RateLimiterConfig{
   140  			Limit:  rate.Every(a.cfg.LogRateInterval),
   141  			Burst:  a.cfg.LogRateBurst,
   142  			Inform: true,
   143  		},
   144  	}
   145  	var log *logging.Logger
   146  	var exporters *state.Exporters
   147  
   148  	// Castai specific spetup if config is valid.
   149  	if cfg.Castai.Valid() {
   150  		castaiClient, err := castai.NewClient(fmt.Sprintf("kvisor-agent/%s", cfg.Version), cfg.Castai)
   151  		if err != nil {
   152  			return fmt.Errorf("setting up castai api client: %w", err)
   153  		}
   154  		if err := a.syncRemoteConfig(ctx, castaiClient); err != nil {
   155  			return fmt.Errorf("sync remote config: %w", err)
   156  		}
   157  		if a.cfg.SendLogsLevel != "" && a.cfg.Castai.Valid() {
   158  			castaiLogsExporter := castai.NewLogsExporter(castaiClient)
   159  			go castaiLogsExporter.Run(ctx) //nolint:errcheck
   160  
   161  			logCfg.Export = logging.ExportConfig{
   162  				ExportFunc: castaiLogsExporter.ExportFunc(),
   163  				MinLevel:   logging.MustParseLevel(a.cfg.SendLogsLevel),
   164  			}
   165  			log = logging.New(logCfg)
   166  		}
   167  		exporters = state.NewExporters(log)
   168  		exporters.Events = append(exporters.Events, state.NewCastaiEventsExporter(log, castaiClient, a.cfg.ExportersQueueSize))
   169  		exporters.ContainerStats = append(exporters.ContainerStats, state.NewCastaiContainerStatsExporter(log, castaiClient, a.cfg.ExportersQueueSize))
   170  	} else {
   171  		log = logging.New(logCfg)
   172  		exporters = state.NewExporters(log)
   173  	}
   174  
   175  	kubeAPIServiceConn, err := grpc.Dial(
   176  		cfg.KubeAPIServiceAddr,
   177  		grpc.WithTransportCredentials(insecure.NewCredentials()),
   178  	)
   179  	if err != nil {
   180  		return fmt.Errorf("kube api service grpc server dial: %w", err)
   181  	}
   182  	defer kubeAPIServiceConn.Close()
   183  	kubeAPIServerClient := kubepb.NewKubeAPIClient(kubeAPIServiceConn)
   184  
   185  	if cfg.Clickhouse.Addr != "" {
   186  		storageConn, err := clickhouse.Open(&clickhouse.Options{
   187  			Addr: []string{cfg.Clickhouse.Addr},
   188  			Auth: clickhouse.Auth{
   189  				Database: cfg.Clickhouse.Database,
   190  				Username: cfg.Clickhouse.Username,
   191  				Password: cfg.Clickhouse.Password,
   192  			},
   193  			Settings: clickhouse.Settings{
   194  				"allow_experimental_object_type": "1",
   195  			},
   196  			MaxOpenConns: 20,
   197  		})
   198  		if err != nil {
   199  			return err
   200  		}
   201  		defer storageConn.Close()
   202  
   203  		clickhouseNetflowExporter := state.NewClickhouseNetflowExporter(log, storageConn, a.cfg.ExportersQueueSize)
   204  		exporters.Netflow = append(exporters.Netflow, clickhouseNetflowExporter)
   205  	}
   206  
   207  	if cfg.EBPFEventsStdioExporterEnabled {
   208  		exporters.Events = append(exporters.Events, state.NewStdioEventsExporter(log))
   209  	}
   210  
   211  	if exporters.Empty() {
   212  		return errors.New("no configured exporters")
   213  	}
   214  
   215  	log.Infof("running kvisor agent, version=%s", a.cfg.Version)
   216  	defer log.Infof("stopping kvisor agent, version=%s", a.cfg.Version)
   217  
   218  	if addr := a.cfg.PyroscopeAddr; addr != "" {
   219  		withPyroscope(addr)
   220  	}
   221  
   222  	cgroupClient, err := cgroup.NewClient(log, a.cfg.HostCgroupsDir)
   223  	if err != nil {
   224  		return err
   225  	}
   226  	containersClient, err := containers.NewClient(log, cgroupClient, a.cfg.ContainerdSockPath)
   227  	if err != nil {
   228  		return err
   229  	}
   230  	ct, err := conntrack.NewClient(log)
   231  	if err != nil {
   232  		return fmt.Errorf("conntrack: %w", err)
   233  	}
   234  	defer ct.Close()
   235  
   236  	activeSignatures, err := signature.DefaultSignatures(log, a.cfg.SignatureEngineConfig.DefaultSignatureConfig)
   237  	if err != nil {
   238  		return fmt.Errorf("error while configuring signatures: %w", err)
   239  	}
   240  
   241  	signatureEngine := signature.NewEngine(activeSignatures, log, a.cfg.SignatureEngineConfig)
   242  
   243  	procHandler := proc.New()
   244  	mountNamespacePIDStore, err := getInitializedMountNamespaceStore(procHandler)
   245  	if err != nil {
   246  		return fmt.Errorf("mount namespace PID store: %w", err)
   247  	}
   248  
   249  	enrichmentService := enrichment.NewService(log, enrichment.Config{
   250  		WorkerCount:    runtime.NumCPU(),
   251  		EventEnrichers: getActiveEnrichers(a.cfg.EnricherConfig, log, mountNamespacePIDStore),
   252  	})
   253  
   254  	pidNSID, err := procHandler.GetCurrentPIDNSID()
   255  	if err != nil {
   256  		return fmt.Errorf("proc handler: %w", err)
   257  	}
   258  
   259  	tracer := ebpftracer.New(log, ebpftracer.Config{
   260  		BTFPath:                            a.cfg.BTFPath,
   261  		EventsPerCPUBuffer:                 a.cfg.EBPFEventsPerCPUBuffer,
   262  		EventsOutputChanSize:               a.cfg.EBPFEventsOutputChanSize,
   263  		DefaultCgroupsVersion:              cgroupClient.DefaultCgroupVersion().String(),
   264  		ContainerClient:                    containersClient,
   265  		CgroupClient:                       cgroupClient,
   266  		MountNamespacePIDStore:             mountNamespacePIDStore,
   267  		HomePIDNS:                          pidNSID,
   268  		NetflowOutputChanSize:              a.cfg.Netflow.OutputChanSize,
   269  		NetflowSampleSubmitIntervalSeconds: a.cfg.Netflow.SampleSubmitIntervalSeconds,
   270  		SignatureEngine:                    signatureEngine,
   271  	})
   272  	if err := tracer.Load(); err != nil {
   273  		return fmt.Errorf("loading tracer: %w", err)
   274  	}
   275  	defer tracer.Close()
   276  
   277  	policy := &ebpftracer.Policy{
   278  		SystemEvents: []events.ID{
   279  			events.SignalCgroupMkdir,
   280  			events.SignalCgroupRmdir,
   281  		},
   282  		Events: []*ebpftracer.EventPolicy{},
   283  	}
   284  
   285  	if len(exporters.Events) > 0 {
   286  		policy.SignatureEvents = signatureEngine.TargetEvents()
   287  		policy.Events = append(policy.Events, []*ebpftracer.EventPolicy{
   288  			{ID: events.SchedProcessExec},
   289  			{
   290  				ID: events.SockSetState,
   291  				PreFilterGenerator: ebpftracer.PreRateLimit(ebpftracer.RateLimitPolicy{
   292  					Rate:  5,
   293  					Burst: 1,
   294  				}),
   295  			},
   296  			{
   297  				ID: events.NetPacketDNSBase,
   298  				FilterGenerator: ebpftracer.FilterAnd(
   299  					ebpftracer.FilterEmptyDnsAnswers(log),
   300  					ebpftracer.DeduplicateDnsEvents(log, 100, 60*time.Second),
   301  				),
   302  			},
   303  			{ID: events.TrackSyscallStats},
   304  			{
   305  				ID: events.FileModification,
   306  				PreFilterGenerator: ebpftracer.PreRateLimit(ebpftracer.RateLimitPolicy{
   307  					Interval: 15 * time.Second,
   308  				}),
   309  			},
   310  			{ID: events.ProcessOomKilled}, // OOM events should not happen too often and we want to know about all of them
   311  			{ID: events.MagicWrite},
   312  		}...)
   313  	}
   314  	if len(exporters.Netflow) > 0 {
   315  		policy.Events = append(policy.Events, &ebpftracer.EventPolicy{
   316  			ID: events.NetFlowBase,
   317  		})
   318  	}
   319  	// TODO: Allow to change policy on the fly. We should be able to change it from remote config.
   320  	if err := tracer.ApplyPolicy(policy); err != nil {
   321  		return fmt.Errorf("apply policy: %w", err)
   322  	}
   323  
   324  	netStatsReader := netstats.NewReader(proc.Path)
   325  
   326  	ctrl := state.NewController(
   327  		log,
   328  		a.cfg.State,
   329  		exporters,
   330  		containersClient,
   331  		netStatsReader,
   332  		ct,
   333  		tracer,
   334  		signatureEngine,
   335  		enrichmentService,
   336  		kubeAPIServerClient,
   337  	)
   338  
   339  	errg, ctx := errgroup.WithContext(ctx)
   340  	errg.Go(func() error {
   341  		return a.runHTTPServer(ctx, log)
   342  	})
   343  
   344  	errg.Go(func() error {
   345  		return exporters.Run(ctx)
   346  	})
   347  
   348  	errg.Go(func() error {
   349  		return signatureEngine.Run(ctx)
   350  	})
   351  
   352  	errg.Go(func() error {
   353  		return ctrl.Run(ctx)
   354  	})
   355  
   356  	errg.Go(func() error {
   357  		return enrichmentService.Run(ctx)
   358  	})
   359  
   360  	// Tracer should not run in err group because it can block event if context is canceled
   361  	// during event read.
   362  	tracererr := make(chan error, 1)
   363  	go func() {
   364  		tracererr <- tracer.Run(ctx)
   365  	}()
   366  
   367  	for _, namespace := range a.cfg.MutedNamespaces {
   368  		err := ctrl.MuteNamespace(namespace)
   369  		if err != nil {
   370  			log.Warnf("error while muting namespace: %v", err)
   371  		}
   372  	}
   373  
   374  	select {
   375  	case err := <-tracererr:
   376  		return err
   377  	case <-ctx.Done():
   378  		return waitWithTimeout(errg, 10*time.Second)
   379  	}
   380  }
   381  
   382  func (a *App) syncRemoteConfig(ctx context.Context, client *castai.Client) error {
   383  	for {
   384  		select {
   385  		case <-ctx.Done():
   386  			return ctx.Err()
   387  		default:
   388  		}
   389  		_, err := client.GRPC.GetConfiguration(ctx, &castaipb.GetConfigurationRequest{
   390  			CurrentConfig: &castaipb.GetConfigurationRequest_Agent{
   391  				Agent: a.cfg.Proto(),
   392  			},
   393  		})
   394  		if err != nil {
   395  			slog.Error(fmt.Sprintf("fetching initial config: %v", err))
   396  			time.Sleep(5 * time.Second)
   397  			continue
   398  		}
   399  		slog.Info("initial config synced")
   400  		return nil
   401  	}
   402  }
   403  
   404  func getActiveEnrichers(cfg EnricherConfig, log *logging.Logger, mountNamespacePIDStore *types.PIDsPerNamespace) []enrichment.EventEnricher {
   405  	var result []enrichment.EventEnricher
   406  
   407  	if cfg.EnableFileHashEnricher {
   408  		result = append(result, enrichment.EnrichWithFileHash(log, mountNamespacePIDStore, proc.GetFS()))
   409  	}
   410  
   411  	return result
   412  }
   413  
   414  func getInitializedMountNamespaceStore(procHandler *proc.Proc) (*types.PIDsPerNamespace, error) {
   415  	mountNamespacePIDStore, err := types.NewPIDsPerNamespaceCache(2048, 5)
   416  	if err != nil {
   417  		return nil, err
   418  	}
   419  
   420  	processes, err := procHandler.LoadMountNSOldestProcesses()
   421  	if err != nil {
   422  		return nil, err
   423  	}
   424  
   425  	for ns, pid := range processes {
   426  		mountNamespacePIDStore.ForceAddToBucket(ns, pid)
   427  	}
   428  
   429  	return mountNamespacePIDStore, nil
   430  }
   431  
   432  func (a *App) runHTTPServer(ctx context.Context, log *logging.Logger) error {
   433  	log.Info("running http server")
   434  	defer log.Info("stopping http server")
   435  
   436  	mux := http.NewServeMux()
   437  	mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
   438  		_, _ = w.Write([]byte("ok"))
   439  	})
   440  	mux.Handle("/metrics", promhttp.Handler())
   441  	mux.HandleFunc("/debug/pprof/", pprof.Index)
   442  	mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
   443  	mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
   444  	mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
   445  	mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
   446  	srv := http.Server{
   447  		Addr:         fmt.Sprintf(":%d", a.cfg.MetricsHTTPListenPort),
   448  		Handler:      mux,
   449  		ReadTimeout:  10 * time.Second,
   450  		WriteTimeout: 1 * time.Minute,
   451  	}
   452  
   453  	go func() {
   454  		<-ctx.Done()
   455  		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   456  		defer cancel()
   457  		_ = srv.Shutdown(ctx)
   458  	}()
   459  
   460  	if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
   461  		return fmt.Errorf("http serve: %w", err)
   462  	}
   463  
   464  	return nil
   465  }
   466  
   467  func waitWithTimeout(errg *errgroup.Group, timeout time.Duration) error {
   468  	errc := make(chan error, 1)
   469  	go func() {
   470  		errc <- errg.Wait()
   471  	}()
   472  	select {
   473  	case <-time.After(timeout):
   474  		return errors.New("timeout waiting for shutdown") // TODO(anjmao): Getting this error on tilt.
   475  	case err := <-errc:
   476  		return err
   477  	}
   478  }
   479  
   480  func withPyroscope(addr string) {
   481  	if _, err := pyroscope.Start(pyroscope.Config{
   482  		ApplicationName: "kvisor-agent",
   483  		ServerAddress:   addr,
   484  		Tags: map[string]string{
   485  			"pod": os.Getenv("POD_NAME"),
   486  		},
   487  		ProfileTypes: []pyroscope.ProfileType{
   488  			pyroscope.ProfileCPU,
   489  			pyroscope.ProfileAllocObjects,
   490  			pyroscope.ProfileAllocSpace,
   491  			pyroscope.ProfileInuseObjects,
   492  			pyroscope.ProfileInuseSpace,
   493  			pyroscope.ProfileGoroutines,
   494  		},
   495  	}); err != nil {
   496  		panic(err)
   497  	}
   498  }