github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/cmd/agent/daemon/daemon.go (about)

     1  package main
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"log/slog"
     8  	"os"
     9  	"os/signal"
    10  	"syscall"
    11  	"time"
    12  
    13  	"github.com/castai/kvisor/cmd/agent/daemon/app"
    14  	"github.com/castai/kvisor/cmd/agent/daemon/state"
    15  	"github.com/castai/kvisor/pkg/castai"
    16  	"github.com/castai/kvisor/pkg/ebpftracer/signature"
    17  	"github.com/spf13/cobra"
    18  	"github.com/spf13/pflag"
    19  )
    20  
    21  func lookupConfigVariable(name string) (string, error) {
    22  	key, found := os.LookupEnv("CASTAI_" + name)
    23  	if found {
    24  		return key, nil
    25  	}
    26  
    27  	key, found = os.LookupEnv(name)
    28  	if found {
    29  		return key, nil
    30  	}
    31  
    32  	return "", fmt.Errorf("environment variable missing: please provide either `CAST_%s` or `%s`", name, name)
    33  }
    34  
    35  func NewRunCommand(version string) *cobra.Command {
    36  	var (
    37  		logLevel        = pflag.String("log-level", slog.LevelInfo.String(), "log level")
    38  		logRateInterval = pflag.Duration("log-rate-iterval", 100*time.Millisecond, "Log rate limit interval")
    39  		logRateBurst    = pflag.Int("log-rate-burst", 100, "Log rate burst")
    40  
    41  		sendLogLevel                 = pflag.String("send-logs-level", "", "send logs level")
    42  		containerdSockPath           = pflag.String("containerd-sock", "/run/containerd/containerd.sock", "Path to containerd socket file")
    43  		metricsHTTPListenPort        = pflag.Int("metrics-http-listen-port", 6060, "metrics http listen port")
    44  		pyroscopeAddr                = pflag.String("pyroscope-addr", "", "Enable pyroscope tracing")
    45  		hostCgroupsDir               = pflag.String("host-cgroups", "/cgroups", "Host /sys/fs/cgroups directory name mounted to container")
    46  		containerStatsScrapeInterval = pflag.Duration("container-stats-scrape-interval", 60*time.Second, "Container resources scrape interval")
    47  
    48  		btfPath                        = pflag.String("btf-path", "/sys/kernel/btf/vmlinux", "btf file path")
    49  		ebpfEventsStdioExporterEnabled = pflag.Bool("ebpf-events-stdio-exporter-enabled", false, "Export ebpf event to stdio")
    50  		ebpfEventsPerCPUBuffer         = pflag.Int("ebpf-events-per-cpu-buffer", os.Getpagesize()*64, "Ebpf per cpu buffer size")
    51  		ebpfEventsOutputChanSize       = pflag.Int("ebpf-events-output-queue-size", 4096, "Ebpf user spaces output channel size")
    52  
    53  		signatureEngineInputEventChanSize  = pflag.Int("signature-engine-input-queue-size", 1000, "Input queue size for the signature engine.")
    54  		signatureEngineOutputEventChanSize = pflag.Int("signature-engine-output-queue-size", 1000, "Output queue size for the signature engine.")
    55  
    56  		mutedNamespaces = pflag.StringArray("mute-namespace", []string{"kube-system", "calico", "calico-system"}, "List of namespaces to ignore tracing events for. To mute multiple namespaces, provide this flag multiple times.")
    57  
    58  		fileHashEnrichedEnabled           = pflag.Bool("file-hash-enricher-enabled", false, "Enables the file has event enricher for exec events")
    59  		ttyDetectionSignatureEnabled      = pflag.Bool("signature-tty-detection-enabled", false, "Enables the tty detection signature")
    60  		socks5DetectionSignatureEnabled   = pflag.Bool("signature-socks5-detection-enabled", false, "Enables the socks5 detection signature")
    61  		socks5DetectionSignatureCacheSize = pflag.Uint32("signature-socks5-detection-cache-size", 1024, "Configures the amount of state machine cache entries to detect socks5 information")
    62  
    63  		netflowEnabled                     = pflag.Bool("netflow-enabled", false, "Enables netflow tracking")
    64  		netflowSampleSubmitIntervalSeconds = pflag.Uint64("netflow-sample-submit-interval-seconds", 15, "Netflow sample submit interval")
    65  		netflowOutputChanSize              = pflag.Int("netflow-output-queue-size", 4096, "Netflow output queue size")
    66  		netflowExportInterval              = pflag.Duration("netflow-export-interval", 15*time.Second, "Netflow export interval")
    67  		netflowCleanupInterval             = pflag.Duration("netflow-cleanup-interval", 60*time.Second, "Netflow cleanup interval")
    68  
    69  		clickhouseAddr     = pflag.String("clickhouse-addr", "", "Clickhouse address to send events to")
    70  		clickhouseDatabase = pflag.String("clickhouse-database", "", "Clickhouse database name")
    71  		clickhouseUsername = pflag.String("clickhouse-username", "", "Clickhouse username")
    72  
    73  		castaiServerInsecure = pflag.Bool("castai-server-insecure", false, "Use insecure connection to castai grpc server. Used for e2e.")
    74  
    75  		kubeAPIServiceAddr = pflag.String("kube-api-service-addr", "", "Custom kube API service grpc address")
    76  
    77  		exportersQueueSize = pflag.Int("exporters-queue-size", 4096, "Exporters queue size")
    78  	)
    79  
    80  	command := &cobra.Command{
    81  		Use: "run",
    82  		Run: func(cmd *cobra.Command, args []string) {
    83  			pflag.Parse()
    84  
    85  			ctx, stop := signal.NotifyContext(cmd.Context(), syscall.SIGINT, syscall.SIGTERM)
    86  			defer stop()
    87  
    88  			castaiClientCfg, err := resolveCastaiConfig(*castaiServerInsecure)
    89  			if err != nil {
    90  				slog.Warn(fmt.Errorf("skipping CAST AI integration: %w", err).Error())
    91  			}
    92  
    93  			if err := app.New(&app.Config{
    94  				LogLevel:              *logLevel,
    95  				LogRateInterval:       *logRateInterval,
    96  				LogRateBurst:          *logRateBurst,
    97  				SendLogsLevel:         *sendLogLevel,
    98  				Version:               version,
    99  				BTFPath:               *btfPath,
   100  				PyroscopeAddr:         *pyroscopeAddr,
   101  				ContainerdSockPath:    *containerdSockPath,
   102  				HostCgroupsDir:        *hostCgroupsDir,
   103  				MetricsHTTPListenPort: *metricsHTTPListenPort,
   104  				State: state.Config{
   105  					ContainerStatsScrapeInterval: *containerStatsScrapeInterval,
   106  					NetflowExportInterval:        *netflowExportInterval,
   107  					NetflowCleanupInterval:       *netflowCleanupInterval,
   108  				},
   109  				EBPFEventsStdioExporterEnabled: *ebpfEventsStdioExporterEnabled,
   110  				EBPFEventsPerCPUBuffer:         *ebpfEventsPerCPUBuffer,
   111  				EBPFEventsOutputChanSize:       *ebpfEventsOutputChanSize,
   112  				MutedNamespaces:                *mutedNamespaces,
   113  				SignatureEngineConfig: signature.SignatureEngineConfig{
   114  					InputChanSize:  *signatureEngineInputEventChanSize,
   115  					OutputChanSize: *signatureEngineOutputEventChanSize,
   116  					DefaultSignatureConfig: signature.DefaultSignatureConfig{
   117  						TTYDetectedSignatureEnabled:    *ttyDetectionSignatureEnabled,
   118  						SOCKS5DetectedSignatureEnabled: *socks5DetectionSignatureEnabled,
   119  						SOCKS5DetectedSignatureConfig: signature.SOCKS5DetectionSignatureConfig{
   120  							CacheSize: *socks5DetectionSignatureCacheSize,
   121  						},
   122  					},
   123  				},
   124  				Castai: castaiClientCfg,
   125  				EnricherConfig: app.EnricherConfig{
   126  					EnableFileHashEnricher: *fileHashEnrichedEnabled,
   127  				},
   128  				Netflow: app.NetflowConfig{
   129  					Enabled:                     *netflowEnabled,
   130  					SampleSubmitIntervalSeconds: *netflowSampleSubmitIntervalSeconds,
   131  					OutputChanSize:              *netflowOutputChanSize,
   132  				},
   133  				Clickhouse: app.ClickhouseConfig{
   134  					Addr:     *clickhouseAddr,
   135  					Database: *clickhouseDatabase,
   136  					Username: *clickhouseUsername,
   137  					Password: os.Getenv("CLICKHOUSE_PASSWORD"),
   138  				},
   139  				KubeAPIServiceAddr: *kubeAPIServiceAddr,
   140  				ExportersQueueSize: *exportersQueueSize,
   141  			}).Run(ctx); err != nil && !errors.Is(err, context.Canceled) {
   142  				slog.Error(err.Error())
   143  				os.Exit(1)
   144  			}
   145  		},
   146  	}
   147  	return command
   148  }
   149  
   150  func resolveCastaiConfig(castaiServerInsecure bool) (castai.Config, error) {
   151  	castaiGRPCAddress, found := os.LookupEnv("CASTAI_API_GRPC_ADDR")
   152  	if !found {
   153  		return castai.Config{}, fmt.Errorf("missing environment variable: CASTAI_API_GRPC_ADDR")
   154  	}
   155  	castaiClusterID, found := os.LookupEnv("CASTAI_CLUSTER_ID")
   156  	if !found {
   157  		return castai.Config{}, fmt.Errorf("missing environment variable: CASTAI_CLUSTER_ID")
   158  	}
   159  
   160  	apiKey, err := lookupConfigVariable("API_KEY")
   161  	if err != nil {
   162  		return castai.Config{}, err
   163  	}
   164  
   165  	return castai.Config{
   166  		APIKey:      apiKey,
   167  		APIGrpcAddr: castaiGRPCAddress,
   168  		ClusterID:   castaiClusterID,
   169  		Insecure:    castaiServerInsecure,
   170  	}, nil
   171  }