github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/cmd/controller/state/delta/controller.go (about) 1 package delta 2 3 import ( 4 "context" 5 "encoding/json" 6 "errors" 7 "fmt" 8 "io" 9 "reflect" 10 "strconv" 11 "sync" 12 "time" 13 14 castaipb "github.com/castai/kvisor/api/v1/runtime" 15 "github.com/castai/kvisor/cmd/controller/kube" 16 "github.com/castai/kvisor/pkg/logging" 17 "github.com/cenkalti/backoff/v4" 18 "github.com/google/uuid" 19 "github.com/samber/lo" 20 "google.golang.org/grpc" 21 "google.golang.org/grpc/encoding/gzip" 22 "google.golang.org/grpc/metadata" 23 "google.golang.org/protobuf/types/known/timestamppb" 24 appsv1 "k8s.io/api/apps/v1" 25 batchv1 "k8s.io/api/batch/v1" 26 batchv1beta1 "k8s.io/api/batch/v1beta1" 27 corev1 "k8s.io/api/core/v1" 28 networkingv1 "k8s.io/api/networking/v1" 29 rbacv1 "k8s.io/api/rbac/v1" 30 ) 31 32 type castaiClient interface { 33 KubernetesDeltaIngest(ctx context.Context, opts ...grpc.CallOption) (castaipb.RuntimeSecurityAgentAPI_KubernetesDeltaIngestClient, error) 34 } 35 36 type kubeClient interface { 37 GetOwnerUID(obj kube.Object) string 38 } 39 40 type Config struct { 41 Enabled bool 42 Interval time.Duration `validate:"required"` 43 InitialDeltay time.Duration 44 SendTimeout time.Duration `validate:"required"` 45 UseCompression bool 46 } 47 48 func NewController( 49 log *logging.Logger, 50 cfg Config, 51 castaiClient castaiClient, 52 kubeClient kubeClient, 53 ) *Controller { 54 return &Controller{ 55 log: log.WithField("component", "delta"), 56 cfg: cfg, 57 castaiClient: castaiClient, 58 kubeClient: kubeClient, 59 pendingItems: map[string]deltaItem{}, 60 deltaSendMaxTries: 3, 61 deltaItemSendMaxTries: 3, 62 deltaRetryWait: 100 * time.Millisecond, 63 firstDeltaReport: true, 64 } 65 } 66 67 type Controller struct { 68 log *logging.Logger 69 cfg Config 70 castaiClient castaiClient 71 kubeClient kubeClient 72 73 pendingItems map[string]deltaItem 74 deltasMu sync.Mutex 75 deltaSendMaxTries uint64 76 deltaItemSendMaxTries uint64 77 deltaRetryWait time.Duration 78 firstDeltaReport bool 79 } 80 81 func (c *Controller) Run(ctx context.Context) error { 82 c.log.Info("running") 83 defer c.log.Infof("stopping") 84 85 // Wait for initial deltas sync before starting deltas send loop. 86 select { 87 case <-ctx.Done(): 88 return ctx.Err() 89 case <-time.After(c.cfg.InitialDeltay): 90 } 91 92 t := time.NewTicker(c.cfg.Interval) 93 defer t.Stop() 94 95 for { 96 select { 97 case <-ctx.Done(): 98 return ctx.Err() 99 case <-t.C: 100 if err := c.process(ctx); err != nil { 101 return err 102 } 103 } 104 } 105 } 106 107 func (c *Controller) RequiredTypes() []reflect.Type { 108 return []reflect.Type{ 109 reflect.TypeOf(&corev1.Pod{}), 110 reflect.TypeOf(&corev1.Namespace{}), 111 reflect.TypeOf(&corev1.Service{}), 112 reflect.TypeOf(&corev1.Node{}), 113 reflect.TypeOf(&appsv1.Deployment{}), 114 reflect.TypeOf(&appsv1.ReplicaSet{}), 115 reflect.TypeOf(&appsv1.DaemonSet{}), 116 reflect.TypeOf(&appsv1.StatefulSet{}), 117 reflect.TypeOf(&rbacv1.ClusterRoleBinding{}), 118 reflect.TypeOf(&rbacv1.RoleBinding{}), 119 reflect.TypeOf(&rbacv1.ClusterRole{}), 120 reflect.TypeOf(&rbacv1.Role{}), 121 reflect.TypeOf(&batchv1.Job{}), 122 reflect.TypeOf(&batchv1.CronJob{}), 123 reflect.TypeOf(&batchv1beta1.CronJob{}), 124 reflect.TypeOf(&networkingv1.Ingress{}), 125 reflect.TypeOf(&networkingv1.NetworkPolicy{}), 126 } 127 } 128 129 func (c *Controller) OnAdd(obj kube.Object) { 130 c.recordDeltaEvent(castaipb.KubernetesDeltaItemEvent_DELTA_ADD, obj) 131 } 132 133 func (c *Controller) OnUpdate(obj kube.Object) { 134 c.recordDeltaEvent(castaipb.KubernetesDeltaItemEvent_DELTA_UPDATE, obj) 135 } 136 137 func (c *Controller) OnDelete(obj kube.Object) { 138 c.recordDeltaEvent(castaipb.KubernetesDeltaItemEvent_DELTA_REMOVE, obj) 139 } 140 141 func (c *Controller) process(ctx context.Context) error { 142 pendingDeltas := c.popPendingItems() 143 144 if err := withExponentialRetry(ctx, c.log, func() error { 145 return c.sendDeltas(ctx, pendingDeltas) 146 }, c.deltaSendMaxTries); err != nil { 147 if c.firstDeltaReport { 148 // If we fail to send initial delta controller if be terminated and start again. 149 return fmt.Errorf("sending initial deltas: %w", err) 150 } 151 c.log.Errorf("sending deltas: %v", err) 152 return nil 153 } 154 155 if c.firstDeltaReport { 156 c.firstDeltaReport = false 157 } 158 return nil 159 } 160 161 func (c *Controller) sendDeltas(ctx context.Context, pendingDeltas []deltaItem) error { 162 if len(pendingDeltas) == 0 { 163 return nil 164 } 165 start := time.Now() 166 167 // Cancel context to close stream after deltas are sent. 168 ctx, cancel := context.WithTimeout(ctx, c.cfg.SendTimeout) 169 defer cancel() 170 171 deltaID := uuid.NewString() 172 meta := []string{ 173 "x-delta-id", deltaID, 174 "x-delta-count", strconv.Itoa(len(pendingDeltas)), 175 } 176 if c.firstDeltaReport { 177 meta = append(meta, "x-delta-full-snapshot", "true") 178 } 179 180 ctx = metadata.AppendToOutgoingContext(ctx, meta...) 181 var opts []grpc.CallOption 182 if c.cfg.UseCompression { 183 opts = append(opts, grpc.UseCompressor(gzip.Name)) 184 } 185 deltaStream, err := c.castaiClient.KubernetesDeltaIngest(ctx, opts...) 186 if err != nil && !errors.Is(err, context.Canceled) { 187 return err 188 } 189 defer func() { 190 _ = deltaStream.CloseSend() 191 }() 192 193 var sentDeltasCount int 194 for _, item := range pendingDeltas { 195 item := item 196 pbItem := c.toCastaiDelta(item) 197 if err := c.sendDeltaItem(ctx, deltaStream, pbItem); err != nil { 198 // Return any remaining items back to pending list. 199 c.upsertPendingItems(pendingDeltas[sentDeltasCount:]) 200 return err 201 } 202 sentDeltasCount++ 203 } 204 c.log.Infof("sent deltas, id=%v, count=%d/%d, duration=%v", deltaID, len(pendingDeltas), sentDeltasCount, time.Since(start)) 205 return nil 206 } 207 208 func (c *Controller) sendDeltaItem(ctx context.Context, stream castaipb.RuntimeSecurityAgentAPI_KubernetesDeltaIngestClient, item *castaipb.KubernetesDeltaItem) error { 209 return withExponentialRetry(ctx, c.log, func() error { 210 if err := stream.Send(item); err != nil { 211 if !isRetryableErr(err) { 212 return backoff.Permanent(err) 213 } 214 return fmt.Errorf("sending delta item: %w", err) 215 } 216 if _, err := stream.Recv(); err != nil { 217 if !isRetryableErr(err) { 218 return backoff.Permanent(err) 219 } 220 return fmt.Errorf("receiving delta ack: %w", err) 221 } 222 return nil 223 }, c.deltaItemSendMaxTries) 224 } 225 226 func withExponentialRetry(ctx context.Context, log *logging.Logger, fn func() error, max uint64) error { 227 return backoff.RetryNotify(fn, backoff.WithContext( 228 backoff.WithMaxRetries( 229 backoff.NewExponentialBackOff(), max, 230 ), ctx, 231 ), func(err error, duration time.Duration) { 232 if err != nil { 233 log.Warnf("action failed, duration=%v: %v", duration, err) 234 } 235 }) 236 } 237 238 func isRetryableErr(err error) bool { 239 if errors.Is(err, io.EOF) { 240 return false 241 } 242 if errors.Is(err, context.Canceled) { 243 return false 244 } 245 if errors.Is(err, context.DeadlineExceeded) { 246 return false 247 } 248 return true 249 } 250 251 func (c *Controller) recordDeltaEvent(action castaipb.KubernetesDeltaItemEvent, obj kube.Object) { 252 c.deltasMu.Lock() 253 defer c.deltasMu.Unlock() 254 255 c.pendingItems[string(obj.GetUID())] = deltaItem{ 256 object: obj, 257 action: action, 258 } 259 } 260 261 func (c *Controller) popPendingItems() []deltaItem { 262 c.deltasMu.Lock() 263 defer c.deltasMu.Unlock() 264 265 values := lo.Values(c.pendingItems) 266 c.pendingItems = map[string]deltaItem{} 267 268 return values 269 } 270 271 func (c *Controller) upsertPendingItems(items []deltaItem) { 272 c.deltasMu.Lock() 273 defer c.deltasMu.Unlock() 274 275 for _, item := range items { 276 key := string(item.object.GetUID()) 277 if v, ok := c.pendingItems[key]; ok { 278 item.action = v.action 279 c.pendingItems[key] = item 280 } else { 281 c.pendingItems[key] = item 282 } 283 } 284 } 285 286 func (c *Controller) toCastaiDelta(item deltaItem) *castaipb.KubernetesDeltaItem { 287 obj := item.object 288 objectUID := string(obj.GetUID()) 289 290 ownerUID := c.kubeClient.GetOwnerUID(obj) 291 containers, status, err := getContainersAndStatus(obj) 292 if err != nil { 293 c.log.Errorf("getting object status json for `%s`: %v", objectUID, err) 294 } 295 296 spec, err := getObjectSpec(obj) 297 if err != nil { 298 c.log.Errorf("getting object spec json for `%s`: %v", objectUID, err) 299 } 300 301 gvr := obj.GetObjectKind().GroupVersionKind() 302 303 return &castaipb.KubernetesDeltaItem{ 304 Event: item.action, 305 ObjectUid: objectUID, 306 ObjectName: obj.GetName(), 307 ObjectNamespace: obj.GetNamespace(), 308 ObjectKind: gvr.Kind, 309 ObjectApiVersion: gvr.GroupVersion().String(), 310 ObjectCreatedAt: timestamppb.New(obj.GetCreationTimestamp().UTC()), 311 ObjectContainers: containers, 312 ObjectOwnerUid: ownerUID, 313 ObjectLabels: obj.GetLabels(), 314 ObjectAnnotations: getAnnotations(obj), 315 ObjectStatus: status, 316 ObjectSpec: spec, 317 } 318 } 319 320 type deltaItem struct { 321 object kube.Object 322 action castaipb.KubernetesDeltaItemEvent 323 } 324 325 func getContainersAndStatus(obj kube.Object) ([]*castaipb.Container, []byte, error) { 326 var containers []corev1.Container 327 appendContainers := func(podSpec corev1.PodSpec) { 328 containers = append(containers, podSpec.Containers...) 329 containers = append(containers, podSpec.InitContainers...) 330 } 331 var st []byte 332 var err error 333 switch v := obj.(type) { 334 case *batchv1.Job: 335 st, err = json.Marshal(v.Status) 336 appendContainers(v.Spec.Template.Spec) 337 case *batchv1.CronJob: 338 st, err = json.Marshal(v.Status) 339 appendContainers(v.Spec.JobTemplate.Spec.Template.Spec) 340 case *corev1.Pod: 341 st, err = json.Marshal(v.Status) 342 appendContainers(v.Spec) 343 case *appsv1.Deployment: 344 st, err = json.Marshal(v.Status) 345 appendContainers(v.Spec.Template.Spec) 346 case *appsv1.StatefulSet: 347 st, err = json.Marshal(v.Status) 348 appendContainers(v.Spec.Template.Spec) 349 case *appsv1.DaemonSet: 350 st, err = json.Marshal(v.Status) 351 appendContainers(v.Spec.Template.Spec) 352 case *networkingv1.Ingress: 353 st, err = json.Marshal(v.Status) 354 case *corev1.Service: 355 st, err = json.Marshal(v.Status) 356 case *corev1.Node: 357 st, err = json.Marshal(v.Status) 358 default: 359 return nil, nil, nil 360 } 361 362 res := make([]*castaipb.Container, len(containers)) 363 for i, cont := range containers { 364 res[i] = &castaipb.Container{ 365 Name: cont.Name, 366 ImageName: cont.Image, 367 } 368 } 369 return res, st, err 370 } 371 372 func getAnnotations(obj kube.Object) map[string]string { 373 switch v := obj.(type) { 374 case *corev1.Service, *networkingv1.Ingress: 375 return v.GetAnnotations() 376 default: 377 return nil 378 } 379 } 380 381 func getObjectSpec(obj kube.Object) ([]byte, error) { 382 switch v := obj.(type) { 383 case *networkingv1.Ingress: 384 return json.Marshal(v.Spec) 385 case *corev1.Service: 386 return json.Marshal(v.Spec) 387 case *appsv1.Deployment: 388 return json.Marshal(v.Spec) 389 case *appsv1.StatefulSet: 390 return json.Marshal(v.Spec) 391 case *appsv1.DaemonSet: 392 return json.Marshal(v.Spec) 393 default: 394 return nil, nil 395 } 396 }