github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/cmd/controller/state/imagescan/controller.go (about) 1 package imagescan 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "reflect" 8 "sort" 9 "sync" 10 "time" 11 12 castaipb "github.com/castai/kvisor/api/v1/runtime" 13 imagescanconfig "github.com/castai/kvisor/cmd/imagescan/config" 14 "github.com/castai/kvisor/pkg/metrics" 15 "google.golang.org/grpc" 16 corev1 "k8s.io/api/core/v1" 17 18 "github.com/castai/kvisor/cmd/controller/kube" 19 "github.com/castai/kvisor/pkg/logging" 20 "github.com/samber/lo" 21 ) 22 23 type castaiClient interface { 24 GetSyncState(ctx context.Context, in *castaipb.GetSyncStateRequest, opts ...grpc.CallOption) (*castaipb.GetSyncStateResponse, error) 25 UpdateSyncState(ctx context.Context, in *castaipb.UpdateSyncStateRequest, opts ...grpc.CallOption) (*castaipb.UpdateSyncStateResponse, error) 26 } 27 28 type Config struct { 29 Enabled bool 30 CastaiSecretRefName string 31 ScanInterval time.Duration `validate:"required"` 32 ScanTimeout time.Duration 33 MaxConcurrentScans int64 `validate:"required"` 34 ScanJobImagePullPolicy string 35 Mode string 36 CPURequest string 37 CPULimit string 38 MemoryRequest string 39 MemoryLimit string 40 ProfileEnabled bool 41 PhlareEnabled bool 42 PrivateRegistryPullSecret string 43 ServiceAccount string 44 InitDelay time.Duration 45 CastaiGRPCAddress string 46 CastaiClusterID string 47 CastaiGrpcInsecure bool 48 ImageScanBlobsCacheURL string 49 CloudProvider string 50 } 51 52 type ImageScanImage struct { 53 PullPolicy string `envconfig:"IMAGE_SCAN_IMAGE_PULL_POLICY" yaml:"pullPolicy"` 54 } 55 56 func NewController( 57 log *logging.Logger, 58 cfg Config, 59 imageScanner imageScanner, 60 client castaiClient, 61 kubeController kubeClient, 62 ) *Controller { 63 log = log.WithField("component", "imagescan") 64 return &Controller{ 65 imageScanner: imageScanner, 66 client: client, 67 kubeController: kubeController, 68 delta: newDeltaState(kubeController), 69 log: log, 70 cfg: cfg, 71 timeGetter: timeGetter(), 72 initialScansDelay: cfg.InitDelay, 73 } 74 } 75 76 func timeGetter() func() time.Time { 77 return func() time.Time { 78 return time.Now().UTC() 79 } 80 } 81 82 type Controller struct { 83 delta *deltaState 84 imageScanner imageScanner 85 client castaiClient 86 kubeController kubeClient 87 log *logging.Logger 88 cfg Config 89 timeGetter func() time.Time 90 91 initialScansDelay time.Duration 92 fullSnapshotSent bool 93 } 94 95 func (c *Controller) RequiredTypes() []reflect.Type { 96 return []reflect.Type{ 97 reflect.TypeOf(&corev1.Pod{}), 98 reflect.TypeOf(&corev1.Node{}), 99 } 100 } 101 102 func (c *Controller) Run(ctx context.Context) error { 103 c.log.Info("running") 104 defer c.log.Infof("stopping") 105 106 // Before starting normal scans and deltas processing 107 // we need to spend some time processing only deltas to make sure 108 // we have full images view. 109 if err := c.waitInitialDeltaQueueSync(ctx); err != nil { 110 return err 111 } 112 113 scanTicker := time.NewTicker(c.cfg.ScanInterval) 114 defer scanTicker.Stop() 115 for { 116 select { 117 case <-ctx.Done(): 118 return ctx.Err() 119 case <-scanTicker.C: 120 if err := c.scheduleScans(ctx); err != nil { 121 c.log.Errorf("images scan failed: %v", err) 122 } 123 } 124 } 125 } 126 127 func (c *Controller) waitInitialDeltaQueueSync(ctx context.Context) error { 128 waitTimeout := time.After(c.initialScansDelay) 129 for { 130 select { 131 case <-ctx.Done(): 132 return ctx.Err() 133 case <-waitTimeout: 134 return nil 135 } 136 } 137 } 138 139 func (c *Controller) OnAdd(obj kube.Object) { 140 c.delta.Upsert(obj) 141 } 142 143 func (c *Controller) OnUpdate(obj kube.Object) { 144 c.delta.Upsert(obj) 145 } 146 147 func (c *Controller) OnDelete(obj kube.Object) { 148 c.delta.Delete(obj) 149 } 150 151 func (c *Controller) scheduleScans(ctx context.Context) (rerr error) { 152 c.syncFromRemoteState(ctx) 153 154 images := c.delta.GetImagesCopy() 155 if err := c.updateImageStatuses(ctx, images); err != nil { 156 c.log.Errorf("sending images resources changes: %v", err) 157 } 158 // Scan pending images. 159 pendingImages := c.findPendingImages(images) 160 concurrentScans := int(c.cfg.MaxConcurrentScans) 161 imagesForScan := pendingImages 162 if len(imagesForScan) > concurrentScans { 163 imagesForScan = imagesForScan[:concurrentScans] 164 } 165 166 if l := len(imagesForScan); l > 0 { 167 c.log.Infof("scheduling %d images scans", l) 168 if err := c.scanImages(ctx, imagesForScan); err != nil { 169 return err 170 } 171 c.log.Info("images scan finished") 172 } else { 173 c.log.Debug("skipping images scan, no pending images") 174 } 175 176 return nil 177 } 178 179 func (c *Controller) findPendingImages(images []*image) []*image { 180 now := c.timeGetter() 181 182 privateImagesCount := lo.CountBy(images, func(v *image) bool { 183 return isImagePrivate(v) 184 }) 185 pendingImages := lo.Filter(images, func(v *image, _ int) bool { 186 return isImagePending(v, now) 187 }) 188 sort.Slice(pendingImages, func(i, j int) bool { 189 return pendingImages[i].failures < pendingImages[j].failures 190 }) 191 c.log.Infof("found %d images, pending images %d", len(images), len(pendingImages)) 192 metrics.ControllerImagesCount.Set(float64(len(images))) 193 metrics.ControllerPendingImagesCount.Set(float64(len(pendingImages))) 194 if privateImagesCount > 0 { 195 c.log.Warnf("skipping %d private images", privateImagesCount) 196 } 197 return pendingImages 198 } 199 200 func (c *Controller) scanImages(ctx context.Context, images []*image) error { 201 var wg sync.WaitGroup 202 for _, img := range images { 203 if img.name == "" { 204 return fmt.Errorf("no image name set, image_id=%s", img.id) 205 } 206 207 wg.Add(1) 208 go func(img *image) { 209 defer wg.Done() 210 211 if ctx.Err() != nil { 212 return 213 } 214 215 ctx, cancel := context.WithTimeout(ctx, c.cfg.ScanTimeout) 216 defer cancel() 217 218 log := c.log.WithField("image", img.name) 219 log.Info("scanning image") 220 if err := c.scanImage(ctx, img); err != nil { 221 log.Errorf("image scan failed: %v", err) 222 parsedErr := parseErrorFromLog(err) 223 c.delta.SetImageScanError(img.key, parsedErr) 224 if err := c.updateImageStatusAsFailed(ctx, img, parsedErr); err != nil { 225 c.log.Errorf("sending images resources changes: %v", err) 226 } 227 return 228 } 229 log.Info("image scan finished") 230 c.delta.SetImageScanned(img.key) 231 }(img) 232 } 233 234 done := make(chan struct{}) 235 go func() { 236 wg.Wait() 237 close(done) 238 }() 239 240 select { 241 case <-done: 242 return nil 243 case <-ctx.Done(): 244 return ctx.Err() 245 } 246 } 247 248 func (c *Controller) scanImage(ctx context.Context, img *image) (rerr error) { 249 ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) 250 defer cancel() 251 252 agentImageDetails, found := c.kubeController.GetKvisorAgentImageDetails() 253 if !found { 254 return errors.New("kvisor image details not found") 255 } 256 257 return c.imageScanner.ScanImage(ctx, ScanImageParams{ 258 ImageName: img.name, 259 ImageID: img.id, 260 ContainerRuntime: string(img.containerRuntime), 261 Mode: string(imagescanconfig.ModeRemote), 262 ResourceIDs: lo.Keys(img.owners), 263 DeleteFinishedJob: true, 264 WaitForCompletion: true, 265 WaitDurationAfterCompletion: 30 * time.Second, 266 Architecture: img.architecture, 267 Os: img.os, 268 ScanImageDetails: agentImageDetails, 269 }) 270 } 271 272 func (c *Controller) updateImageStatuses(ctx context.Context, images []*image) error { 273 if c.fullSnapshotSent { 274 images = lo.Filter(images, func(item *image, index int) bool { 275 return item.ownerChangedAt.After(item.resourcesUpdatedAt) 276 }) 277 } 278 if len(images) == 0 { 279 return nil 280 } 281 now := c.timeGetter() 282 var imagesChanges []*castaipb.Image 283 for _, img := range images { 284 resourceIds := lo.Keys(img.owners) 285 286 var updatedStatus castaipb.ImageScanStatus 287 if isImagePending(img, now) { 288 updatedStatus = castaipb.ImageScanStatus_IMAGE_SCAN_STATUS_PENDING 289 } 290 imagesChanges = append(imagesChanges, &castaipb.Image{ 291 Id: img.id, 292 Architecture: img.architecture, 293 ResourceIds: resourceIds, 294 Name: img.name, 295 ScanStatus: updatedStatus, 296 }) 297 } 298 299 c.log.Info("sending images sync state") 300 report := &castaipb.UpdateSyncStateRequest{ 301 FullSnapshot: !c.fullSnapshotSent, 302 Images: imagesChanges, 303 } 304 _, err := c.client.UpdateSyncState(ctx, report) 305 if err != nil { 306 return err 307 } 308 c.delta.SetResourcesUpdatedAt(images, now) 309 c.fullSnapshotSent = true 310 return nil 311 } 312 313 func (c *Controller) updateImageStatusAsFailed(ctx context.Context, image *image, scanJobError error) error { 314 if image == nil { 315 return errors.New("image is missing") 316 } 317 var errorMsg string 318 if scanJobError != nil { 319 errorMsg = scanJobError.Error() 320 } 321 322 updatedImage := &castaipb.Image{ 323 Id: image.id, 324 Name: image.name, 325 Architecture: image.architecture, 326 ScanStatus: castaipb.ImageScanStatus_IMAGE_SCAN_STATUS_SCAN_ERROR, 327 ScanError: errorMsg, 328 } 329 330 c.log.Info("sending image failed status") 331 req := &castaipb.UpdateSyncStateRequest{ 332 Images: []*castaipb.Image{updatedImage}, 333 } 334 _, err := c.client.UpdateSyncState(ctx, req) 335 return err 336 } 337 338 func (c *Controller) syncFromRemoteState(ctx context.Context) { 339 images := c.delta.GetImagesCopy() 340 341 now := c.timeGetter().UTC() 342 imagesWithNotSyncedState := lo.Filter(images, func(item *image, index int) bool { 343 return !item.scanned && item.lastRemoteSyncAt.Before(now.Add(-10*time.Minute)) 344 }) 345 346 if len(imagesWithNotSyncedState) == 0 { 347 return 348 } 349 350 imagesIds := lo.Map(imagesWithNotSyncedState, func(item *image, index int) string { 351 return item.id 352 }) 353 c.log.Debugf("sync images state from remote") 354 resp, err := c.client.GetSyncState(ctx, &castaipb.GetSyncStateRequest{ImageIds: imagesIds}) 355 if err != nil { 356 c.log.Errorf("getting images sync state from remote: %v", err) 357 return 358 } 359 if resp.Images == nil { 360 return 361 } 362 363 // Set sync state for all these images to prevent constant api calls. 364 c.delta.UpdateRemoteSyncedAt(imagesWithNotSyncedState, now) 365 366 // Set images as scanned from remote response. 367 c.delta.SetScannedImages(resp.Images.Images) 368 369 // If full resources resync is required it will be sent during next scheduled scan. 370 if resp.Images.FullResyncRequired { 371 c.fullSnapshotSent = false 372 } 373 c.log.Infof("images updated from remote state, full_resync=%v, scanned_images=%d", resp.Images.FullResyncRequired, len(resp.Images.Images)) 374 } 375 376 func isImagePending(v *image, now time.Time) bool { 377 return !v.scanned && 378 len(v.owners) > 0 && 379 !isImagePrivate(v) && 380 (v.nextScan.IsZero() || v.nextScan.Before(now)) 381 } 382 383 func isImagePrivate(v *image) bool { 384 return errors.Is(v.lastScanErr, errPrivateImage) 385 }