github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/cmd/controller/state/imagescan/controller.go (about)

     1  package imagescan
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"reflect"
     8  	"sort"
     9  	"sync"
    10  	"time"
    11  
    12  	castaipb "github.com/castai/kvisor/api/v1/runtime"
    13  	imagescanconfig "github.com/castai/kvisor/cmd/imagescan/config"
    14  	"github.com/castai/kvisor/pkg/metrics"
    15  	"google.golang.org/grpc"
    16  	corev1 "k8s.io/api/core/v1"
    17  
    18  	"github.com/castai/kvisor/cmd/controller/kube"
    19  	"github.com/castai/kvisor/pkg/logging"
    20  	"github.com/samber/lo"
    21  )
    22  
// castaiClient is the subset of the remote gRPC API this controller depends on
// for exchanging image sync state.
type castaiClient interface {
	// GetSyncState fetches the remote sync state. In this file it is called with
	// the IDs of images that are not yet marked as scanned locally.
	GetSyncState(ctx context.Context, in *castaipb.GetSyncStateRequest, opts ...grpc.CallOption) (*castaipb.GetSyncStateResponse, error)
	// UpdateSyncState pushes image changes (owners, scan status, scan errors)
	// to the remote side.
	UpdateSyncState(ctx context.Context, in *castaipb.UpdateSyncStateRequest, opts ...grpc.CallOption) (*castaipb.UpdateSyncStateResponse, error)
}
    27  
// Config holds the image scan controller settings.
//
// Within this file the controller reads ScanInterval (period of the scan
// ticker), ScanTimeout (deadline applied to each image scan),
// MaxConcurrentScans (upper bound on images scanned per tick) and InitDelay
// (wait before the first scan). The remaining fields are not read here;
// presumably they are consumed by the scan job setup elsewhere — verify
// against their users.
type Config struct {
	Enabled                   bool
	CastaiSecretRefName       string
	ScanInterval              time.Duration `validate:"required"`
	ScanTimeout               time.Duration
	MaxConcurrentScans        int64 `validate:"required"`
	ScanJobImagePullPolicy    string
	Mode                      string
	CPURequest                string
	CPULimit                  string
	MemoryRequest             string
	MemoryLimit               string
	ProfileEnabled            bool
	PhlareEnabled             bool
	PrivateRegistryPullSecret string
	ServiceAccount            string
	InitDelay                 time.Duration
	CastaiGRPCAddress         string
	CastaiClusterID           string
	CastaiGrpcInsecure        bool
	ImageScanBlobsCacheURL    string
	CloudProvider             string
}
    51  
// ImageScanImage carries the image pull policy setting. Its struct tags map
// the field to the IMAGE_SCAN_IMAGE_PULL_POLICY environment variable and the
// "pullPolicy" YAML key.
// NOTE(review): not referenced anywhere else in this file — confirm it is
// still used.
type ImageScanImage struct {
	PullPolicy string `envconfig:"IMAGE_SCAN_IMAGE_PULL_POLICY" yaml:"pullPolicy"`
}
    55  
    56  func NewController(
    57  	log *logging.Logger,
    58  	cfg Config,
    59  	imageScanner imageScanner,
    60  	client castaiClient,
    61  	kubeController kubeClient,
    62  ) *Controller {
    63  	log = log.WithField("component", "imagescan")
    64  	return &Controller{
    65  		imageScanner:      imageScanner,
    66  		client:            client,
    67  		kubeController:    kubeController,
    68  		delta:             newDeltaState(kubeController),
    69  		log:               log,
    70  		cfg:               cfg,
    71  		timeGetter:        timeGetter(),
    72  		initialScansDelay: cfg.InitDelay,
    73  	}
    74  }
    75  
    76  func timeGetter() func() time.Time {
    77  	return func() time.Time {
    78  		return time.Now().UTC()
    79  	}
    80  }
    81  
// Controller keeps track of images through kube object events (see OnAdd,
// OnUpdate, OnDelete) and, on every scan tick, reports image state to the
// remote side and schedules scans for images that are still pending.
type Controller struct {
	delta          *deltaState
	imageScanner   imageScanner
	client         castaiClient
	kubeController kubeClient
	log            *logging.Logger
	cfg            Config
	timeGetter     func() time.Time

	// initialScansDelay is the wait applied by Run before the first scan;
	// fullSnapshotSent records whether a full images snapshot was already
	// delivered (it is reset when the remote requests a full resync).
	initialScansDelay time.Duration
	fullSnapshotSent  bool
}
    94  
    95  func (c *Controller) RequiredTypes() []reflect.Type {
    96  	return []reflect.Type{
    97  		reflect.TypeOf(&corev1.Pod{}),
    98  		reflect.TypeOf(&corev1.Node{}),
    99  	}
   100  }
   101  
   102  func (c *Controller) Run(ctx context.Context) error {
   103  	c.log.Info("running")
   104  	defer c.log.Infof("stopping")
   105  
   106  	// Before starting normal scans and deltas processing
   107  	// we need to spend some time processing only deltas to make sure
   108  	// we have full images view.
   109  	if err := c.waitInitialDeltaQueueSync(ctx); err != nil {
   110  		return err
   111  	}
   112  
   113  	scanTicker := time.NewTicker(c.cfg.ScanInterval)
   114  	defer scanTicker.Stop()
   115  	for {
   116  		select {
   117  		case <-ctx.Done():
   118  			return ctx.Err()
   119  		case <-scanTicker.C:
   120  			if err := c.scheduleScans(ctx); err != nil {
   121  				c.log.Errorf("images scan failed: %v", err)
   122  			}
   123  		}
   124  	}
   125  }
   126  
   127  func (c *Controller) waitInitialDeltaQueueSync(ctx context.Context) error {
   128  	waitTimeout := time.After(c.initialScansDelay)
   129  	for {
   130  		select {
   131  		case <-ctx.Done():
   132  			return ctx.Err()
   133  		case <-waitTimeout:
   134  			return nil
   135  		}
   136  	}
   137  }
   138  
// OnAdd handles an object-added event by upserting the object into the delta
// state.
func (c *Controller) OnAdd(obj kube.Object) {
	c.delta.Upsert(obj)
}
   142  
// OnUpdate handles an object-updated event by upserting the object into the
// delta state.
func (c *Controller) OnUpdate(obj kube.Object) {
	c.delta.Upsert(obj)
}
   146  
// OnDelete handles an object-deleted event by removing the object from the
// delta state.
func (c *Controller) OnDelete(obj kube.Object) {
	c.delta.Delete(obj)
}
   150  
   151  func (c *Controller) scheduleScans(ctx context.Context) (rerr error) {
   152  	c.syncFromRemoteState(ctx)
   153  
   154  	images := c.delta.GetImagesCopy()
   155  	if err := c.updateImageStatuses(ctx, images); err != nil {
   156  		c.log.Errorf("sending images resources changes: %v", err)
   157  	}
   158  	// Scan pending images.
   159  	pendingImages := c.findPendingImages(images)
   160  	concurrentScans := int(c.cfg.MaxConcurrentScans)
   161  	imagesForScan := pendingImages
   162  	if len(imagesForScan) > concurrentScans {
   163  		imagesForScan = imagesForScan[:concurrentScans]
   164  	}
   165  
   166  	if l := len(imagesForScan); l > 0 {
   167  		c.log.Infof("scheduling %d images scans", l)
   168  		if err := c.scanImages(ctx, imagesForScan); err != nil {
   169  			return err
   170  		}
   171  		c.log.Info("images scan finished")
   172  	} else {
   173  		c.log.Debug("skipping images scan, no pending images")
   174  	}
   175  
   176  	return nil
   177  }
   178  
   179  func (c *Controller) findPendingImages(images []*image) []*image {
   180  	now := c.timeGetter()
   181  
   182  	privateImagesCount := lo.CountBy(images, func(v *image) bool {
   183  		return isImagePrivate(v)
   184  	})
   185  	pendingImages := lo.Filter(images, func(v *image, _ int) bool {
   186  		return isImagePending(v, now)
   187  	})
   188  	sort.Slice(pendingImages, func(i, j int) bool {
   189  		return pendingImages[i].failures < pendingImages[j].failures
   190  	})
   191  	c.log.Infof("found %d images, pending images %d", len(images), len(pendingImages))
   192  	metrics.ControllerImagesCount.Set(float64(len(images)))
   193  	metrics.ControllerPendingImagesCount.Set(float64(len(pendingImages)))
   194  	if privateImagesCount > 0 {
   195  		c.log.Warnf("skipping %d private images", privateImagesCount)
   196  	}
   197  	return pendingImages
   198  }
   199  
   200  func (c *Controller) scanImages(ctx context.Context, images []*image) error {
   201  	var wg sync.WaitGroup
   202  	for _, img := range images {
   203  		if img.name == "" {
   204  			return fmt.Errorf("no image name set, image_id=%s", img.id)
   205  		}
   206  
   207  		wg.Add(1)
   208  		go func(img *image) {
   209  			defer wg.Done()
   210  
   211  			if ctx.Err() != nil {
   212  				return
   213  			}
   214  
   215  			ctx, cancel := context.WithTimeout(ctx, c.cfg.ScanTimeout)
   216  			defer cancel()
   217  
   218  			log := c.log.WithField("image", img.name)
   219  			log.Info("scanning image")
   220  			if err := c.scanImage(ctx, img); err != nil {
   221  				log.Errorf("image scan failed: %v", err)
   222  				parsedErr := parseErrorFromLog(err)
   223  				c.delta.SetImageScanError(img.key, parsedErr)
   224  				if err := c.updateImageStatusAsFailed(ctx, img, parsedErr); err != nil {
   225  					c.log.Errorf("sending images resources changes: %v", err)
   226  				}
   227  				return
   228  			}
   229  			log.Info("image scan finished")
   230  			c.delta.SetImageScanned(img.key)
   231  		}(img)
   232  	}
   233  
   234  	done := make(chan struct{})
   235  	go func() {
   236  		wg.Wait()
   237  		close(done)
   238  	}()
   239  
   240  	select {
   241  	case <-done:
   242  		return nil
   243  	case <-ctx.Done():
   244  		return ctx.Err()
   245  	}
   246  }
   247  
   248  func (c *Controller) scanImage(ctx context.Context, img *image) (rerr error) {
   249  	ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
   250  	defer cancel()
   251  
   252  	agentImageDetails, found := c.kubeController.GetKvisorAgentImageDetails()
   253  	if !found {
   254  		return errors.New("kvisor image details not found")
   255  	}
   256  
   257  	return c.imageScanner.ScanImage(ctx, ScanImageParams{
   258  		ImageName:                   img.name,
   259  		ImageID:                     img.id,
   260  		ContainerRuntime:            string(img.containerRuntime),
   261  		Mode:                        string(imagescanconfig.ModeRemote),
   262  		ResourceIDs:                 lo.Keys(img.owners),
   263  		DeleteFinishedJob:           true,
   264  		WaitForCompletion:           true,
   265  		WaitDurationAfterCompletion: 30 * time.Second,
   266  		Architecture:                img.architecture,
   267  		Os:                          img.os,
   268  		ScanImageDetails:            agentImageDetails,
   269  	})
   270  }
   271  
   272  func (c *Controller) updateImageStatuses(ctx context.Context, images []*image) error {
   273  	if c.fullSnapshotSent {
   274  		images = lo.Filter(images, func(item *image, index int) bool {
   275  			return item.ownerChangedAt.After(item.resourcesUpdatedAt)
   276  		})
   277  	}
   278  	if len(images) == 0 {
   279  		return nil
   280  	}
   281  	now := c.timeGetter()
   282  	var imagesChanges []*castaipb.Image
   283  	for _, img := range images {
   284  		resourceIds := lo.Keys(img.owners)
   285  
   286  		var updatedStatus castaipb.ImageScanStatus
   287  		if isImagePending(img, now) {
   288  			updatedStatus = castaipb.ImageScanStatus_IMAGE_SCAN_STATUS_PENDING
   289  		}
   290  		imagesChanges = append(imagesChanges, &castaipb.Image{
   291  			Id:           img.id,
   292  			Architecture: img.architecture,
   293  			ResourceIds:  resourceIds,
   294  			Name:         img.name,
   295  			ScanStatus:   updatedStatus,
   296  		})
   297  	}
   298  
   299  	c.log.Info("sending images sync state")
   300  	report := &castaipb.UpdateSyncStateRequest{
   301  		FullSnapshot: !c.fullSnapshotSent,
   302  		Images:       imagesChanges,
   303  	}
   304  	_, err := c.client.UpdateSyncState(ctx, report)
   305  	if err != nil {
   306  		return err
   307  	}
   308  	c.delta.SetResourcesUpdatedAt(images, now)
   309  	c.fullSnapshotSent = true
   310  	return nil
   311  }
   312  
   313  func (c *Controller) updateImageStatusAsFailed(ctx context.Context, image *image, scanJobError error) error {
   314  	if image == nil {
   315  		return errors.New("image is missing")
   316  	}
   317  	var errorMsg string
   318  	if scanJobError != nil {
   319  		errorMsg = scanJobError.Error()
   320  	}
   321  
   322  	updatedImage := &castaipb.Image{
   323  		Id:           image.id,
   324  		Name:         image.name,
   325  		Architecture: image.architecture,
   326  		ScanStatus:   castaipb.ImageScanStatus_IMAGE_SCAN_STATUS_SCAN_ERROR,
   327  		ScanError:    errorMsg,
   328  	}
   329  
   330  	c.log.Info("sending image failed status")
   331  	req := &castaipb.UpdateSyncStateRequest{
   332  		Images: []*castaipb.Image{updatedImage},
   333  	}
   334  	_, err := c.client.UpdateSyncState(ctx, req)
   335  	return err
   336  }
   337  
   338  func (c *Controller) syncFromRemoteState(ctx context.Context) {
   339  	images := c.delta.GetImagesCopy()
   340  
   341  	now := c.timeGetter().UTC()
   342  	imagesWithNotSyncedState := lo.Filter(images, func(item *image, index int) bool {
   343  		return !item.scanned && item.lastRemoteSyncAt.Before(now.Add(-10*time.Minute))
   344  	})
   345  
   346  	if len(imagesWithNotSyncedState) == 0 {
   347  		return
   348  	}
   349  
   350  	imagesIds := lo.Map(imagesWithNotSyncedState, func(item *image, index int) string {
   351  		return item.id
   352  	})
   353  	c.log.Debugf("sync images state from remote")
   354  	resp, err := c.client.GetSyncState(ctx, &castaipb.GetSyncStateRequest{ImageIds: imagesIds})
   355  	if err != nil {
   356  		c.log.Errorf("getting images sync state from remote: %v", err)
   357  		return
   358  	}
   359  	if resp.Images == nil {
   360  		return
   361  	}
   362  
   363  	// Set sync state for all these images to prevent constant api calls.
   364  	c.delta.UpdateRemoteSyncedAt(imagesWithNotSyncedState, now)
   365  
   366  	// Set images as scanned from remote response.
   367  	c.delta.SetScannedImages(resp.Images.Images)
   368  
   369  	// If full resources resync is required it will be sent during next scheduled scan.
   370  	if resp.Images.FullResyncRequired {
   371  		c.fullSnapshotSent = false
   372  	}
   373  	c.log.Infof("images updated from remote state, full_resync=%v, scanned_images=%d", resp.Images.FullResyncRequired, len(resp.Images.Images))
   374  }
   375  
   376  func isImagePending(v *image, now time.Time) bool {
   377  	return !v.scanned &&
   378  		len(v.owners) > 0 &&
   379  		!isImagePrivate(v) &&
   380  		(v.nextScan.IsZero() || v.nextScan.Before(now))
   381  }
   382  
// isImagePrivate reports whether the last scan error recorded for v is, or
// wraps, errPrivateImage.
func isImagePrivate(v *image) bool {
	return errors.Is(v.lastScanErr, errPrivateImage)
}