github.com/Azure/aad-pod-identity@v1.8.17/pkg/mic/mic.go (about) 1 package mic 2 3 import ( 4 "context" 5 "fmt" 6 "os" 7 "reflect" 8 "sort" 9 "strings" 10 "sync" 11 "sync/atomic" 12 "time" 13 14 aadpodid "github.com/Azure/aad-pod-identity/pkg/apis/aadpodidentity" 15 "github.com/Azure/aad-pod-identity/pkg/cloudprovider" 16 "github.com/Azure/aad-pod-identity/pkg/crd" 17 "github.com/Azure/aad-pod-identity/pkg/filewatcher" 18 "github.com/Azure/aad-pod-identity/pkg/metrics" 19 "github.com/Azure/aad-pod-identity/pkg/pod" 20 "github.com/Azure/aad-pod-identity/pkg/stats" 21 "github.com/Azure/aad-pod-identity/pkg/utils" 22 "github.com/Azure/aad-pod-identity/version" 23 24 "github.com/fsnotify/fsnotify" 25 "golang.org/x/sync/semaphore" 26 corev1 "k8s.io/api/core/v1" 27 apierrors "k8s.io/apimachinery/pkg/api/errors" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 30 "k8s.io/client-go/informers" 31 "k8s.io/client-go/kubernetes" 32 "k8s.io/client-go/kubernetes/scheme" 33 typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" 34 "k8s.io/client-go/rest" 35 "k8s.io/client-go/tools/leaderelection" 36 "k8s.io/client-go/tools/leaderelection/resourcelock" 37 "k8s.io/client-go/tools/record" 38 "k8s.io/klog/v2" 39 ) 40 41 const ( 42 stopped = int32(0) 43 running = int32(1) 44 ) 45 46 // NodeGetter is an abstraction used to get Kubernetes node info. 47 type NodeGetter interface { 48 Get(name string) (*corev1.Node, error) 49 Start(<-chan struct{}) 50 } 51 52 // TypeUpgradeConfig - configuration aspects of type related changes required for client-go upgrade. 53 type TypeUpgradeConfig struct { 54 // Key in the config map which indicates if a type upgrade has been performed. 55 TypeUpgradeStatusKey string 56 EnableTypeUpgrade bool 57 } 58 59 // CMConfig - config map for aad-pod-identity 60 type CMConfig struct { 61 Namespace string 62 Name string 63 } 64 65 // LeaderElectionConfig - used to keep track of leader election config. 
66 type LeaderElectionConfig struct { 67 Namespace string 68 Name string 69 Duration time.Duration 70 Instance string 71 } 72 73 // UpdateUserMSIConfig - parameters for retrying cloudprovider's UpdateUserMSI function 74 type UpdateUserMSIConfig struct { 75 MaxRetry int 76 RetryInterval time.Duration 77 } 78 79 // Client has the required pointers to talk to the api server 80 // and interact with the CRD related data structure. 81 type Client struct { 82 CRDClient crd.ClientInt 83 CloudClient cloudprovider.ClientInt 84 PodClient pod.ClientInt 85 CloudConfigWatcher filewatcher.ClientInt 86 EventRecorder record.EventRecorder 87 EventChannel chan aadpodid.EventType 88 NodeClient NodeGetter 89 IsNamespaced bool 90 SyncLoopStarted bool 91 syncRetryInterval time.Duration 92 createDeleteBatch int64 93 ImmutableUserMSIsMap map[string]bool 94 identityAssignmentReconcileInterval time.Duration 95 96 syncing int32 // protect against concurrent sync's 97 98 leaderElector *leaderelection.LeaderElector 99 *LeaderElectionConfig 100 Reporter *metrics.Reporter 101 TypeUpgradeCfg *TypeUpgradeConfig 102 CMCfg *CMConfig 103 CMClient typedcorev1.ConfigMapInterface 104 } 105 106 // Config - MIC Config 107 type Config struct { 108 CloudCfgPath string 109 RestConfig *rest.Config 110 IsNamespaced bool 111 SyncRetryInterval time.Duration 112 LeaderElectionCfg *LeaderElectionConfig 113 CreateDeleteBatch int64 114 ImmutableUserMSIsList []string 115 CMcfg *CMConfig 116 TypeUpgradeCfg *TypeUpgradeConfig 117 UpdateUserMSICfg *UpdateUserMSIConfig 118 IdentityAssignmentReconcileInterval time.Duration 119 } 120 121 // ClientInt is an abstraction used to perform an MIC sync cycle. 
122 type ClientInt interface { 123 Start(exit <-chan struct{}) 124 Sync(exit <-chan struct{}) 125 } 126 127 type trackUserAssignedMSIIds struct { 128 addUserAssignedMSIIDs []string 129 removeUserAssignedMSIIDs []string 130 assignedIDsToCreate []aadpodid.AzureAssignedIdentity 131 assignedIDsToDelete []aadpodid.AzureAssignedIdentity 132 assignedIDsToUpdate []aadpodid.AzureAssignedIdentity 133 isvmss bool 134 } 135 136 // NewMICClient returns new mic client 137 func NewMICClient(cfg *Config) (*Client, error) { 138 klog.Infof("starting to create the pod identity client. Version: %v. Build date: %v", version.MICVersion, version.BuildDate) 139 140 clientSet := kubernetes.NewForConfigOrDie(cfg.RestConfig) 141 142 k8sVersion, err := clientSet.ServerVersion() 143 if err == nil { 144 klog.Infof("Kubernetes server version: %s", k8sVersion.String()) 145 } 146 147 informer := informers.NewSharedInformerFactory(clientSet, 30*time.Second) 148 149 cloudClient, err := cloudprovider.NewCloudProvider(cfg.CloudCfgPath, cfg.UpdateUserMSICfg.MaxRetry, cfg.UpdateUserMSICfg.RetryInterval) 150 if err != nil { 151 return nil, err 152 } 153 klog.V(1).Infof("cloud provider initialized") 154 155 eventCh := make(chan aadpodid.EventType, 100) 156 157 crdClient, err := crd.NewCRDClient(cfg.RestConfig, eventCh) 158 if err != nil { 159 return nil, err 160 } 161 klog.V(1).Infof("CRD client initialized") 162 163 podClient := pod.NewPodClient(informer, eventCh) 164 klog.V(1).Infof("pod Client initialized") 165 166 cloudConfigWatcher, err := filewatcher.NewFileWatcher( 167 func(event fsnotify.Event) { 168 if event.Op&fsnotify.Write == fsnotify.Write { 169 if err := cloudClient.Init(); err != nil { 170 return 171 } 172 klog.V(1).Infof("cloud provider re-initialized") 173 } 174 }, func(err error) { 175 klog.Errorf("failed to handle fsnotify event, error: %+v", err) 176 }) 177 if err != nil { 178 return nil, err 179 } 180 if err := cloudConfigWatcher.Add(cfg.CloudCfgPath); err != nil { 181 return nil, err 
182 } 183 klog.V(1).Infof("cloud config watcher initialized") 184 185 eventBroadcaster := record.NewBroadcaster() 186 eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: clientSet.CoreV1().Events("")}) 187 recorder := eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: aadpodid.CRDGroup}) 188 189 immutableUserMSIsMap := make(map[string]bool) 190 if len(cfg.ImmutableUserMSIsList) > 0 { 191 for _, item := range cfg.ImmutableUserMSIsList { 192 immutableUserMSIsMap[strings.ToLower(item)] = true 193 } 194 } 195 // Cluster identity used for cloud provider operations is also immutable. 196 // For clusters created with managed identity, the cluster identity is used for all 197 // cloud provider operations and is also used by MIC. If the user configures the cluster 198 // identity to be used by pod, we should not delete it when all pods are deleted. 199 clusterIdentity := cloudClient.GetClusterIdentity() 200 if clusterIdentity != "" { 201 immutableUserMSIsMap[clusterIdentity] = true 202 } 203 var cmClient typedcorev1.ConfigMapInterface 204 if cfg.TypeUpgradeCfg.EnableTypeUpgrade { 205 cmClient = clientSet.CoreV1().ConfigMaps(cfg.CMcfg.Namespace) 206 } 207 208 c := &Client{ 209 CRDClient: crdClient, 210 CloudClient: cloudClient, 211 PodClient: podClient, 212 CloudConfigWatcher: cloudConfigWatcher, 213 EventRecorder: recorder, 214 EventChannel: eventCh, 215 NodeClient: &NodeClient{informer.Core().V1().Nodes()}, 216 IsNamespaced: cfg.IsNamespaced, 217 syncRetryInterval: cfg.SyncRetryInterval, 218 createDeleteBatch: cfg.CreateDeleteBatch, 219 ImmutableUserMSIsMap: immutableUserMSIsMap, 220 TypeUpgradeCfg: cfg.TypeUpgradeCfg, 221 CMCfg: cfg.CMcfg, 222 CMClient: cmClient, 223 identityAssignmentReconcileInterval: cfg.IdentityAssignmentReconcileInterval, 224 } 225 226 leaderElector, err := c.NewLeaderElector(clientSet, recorder, cfg.LeaderElectionCfg) 227 if err != nil { 228 return nil, fmt.Errorf("failed to create new leader elector, 
error: %+v", err) 229 } 230 c.leaderElector = leaderElector 231 232 reporter, err := metrics.NewReporter() 233 if err != nil { 234 return nil, fmt.Errorf("failed to create reporter for metrics, error: %+v", err) 235 } 236 c.Reporter = reporter 237 return c, nil 238 } 239 240 // Run - Initiates the leader election run call to find if its leader and run it 241 func (c *Client) Run() { 242 klog.Info("initiating MIC Leader election") 243 // counter to track number of mic election 244 c.Reporter.Report(metrics.MICNewLeaderElectionCountM.M(1)) 245 c.leaderElector.Run(context.Background()) 246 } 247 248 // NewLeaderElector - does the required leader election initialization 249 func (c *Client) NewLeaderElector(clientSet *kubernetes.Clientset, recorder record.EventRecorder, leaderElectionConfig *LeaderElectionConfig) (*leaderelection.LeaderElector, error) { 250 c.LeaderElectionConfig = leaderElectionConfig 251 resourceLock, err := resourcelock.New(resourcelock.EndpointsResourceLock, 252 c.Namespace, 253 c.Name, 254 clientSet.CoreV1(), 255 clientSet.CoordinationV1(), 256 resourcelock.ResourceLockConfig{ 257 Identity: c.Instance, 258 EventRecorder: recorder}) 259 if err != nil { 260 return nil, fmt.Errorf("failed to create resource lock for leader election, error: %+v", err) 261 } 262 config := leaderelection.LeaderElectionConfig{ 263 LeaseDuration: c.Duration, 264 RenewDeadline: c.Duration / 2, 265 RetryPeriod: c.Duration / 4, 266 Callbacks: leaderelection.LeaderCallbacks{ 267 OnStartedLeading: func(ctx context.Context) { 268 c.Start(ctx.Done()) 269 }, 270 OnStoppedLeading: func() { 271 klog.Error("lost leader lease") 272 klog.Flush() 273 os.Exit(1) 274 }, 275 }, 276 Lock: resourceLock, 277 } 278 279 leaderElector, err := leaderelection.NewLeaderElector(config) 280 if err != nil { 281 return nil, err 282 } 283 return leaderElector, nil 284 } 285 286 // UpgradeTypeIfRequired performs type upgrade for all aad-pod-identity CRDs if required. 
287 func (c *Client) UpgradeTypeIfRequired() error { 288 if c.TypeUpgradeCfg.EnableTypeUpgrade { 289 cm, err := c.CMClient.Get(context.TODO(), c.CMCfg.Name, v1.GetOptions{}) 290 // If we get an error and its not NotFound then return, because we cannot proceed. 291 if err != nil && !apierrors.IsNotFound(err) { 292 return fmt.Errorf("failed to get ConfigMap %s/%s, error: %+v", c.CMCfg.Namespace, c.CMCfg.Name, err) 293 } 294 295 // Now either the configmap is not there or we successfully got the configmap 296 // Handle the case where the configmap is not found. 297 if err != nil && apierrors.IsNotFound(err) { 298 // Create the configmap 299 newCfgMap := &corev1.ConfigMap{ 300 ObjectMeta: v1.ObjectMeta{ 301 Namespace: c.CMCfg.Namespace, 302 Name: c.CMCfg.Name, 303 }, 304 } 305 if cm, err = c.CMClient.Create(context.TODO(), newCfgMap, metav1.CreateOptions{}); err != nil { 306 return fmt.Errorf("failed to create ConfigMap %s/%s, error: %+v", c.CMCfg.Namespace, c.CMCfg.Name, err) 307 } 308 } 309 310 // We reach here only if the configmap is present or we created new one. 311 // Check if the key for type upgrade is present. If the key is present, 312 // then the upgrade is already performed. If not then go through the type upgrade 313 // process. 314 if v, ok := cm.Data[c.TypeUpgradeCfg.TypeUpgradeStatusKey]; !ok { 315 klog.Infof("upgrading the types to work with case sensitive go-client") 316 if err := c.CRDClient.UpgradeAll(); err != nil { 317 return fmt.Errorf("failed to upgrade type, error: %+v", err) 318 } 319 klog.Infof("type upgrade completed !!") 320 // Upgrade completed so update the data with the upgrade key. 
321 if cm.Data == nil { 322 cm.Data = make(map[string]string) 323 } 324 cm.Data[c.TypeUpgradeCfg.TypeUpgradeStatusKey] = version.MICVersion 325 _, err = c.CMClient.Update(context.TODO(), cm, metav1.UpdateOptions{}) 326 if err != nil { 327 return fmt.Errorf("failed to update ConfigMap key %s failed, error: %+v", c.TypeUpgradeCfg.TypeUpgradeStatusKey, err) 328 } 329 } else { 330 klog.Infof("type upgrade status configmap found from version: %s. Skipping type upgrade!", v) 331 } 332 } 333 return nil 334 } 335 336 // Start starts various go routines to watch for any relevant changes that would trigger a MIC sync. 337 func (c *Client) Start(exit <-chan struct{}) { 338 klog.V(6).Infof("MIC client starting..") 339 340 if err := c.UpgradeTypeIfRequired(); err != nil { 341 klog.Fatalf("type upgrade failed with error: %+v", err) 342 return 343 } 344 345 var wg sync.WaitGroup 346 347 wg.Add(1) 348 go func() { 349 c.PodClient.Start(exit) 350 klog.V(6).Infof("pod client started") 351 wg.Done() 352 }() 353 354 wg.Add(1) 355 go func() { 356 c.CRDClient.Start(exit) 357 klog.V(6).Infof("CRD client started") 358 wg.Done() 359 }() 360 361 wg.Add(1) 362 go func() { 363 c.NodeClient.Start(exit) 364 klog.V(6).Infof("node client started") 365 wg.Done() 366 }() 367 368 wg.Add(1) 369 go func() { 370 c.CloudConfigWatcher.Start(exit) 371 klog.V(6).Infof("cloud config watcher started") 372 wg.Done() 373 }() 374 375 wg.Wait() 376 go c.Sync(exit) 377 } 378 379 func (c *Client) canSync() bool { 380 return atomic.CompareAndSwapInt32(&c.syncing, stopped, running) 381 } 382 383 func (c *Client) setStopped() { 384 atomic.StoreInt32(&c.syncing, stopped) 385 } 386 387 // Sync perform a sync cycle. 
388 func (c *Client) Sync(exit <-chan struct{}) { 389 if !c.canSync() { 390 panic("concurrent syncs") 391 } 392 defer c.setStopped() 393 394 ticker := time.NewTicker(c.syncRetryInterval) 395 defer ticker.Stop() 396 397 identityAssignmentReconcileTicker := time.NewTicker(c.identityAssignmentReconcileInterval) 398 defer identityAssignmentReconcileTicker.Stop() 399 400 klog.Info("sync thread started.") 401 c.SyncLoopStarted = true 402 var event aadpodid.EventType 403 totalWorkDoneCycles := 0 404 totalSyncCycles := 0 405 406 for { 407 select { 408 case <-exit: 409 return 410 case event = <-c.EventChannel: 411 klog.V(6).Infof("received event: %v", event) 412 case <-ticker.C: 413 klog.V(6).Infof("running periodic sync loop") 414 case <-identityAssignmentReconcileTicker.C: 415 klog.V(6).Infof("reconciling identity assignment on Azure") 416 c.reconcileIdentityAssignment() 417 continue 418 } 419 totalSyncCycles++ 420 stats.Init() 421 // This is the only place where the AzureAssignedIdentity creation is initiated. 422 begin := time.Now() 423 workDone := false 424 425 cacheTime := time.Now() 426 427 // There is a delay in data propagation to cache. It's possible that the creates performed in the previous sync cycle 428 // are not propagated before this sync cycle began. In order to avoid redoing the cycle, we sync cache again. 
429 c.CRDClient.SyncCacheAll(exit, false) 430 stats.Put(stats.CacheSync, time.Since(cacheTime)) 431 432 // List all pods in all namespaces 433 systemTime := time.Now() 434 listPods, err := c.PodClient.GetPods() 435 if err != nil { 436 klog.Errorf("failed to list pods, error: %+v", err) 437 continue 438 } 439 listBindings, err := c.CRDClient.ListBindings() 440 if err != nil { 441 continue 442 } 443 klog.V(6).Infof("number of bindings: %d", len(*listBindings)) 444 listIDs, err := c.CRDClient.ListIds() 445 if err != nil { 446 continue 447 } 448 klog.V(6).Infof("number of identities: %d", len(*listIDs)) 449 idMap, err := c.convertIDListToMap(*listIDs) 450 if err != nil { 451 klog.Errorf("failed to convert ID list to map, error: %+v", err) 452 continue 453 } 454 455 currentAssignedIDs, err := c.CRDClient.ListAssignedIDsInMap() 456 if err != nil { 457 continue 458 } 459 klog.V(6).Infof("number of assigned identities: %d", len(currentAssignedIDs)) 460 stats.Put(stats.System, time.Since(systemTime)) 461 462 beginNewListTime := time.Now() 463 newAssignedIDs, nodeRefs, err := c.createDesiredAssignedIdentityList(listPods, listBindings, idMap) 464 if err != nil { 465 klog.Errorf("failed to create a list of desired AzureAssignedIdentity, error: %+v", err) 466 continue 467 } 468 stats.Put(stats.CurrentState, time.Since(beginNewListTime)) 469 470 // Extract add list and delete list based on existing assigned ids in the system (currentAssignedIDs). 471 // and the ones we have arrived at in the volatile list (newAssignedIDs). 
472 addList, err := c.getAzureAssignedIDsToCreate(currentAssignedIDs, newAssignedIDs) 473 if err != nil { 474 klog.Errorf("failed to get a list of AzureAssignedIdentities to create, error: %+v", err) 475 continue 476 } 477 deleteList, err := c.getAzureAssignedIDsToDelete(currentAssignedIDs, newAssignedIDs) 478 if err != nil { 479 klog.Errorf("failed to get a list of AzureAssignedIdentities to delete, error: %+v", err) 480 continue 481 } 482 beforeUpdateList, afterUpdateList := c.getAzureAssignedIdentitiesToUpdate(addList, deleteList) 483 klog.V(5).Infof("del: %v, add: %v, update: %v", deleteList, addList, afterUpdateList) 484 485 // the node map is used to track assigned ids to create/delete, identities to assign/remove 486 // for each node or vmss 487 nodeMap := make(map[string]trackUserAssignedMSIIds) 488 489 // separate the add, delete and update list per node 490 c.convertAssignedIDListToMap(addList, deleteList, afterUpdateList, nodeMap) 491 492 // process the delete and add list 493 // determine the list of identities that need to updated, create a node to identity list mapping for add and delete 494 if len(deleteList) > 0 || len(beforeUpdateList) > 0 { 495 workDone = true 496 c.getListOfIdsToDelete(deleteList, beforeUpdateList, afterUpdateList, newAssignedIDs, nodeMap, nodeRefs) 497 } 498 if len(addList) > 0 || len(afterUpdateList) > 0 { 499 workDone = true 500 c.getListOfIdsToAssign(addList, afterUpdateList, nodeMap) 501 } 502 503 var wg sync.WaitGroup 504 505 // check if vmss and consolidate vmss nodes into vmss if necessary 506 c.consolidateVMSSNodes(nodeMap, &wg) 507 508 // one final createorupdate to each node or vmss in the map 509 c.updateNodeAndDeps(newAssignedIDs, nodeMap, nodeRefs, &wg) 510 511 wg.Wait() 512 513 if workDone || ((totalSyncCycles % 1000) == 0) { 514 if workDone { 515 totalWorkDoneCycles++ 516 } 517 idsFound := 0 518 bindingsFound := 0 519 if listIDs != nil { 520 idsFound = len(*listIDs) 521 } 522 if listBindings != nil { 523 
bindingsFound = len(*listBindings) 524 } 525 klog.Infof("work done: %v. Found %d pods, %d ids, %d bindings", workDone, len(listPods), idsFound, bindingsFound) 526 klog.Infof("total work cycles: %d, out of which work was done in: %d", totalSyncCycles, totalWorkDoneCycles) 527 stats.Put(stats.Total, time.Since(begin)) 528 529 c.Reporter.Report( 530 metrics.MICCycleCountM.M(1), 531 metrics.MICCycleDurationM.M(metrics.SinceInSeconds(begin))) 532 533 stats.PrintSync() 534 if workDone { 535 // We need to synchronize the cache inorder to get the latest updates. 536 // Even though we sync at the beginning of every cycle, we are still seeing 537 // conflicts indicating the assigned identities are not reflecting in 538 // the cache. Continue to use the sleep workaround. 539 time.Sleep(time.Millisecond * 200) 540 } 541 } 542 } 543 } 544 545 func (c *Client) convertAssignedIDListToMap(addList, deleteList, updateList map[string]aadpodid.AzureAssignedIdentity, nodeMap map[string]trackUserAssignedMSIIds) { 546 for _, createID := range addList { 547 if trackList, ok := nodeMap[createID.Spec.NodeName]; ok { 548 trackList.assignedIDsToCreate = append(trackList.assignedIDsToCreate, createID) 549 nodeMap[createID.Spec.NodeName] = trackList 550 continue 551 } 552 nodeMap[createID.Spec.NodeName] = trackUserAssignedMSIIds{assignedIDsToCreate: []aadpodid.AzureAssignedIdentity{createID}} 553 } 554 555 for _, delID := range deleteList { 556 if trackList, ok := nodeMap[delID.Spec.NodeName]; ok { 557 trackList.assignedIDsToDelete = append(trackList.assignedIDsToDelete, delID) 558 nodeMap[delID.Spec.NodeName] = trackList 559 continue 560 } 561 nodeMap[delID.Spec.NodeName] = trackUserAssignedMSIIds{assignedIDsToDelete: []aadpodid.AzureAssignedIdentity{delID}} 562 } 563 564 for _, updateID := range updateList { 565 if trackList, ok := nodeMap[updateID.Spec.NodeName]; ok { 566 trackList.assignedIDsToUpdate = append(trackList.assignedIDsToUpdate, updateID) 567 nodeMap[updateID.Spec.NodeName] = 
trackList 568 continue 569 } 570 nodeMap[updateID.Spec.NodeName] = trackUserAssignedMSIIds{assignedIDsToUpdate: []aadpodid.AzureAssignedIdentity{updateID}} 571 } 572 } 573 574 func (c *Client) createDesiredAssignedIdentityList( 575 listPods []*corev1.Pod, listBindings *[]aadpodid.AzureIdentityBinding, idMap map[string]aadpodid.AzureIdentity) (map[string]aadpodid.AzureAssignedIdentity, map[string]bool, error) { 576 // For each pod, check what bindings are matching. For each binding create volatile azure assigned identity. 577 // Compare this list with the current list of azure assigned identities. 578 // For any new assigned identities found in this volatile list, create assigned identity and assign user assigned msis. 579 // For any assigned ids not present the volatile list, proceed with the deletion. 580 nodeRefs := make(map[string]bool) 581 newAssignedIDs := make(map[string]aadpodid.AzureAssignedIdentity) 582 583 for _, pod := range listPods { 584 klog.V(6).Infof("checking pod %s/%s", pod.Namespace, pod.Name) 585 if pod.Spec.NodeName == "" { 586 // Node is not yet allocated. In that case skip the pod 587 klog.Infof("pod %s/%s has no assigned node yet. it will be ignored", pod.Namespace, pod.Name) 588 continue 589 } 590 crdPodLabelVal := pod.Labels[aadpodid.CRDLabelKey] 591 klog.V(6).Infof("pod: %s/%s. Label value: %v", pod.Namespace, pod.Name, crdPodLabelVal) 592 if crdPodLabelVal == "" { 593 // No binding mentioned in the label. Just continue to the next pod 594 klog.Infof("pod %s/%s doesn't contain %s label field. 
it will be ignored", pod.Namespace, pod.Name, aadpodid.CRDLabelKey) 595 continue 596 } 597 var matchedBindings []aadpodid.AzureIdentityBinding 598 for _, allBinding := range *listBindings { 599 klog.V(6).Infof("check the binding (pod - %s/%s): %s", pod.Namespace, pod.Name, allBinding.Spec.Selector) 600 if allBinding.Spec.Selector == crdPodLabelVal { 601 klog.V(5).Infof("found binding match for pod %s/%s with binding %s/%s", pod.Namespace, pod.Name, allBinding.Namespace, allBinding.Name) 602 matchedBindings = append(matchedBindings, allBinding) 603 nodeRefs[pod.Spec.NodeName] = true 604 } 605 } 606 607 if len(matchedBindings) == 0 { 608 klog.Infof("No AzureIdentityBinding found for pod %s/%s that matches selector: %s. it will be ignored", pod.Namespace, pod.Name, crdPodLabelVal) 609 continue 610 } 611 612 // sort all matching bindings so we can iterate the slice 613 // in an deterministic fashion in different sync cycles 614 sort.Sort(aadpodid.AzureIdentityBindings(matchedBindings)) 615 616 for _, binding := range matchedBindings { 617 klog.V(5).Infof("looking up id map: %s/%s", binding.Namespace, binding.Spec.AzureIdentity) 618 if azureID, idPresent := idMap[getIDKey(binding.Namespace, binding.Spec.AzureIdentity)]; idPresent { 619 // working in Namespaced mode or this specific identity is namespaced 620 if c.IsNamespaced || aadpodid.IsNamespacedIdentity(&azureID) { 621 // They have to match all 622 if !(azureID.Namespace == binding.Namespace && binding.Namespace == pod.Namespace) { 623 klog.V(5).Infof("identity %s/%s was matched via binding %s/%s to %s/%s but namespaced identity is enforced, so it will be ignored", 624 azureID.Namespace, azureID.Name, binding.Namespace, binding.Name, pod.Namespace, pod.Name) 625 continue 626 } 627 } 628 klog.V(5).Infof("identity %s/%s assigned to %s/%s via %s/%s", azureID.Namespace, azureID.Name, pod.Namespace, pod.Name, binding.Namespace, binding.Name) 629 assignedID, err := c.makeAssignedIDs(azureID, binding, pod.Name, 
pod.Namespace, pod.Spec.NodeName) 630 631 if err != nil { 632 klog.Errorf("failed to create an AzureAssignedIdentity between pod %s/%s and AzureIdentity %s/%s, error: %+v", pod.Namespace, pod.Name, azureID.Namespace, azureID.Name, err) 633 continue 634 } 635 636 if a, ok := newAssignedIDs[assignedID.Name]; ok { 637 // see https://github.com/Azure/aad-pod-identity/issues/1065 638 klog.Warningf("AzureIdentity %s exists in both %s and %s namespace. Considering renaming it or enabling Namespace mode (https://azure.github.io/aad-pod-identity/docs/configure/match_pods_in_namespace)", 639 azureID.Name, a.Spec.AzureIdentityRef.Namespace, azureID.Namespace) 640 } else { 641 newAssignedIDs[assignedID.Name] = *assignedID 642 } 643 } else { 644 // This is the case where the identity has been deleted. 645 // In such a case, we will skip it from matching binding. 646 // This will ensure that the new assigned ids created will not have the 647 // one associated with this azure identity. 648 klog.Infof("%s identity not found when using %s/%s binding", binding.Spec.AzureIdentity, binding.Namespace, binding.Name) 649 } 650 } 651 } 652 return newAssignedIDs, nodeRefs, nil 653 } 654 655 // getListOfIdsToDelete will go over the delete list to determine if the id is required to be deleted 656 // only user assigned identity not in use are added to the remove list for the node 657 func (c *Client) getListOfIdsToDelete(deleteList, beforeUpdateList, afterUpdateList, newAssignedIDs map[string]aadpodid.AzureAssignedIdentity, 658 nodeMap map[string]trackUserAssignedMSIIds, 659 nodeRefs map[string]bool) { 660 vmssGroups, err := getVMSSGroups(c.NodeClient, nodeRefs) 661 if err != nil { 662 klog.Errorf("failed to get VMSS groups, error: %+v", err) 663 return 664 } 665 666 consolidatedMapToCheck := make(map[string]aadpodid.AzureAssignedIdentity) 667 for name, id := range newAssignedIDs { 668 consolidatedMapToCheck[name] = id 669 } 670 for name, id := range afterUpdateList { 671 
consolidatedMapToCheck[name] = id 672 } 673 674 for _, delID := range deleteList { 675 err := c.shouldRemoveID(delID, consolidatedMapToCheck, nodeMap, vmssGroups) 676 if err != nil { 677 klog.Errorf("failed to check if identity should be removed, error: %+v", err) 678 } 679 } 680 // this loop checks the azure identity before it was updated and cleans up 681 // the old identity 682 for _, oldUpdateID := range beforeUpdateList { 683 err := c.shouldRemoveID(oldUpdateID, consolidatedMapToCheck, nodeMap, vmssGroups) 684 if err != nil { 685 klog.Errorf("failed to check if identity should be removed, error: %+v", err) 686 } 687 } 688 } 689 690 // getListOfIdsToAssign will add the id to the append list for node if it's user assigned identity 691 func (c *Client) getListOfIdsToAssign(addList, updateList map[string]aadpodid.AzureAssignedIdentity, nodeMap map[string]trackUserAssignedMSIIds) { 692 for _, createID := range addList { 693 c.shouldAssignID(createID, nodeMap) 694 } 695 for _, updateID := range updateList { 696 c.shouldAssignID(updateID, nodeMap) 697 } 698 } 699 700 func (c *Client) shouldAssignID(assignedID aadpodid.AzureAssignedIdentity, nodeMap map[string]trackUserAssignedMSIIds) { 701 id := assignedID.Spec.AzureIdentityRef 702 isUserAssignedMSI := c.checkIfUserAssignedMSI(*id) 703 704 if assignedID.Status.Status == "" || assignedID.Status.Status == aadpodid.AssignedIDCreated { 705 if isUserAssignedMSI { 706 c.appendToAddListForNode(id.Spec.ResourceID, assignedID.Spec.NodeName, nodeMap) 707 } 708 } 709 klog.V(5).Infof("binding applied: %+v", assignedID.Spec.AzureBindingRef) 710 } 711 712 func (c *Client) shouldRemoveID(assignedID aadpodid.AzureAssignedIdentity, 713 newAssignedIDs map[string]aadpodid.AzureAssignedIdentity, 714 nodeMap map[string]trackUserAssignedMSIIds, vmssGroups *vmssGroupList) error { 715 klog.V(5).Infof("deletion of id: %s", assignedID.Name) 716 inUse, err := c.checkIfInUse(assignedID, newAssignedIDs, vmssGroups) 717 if err != nil { 718 return 
err 719 } 720 721 id := assignedID.Spec.AzureIdentityRef 722 isUserAssignedMSI := c.checkIfUserAssignedMSI(*id) 723 isImmutableIdentity := c.checkIfIdentityImmutable(id.Spec.ClientID) 724 // this case includes Assigned state and empty state to ensure backward compatibility 725 if assignedID.Status.Status == aadpodid.AssignedIDAssigned || assignedID.Status.Status == "" { 726 // only user assigned identities that are not in use and are not defined as 727 // immutable will be removed from underlying node/vmss 728 if !inUse && isUserAssignedMSI && !isImmutableIdentity { 729 c.appendToRemoveListForNode(id.Spec.ResourceID, assignedID.Spec.NodeName, nodeMap) 730 } 731 } 732 klog.V(5).Infof("binding removed: %+v", assignedID.Spec.AzureBindingRef) 733 return nil 734 } 735 736 func (c *Client) matchAssignedID(x aadpodid.AzureAssignedIdentity, y aadpodid.AzureAssignedIdentity) bool { 737 bindingX := x.Spec.AzureBindingRef 738 bindingY := y.Spec.AzureBindingRef 739 740 idX := x.Spec.AzureIdentityRef 741 idY := y.Spec.AzureIdentityRef 742 743 klog.V(7).Infof("assignedidX - %+v\n", x) 744 klog.V(7).Infof("assignedidY - %+v\n", y) 745 746 klog.V(7).Infof("bindingX - %+v\n", bindingX) 747 klog.V(7).Infof("bindingY - %+v\n", bindingY) 748 749 klog.V(7).Infof("idX - %+v\n", idX) 750 klog.V(7).Infof("idY - %+v\n", idY) 751 752 return bindingX.Name == bindingY.Name && 753 bindingX.ResourceVersion == bindingY.ResourceVersion && 754 idX.Name == idY.Name && 755 idX.ResourceVersion == idY.ResourceVersion && 756 x.Spec.Pod == y.Spec.Pod && 757 x.Spec.PodNamespace == y.Spec.PodNamespace && 758 x.Spec.NodeName == y.Spec.NodeName 759 } 760 761 func (c *Client) getAzureAssignedIDsToCreate(old, new map[string]aadpodid.AzureAssignedIdentity) (map[string]aadpodid.AzureAssignedIdentity, error) { 762 // everything in new needs to be created 763 if len(old) == 0 { 764 return new, nil 765 } 766 767 create := make(map[string]aadpodid.AzureAssignedIdentity) 768 begin := time.Now() 769 770 for 
assignedIDName, newAssignedID := range new {
		oldAssignedID, exists := old[assignedIDName]
		idMatch := false
		if exists {
			idMatch = c.matchAssignedID(oldAssignedID, newAssignedID)
		}
		if idMatch && oldAssignedID.Status.Status == aadpodid.AssignedIDCreated {
			// if the old assigned id is in created state, then the identity assignment to the node
			// is not done. Adding to the list will ensure we retry identity assignment to node for
			// this assigned identity.
			klog.V(5).Infof("ok: %v, Create added: %s as assignedID in CREATED state", idMatch, assignedIDName)
			create[assignedIDName] = oldAssignedID
		}
		if !idMatch {
			// We are done checking that this new id is not present in the old
			// list. So we will add it to the create list.
			klog.V(5).Infof("ok: %v, Create added: %s", idMatch, assignedIDName)
			create[assignedIDName] = newAssignedID
		}
	}
	stats.Put(stats.FindAzureAssignedIdentitiesToCreate, time.Since(begin))
	return create, nil
}

// getAzureAssignedIDsToDelete returns the entries of old that have no matching
// entry (same key and matchAssignedID == true) in new.
//
// When old is empty an empty map is returned; when new is empty the old map
// itself is returned (not a copy). The returned error is always nil.
func (c *Client) getAzureAssignedIDsToDelete(old, new map[string]aadpodid.AzureAssignedIdentity) (map[string]aadpodid.AzureAssignedIdentity, error) {
	// deliberately not named "delete" so the builtin is not shadowed
	toDelete := make(map[string]aadpodid.AzureAssignedIdentity)
	// nothing to delete
	if len(old) == 0 {
		return toDelete, nil
	}
	// delete everything as nothing in new
	if len(new) == 0 {
		return old, nil
	}

	begin := time.Now()
	for assignedIDName, oldAssignedID := range old {
		// assigned identity exists in the desired list too which means
		// it should not be deleted
		if newAssignedID, exists := new[assignedIDName]; exists && c.matchAssignedID(oldAssignedID, newAssignedID) {
			continue
		}
		// We are done checking that this old id is not present in the new
		// list. So we will add it to the delete list.
		toDelete[assignedIDName] = oldAssignedID
	}
	stats.Put(stats.FindAzureAssignedIdentitiesToDelete, time.Since(begin))
	return toDelete, nil
}

// getAzureAssignedIdentitiesToUpdate returns the assigned IDs that need to be updated
// because of a change in AzureIdentity or AzureIdentityBinding, i.e. the ones whose
// name appears in both add and del.
//
// It returns 2 maps: first the assigned IDs currently on the cluster (taken from del),
// second the values to update them with (taken from add, carrying the ObjectMeta of the
// del entry with the labels of the add entry).
//
// Side effect: every name that is reported as an update is removed from both add and del.
func (c *Client) getAzureAssignedIdentitiesToUpdate(add, del map[string]aadpodid.AzureAssignedIdentity) (map[string]aadpodid.AzureAssignedIdentity, map[string]aadpodid.AzureAssignedIdentity) {
	beforeUpdate := make(map[string]aadpodid.AzureAssignedIdentity)
	afterUpdate := make(map[string]aadpodid.AzureAssignedIdentity)
	// no updates required as assigned identities will not be in both lists
	if len(add) == 0 || len(del) == 0 {
		return beforeUpdate, afterUpdate
	}
	for assignedIDName, addAssignedID := range add {
		delAssignedID, exists := del[assignedIDName]
		if !exists {
			continue
		}
		objMeta := delAssignedID.ObjectMeta
		// the label should always be the latest as the pod could have moved to a different node
		// with the same assigned identity
		objMeta.SetLabels(addAssignedID.GetObjectMeta().GetLabels())
		addAssignedID.ObjectMeta = objMeta
		// assigned identity exists in add and del list
		// update the assigned identity to the latest
		beforeUpdate[assignedIDName] = delAssignedID
		afterUpdate[assignedIDName] = addAssignedID
		// since this is part of update, remove the assignedID from the add and del list
		// (removing entries from a map while ranging over it is permitted in Go)
		delete(add, assignedIDName)
		delete(del, assignedIDName)
	}
	return beforeUpdate, afterUpdate
}

// makeAssignedIDs builds (but does not persist) the AzureAssignedIdentity that ties
// the given identity and binding to a pod on a node. The object carries a "nodename"
// label and an AvailableReplicas status of 1. The returned error is always nil.
func (c *Client) makeAssignedIDs(azID aadpodid.AzureIdentity, azBinding aadpodid.AzureIdentityBinding, podName, podNameSpace, nodeName string) (*aadpodid.AzureAssignedIdentity, error) {
	// local copies so the stored references do not point at the parameters
	binding := azBinding
	id := azID

	assignedID := &aadpodid.AzureAssignedIdentity{
		ObjectMeta: v1.ObjectMeta{
			Name:   c.getAssignedIDName(podName, podNameSpace, azID.Name),
			Labels: map[string]string{"nodename": nodeName},
		},
		Spec: aadpodid.AzureAssignedIdentitySpec{
			AzureIdentityRef: &id,
			AzureBindingRef:  &binding,
			Pod:              podName,
			PodNamespace:     podNameSpace,
			NodeName:         nodeName,
		},
		Status: aadpodid.AzureAssignedIdentityStatus{
			AvailableReplicas: 1,
		},
	}
	// if we are in namespaced mode (or az identity is namespaced)
	if c.IsNamespaced || aadpodid.IsNamespacedIdentity(&id) {
		assignedID.Namespace = azID.Namespace
	} else {
		// eventually this should be identity namespace
		// but to maintain back compat we will use existing
		// behavior
		assignedID.Namespace = "default"
	}

	klog.V(6).Infof("binding - %+v identity - %+v", azBinding, azID)
	klog.V(5).Infof("making assigned ID: %+v", assignedID)
	return assignedID, nil
}

// createAssignedIdentity creates the AzureAssignedIdentity through the CRD client.
func (c *Client) createAssignedIdentity(assignedID *aadpodid.AzureAssignedIdentity) error {
	return c.CRDClient.CreateAssignedIdentity(assignedID)
}

// removeAssignedIdentity removes the AzureAssignedIdentity through the CRD client.
func (c *Client) removeAssignedIdentity(assignedID *aadpodid.AzureAssignedIdentity) error {
	return c.CRDClient.RemoveAssignedIdentity(assignedID)
}

// updateAssignedIdentity updates the AzureAssignedIdentity through the CRD client.
func (c *Client) updateAssignedIdentity(assignedID *aadpodid.AzureAssignedIdentity) error {
	return c.CRDClient.UpdateAssignedIdentity(assignedID)
}

// appendToRemoveListForNode records resourceID as an identity to remove from nodeName,
// creating the node's entry in nodeMap when it does not exist yet.
func (c *Client) appendToRemoveListForNode(resourceID, nodeName string, nodeMap map[string]trackUserAssignedMSIIds) {
	// a missing key yields the zero value, and appending to its nil slice allocates it
	trackList := nodeMap[nodeName]
	trackList.removeUserAssignedMSIIDs = append(trackList.removeUserAssignedMSIIDs, resourceID)
	nodeMap[nodeName] = trackList
}

// appendToAddListForNode records resourceID as an identity to add to nodeName,
// creating the node's entry in nodeMap when it does not exist yet.
func (c *Client) appendToAddListForNode(resourceID, nodeName string, nodeMap map[string]trackUserAssignedMSIIds) {
	// a missing key yields the zero value, and appending to its nil slice allocates it
	trackList := nodeMap[nodeName]
	trackList.addUserAssignedMSIIDs = append(trackList.addUserAssignedMSIIDs, resourceID)
	nodeMap[nodeName] = trackList
}

// checkIfUserAssignedMSI reports whether the identity is of type UserAssignedMSI.
func (c *Client) checkIfUserAssignedMSI(id aadpodid.AzureIdentity) bool {
	return id.Spec.Type == aadpodid.UserAssignedMSI
}

// getAssignedIDName builds the AzureAssignedIdentity name as
// "<podName>-<podNameSpace>-<idName>".
func (c *Client) getAssignedIDName(podName, podNameSpace, idName string) string {
	return podName + "-" + podNameSpace + "-" + idName
}

// checkIfMSIExistsOnNode reports whether the identity's resource ID is present in
// nodeMSIList, compared case-insensitively. nodeName is not used by the check.
func (c *Client) checkIfMSIExistsOnNode(id *aadpodid.AzureIdentity, nodeName string, nodeMSIList []string) bool {
	for _, userAssignedMSI := range nodeMSIList {
		if strings.EqualFold(userAssignedMSI, id.Spec.ResourceID) {
			return true
		}
	}
	return false
}

// getUserMSIListForNode returns the user-assigned identities the cloud client reports
// for the given node or VMSS.
func (c *Client) getUserMSIListForNode(nodeOrVMSSName string, isvmss bool) ([]string, error) {
	return c.CloudClient.GetUserMSIs(nodeOrVMSSName, isvmss)
}

// getIDKey returns the "<ns>/<name>" key used to index identities.
func getIDKey(ns, name string) string {
	return ns + "/" + name
}

// convertIDListToMap indexes the identities by getIDKey(namespace, name).
// UserAssignedMSI identities whose resource ID fails utils.ValidateResourceID are
// logged and left out of the map. The returned error is always nil.
func (c *Client) convertIDListToMap(azureIdentities []aadpodid.AzureIdentity) (map[string]aadpodid.AzureIdentity, error) {
	m := make(map[string]aadpodid.AzureIdentity, len(azureIdentities))
	for _, azureIdentity := range azureIdentities {
		// validate the resourceID in azure identity for type 0 (UserAssignedMSI) to ensure format is as expected
		if c.checkIfUserAssignedMSI(azureIdentity) {
			if err := utils.ValidateResourceID(azureIdentity.Spec.ResourceID); err != nil {
				klog.Errorf("ignoring azure identity %s/%s, error: %+v", azureIdentity.Namespace, azureIdentity.Name, err)
				continue
			}
		}
		m[getIDKey(azureIdentity.Namespace, azureIdentity.Name)] = azureIdentity
	}
	return m, nil
}

// checkIfInUse reports whether the user-assigned identity referenced by checkAssignedID
// is still needed by another assignment in assignedIDMap, either on the same node or on
// another node of the same VMSS. Identities that are not UserAssignedMSI are never
// reported as in use.
//
// An error is returned only when the VMSS group lookup for the node fails.
func (c *Client) checkIfInUse(checkAssignedID aadpodid.AzureAssignedIdentity, assignedIDMap map[string]aadpodid.AzureAssignedIdentity, vmssGroups *vmssGroupList) (bool, error) {
	checkID := checkAssignedID.Spec.AzureIdentityRef
	for _, assignedID := range assignedIDMap {
		id := assignedID.Spec.AzureIdentityRef
		// If they have the same client id, reside on the same node but the pod is different, then the
		// assigned id is in use.
		// This is applicable only for user assigned MSI since that is node specific. Ignore other cases.
		if checkID.Spec.Type != aadpodid.UserAssignedMSI {
			continue
		}

		// A pod is identified by namespace AND name. Comparing the name alone would treat
		// same-named pods from different namespaces as one assignment and could report the
		// identity as unused while the other pod still relies on it.
		if checkAssignedID.Spec.Pod == assignedID.Spec.Pod && checkAssignedID.Spec.PodNamespace == assignedID.Spec.PodNamespace {
			// No need to do the rest of the checks in this case, since it's the same assignment
			// The same identity won't be assigned to a pod twice, so it's the same reference.
			continue
		}

		if checkID.Spec.ClientID != id.Spec.ClientID {
			continue
		}

		if checkAssignedID.Spec.NodeName == assignedID.Spec.NodeName {
			return true, nil
		}

		vmss, err := getVMSSGroupFromPossiblyUnreferencedNode(c.NodeClient, vmssGroups, checkAssignedID.Spec.NodeName)
		if err != nil {
			return false, err
		}

		// check if this identity is used on another node in the same vmss
		// This check is needed because vmss identities currently operate on all nodes
		// in the vmss not just a single node.
		if vmss != nil && vmss.hasNode(assignedID.Spec.NodeName) {
			return true, nil
		}
	}

	return false, nil
}

// getUniqueIDs returns idList without duplicates, keeping the first occurrence of
// every ID in its original position so the result is deterministic. A nil slice is
// returned for empty input.
func (c *Client) getUniqueIDs(idList []string) []string {
	seen := make(map[string]struct{}, len(idList))
	var uniqueList []string

	for _, id := range idList {
		if _, dup := seen[id]; dup {
			continue
		}
		seen[id] = struct{}{}
		uniqueList = append(uniqueList, id)
	}
	return uniqueList
}

// updateAssignedIdentityStatus sets the status of the AzureAssignedIdentity through the CRD client.
func (c *Client) updateAssignedIdentityStatus(assignedID *aadpodid.AzureAssignedIdentity, status string) error {
	return c.CRDClient.UpdateAzureAssignedIdentityStatus(assignedID, status)
}

// updateNodeAndDeps starts one updateUserMSI goroutine per entry of nodeMap and
// registers each of them on wg. It does not wait for them; the caller owns wg.
func (c *Client) updateNodeAndDeps(newAssignedIDs map[string]aadpodid.AzureAssignedIdentity, nodeMap map[string]trackUserAssignedMSIIds, nodeRefs map[string]bool, wg *sync.WaitGroup) {
	for nodeName, nodeTrackList := range nodeMap {
		wg.Add(1)
		go c.updateUserMSI(newAssignedIDs, nodeName, nodeTrackList, nodeRefs, wg)
	}
}

// updateUserMSI applies the tracked changes for a single node or VMSS:
//
//  1. creates/updates the AzureAssignedIdentity objects in CREATED state,
//  2. asks the cloud client to add/remove the user-assigned identities,
//  3. on success marks the created/updated objects ASSIGNED and removes the deleted ones;
//     on failure lists what is actually on the node and settles each object individually.
//
// wg.Done is called when the function returns. At most c.createDeleteBatch API
// requests are kept in flight per phase.
func (c *Client) updateUserMSI(newAssignedIDs map[string]aadpodid.AzureAssignedIdentity, nodeOrVMSSName string, nodeTrackList trackUserAssignedMSIIds, nodeRefs map[string]bool, wg *sync.WaitGroup) {
	defer wg.Done()
	beginAdding := time.Now()
	klog.Infof("processing node %s, add [%d], del [%d], update [%d]", nodeOrVMSSName,
		len(nodeTrackList.assignedIDsToCreate), len(nodeTrackList.assignedIDsToDelete), len(nodeTrackList.assignedIDsToUpdate))

	ctx := context.TODO()
	// We have to ensure that we don't overwhelm the API server with too many
	// requests in flight. We use a token based approach implemented using semaphore to
	// ensure that only given createDeleteBatch requests are in flight at any point in time.
	// Note that at this point in the code path, we are doing this in parallel per node/VMSS already.
	semCreateOrUpdate := semaphore.NewWeighted(c.createDeleteBatch)

	for _, createID := range nodeTrackList.assignedIDsToCreate {
		if err := semCreateOrUpdate.Acquire(ctx, 1); err != nil {
			klog.Errorf("failed to acquire semaphore in the create loop, error: %+v", err)
			return
		}
		go func(assignedID aadpodid.AzureAssignedIdentity) {
			defer semCreateOrUpdate.Release(1)
			// an empty status is the state when the azure assigned identity is yet to be created
			if assignedID.Status.Status != "" {
				return
			}
			binding := assignedID.Spec.AzureBindingRef

			klog.V(5).Infof("initiating AzureAssignedIdentity creation for pod - %s, binding - %s", assignedID.Spec.Pod, binding.Name)

			assignedID.Status.Status = aadpodid.AssignedIDCreated
			if err := c.createAssignedIdentity(&assignedID); err != nil {
				// namespace/name order, consistent with the other messages of this function
				message := fmt.Sprintf("failed to create AzureAssignedIdentity %s/%s for pod %s/%s, error: %+v", assignedID.Namespace, assignedID.Name, assignedID.Spec.PodNamespace, assignedID.Spec.Pod, err)
				c.EventRecorder.Event(binding, corev1.EventTypeWarning, "binding apply error", message)
				klog.Error(message)
			}
		}(createID)
	}

	for _, updateID := range nodeTrackList.assignedIDsToUpdate {
		if err := semCreateOrUpdate.Acquire(ctx, 1); err != nil {
			klog.Errorf("failed to acquire semaphore in the update loop, error: %+v", err)
			return
		}
		go func(assignedID aadpodid.AzureAssignedIdentity) {
			defer semCreateOrUpdate.Release(1)
			// only objects with an empty status are written back in CREATED state
			if assignedID.Status.Status != "" {
				return
			}
			binding := assignedID.Spec.AzureBindingRef

			klog.V(5).Infof("initiating assigned id creation for pod - %s, binding - %s", assignedID.Spec.Pod, binding.Name)

			assignedID.Status.Status = aadpodid.AssignedIDCreated
			if err := c.updateAssignedIdentity(&assignedID); err != nil {
				// pod namespace/name order, consistent with the other messages of this function
				message := fmt.Sprintf("failed to update AzureAssignedIdentity %s/%s for pod %s/%s, error: %+v", assignedID.Namespace, assignedID.Name, assignedID.Spec.PodNamespace, assignedID.Spec.Pod, err)
				c.EventRecorder.Event(binding, corev1.EventTypeWarning, "binding apply error", message)
				klog.Error(message)
			}
		}(updateID)
	}

	// Ensure that all creates and updates are complete: taking the full weight of the
	// semaphore only succeeds once every goroutine above has released its token.
	if err := semCreateOrUpdate.Acquire(ctx, c.createDeleteBatch); err != nil {
		klog.Errorf("failed to acquire semaphore at the end of creates, error: %+v", err)
		return
	}
	// generate unique list so we don't make multiple calls to assign/remove same id
	addUserAssignedMSIIDs := c.getUniqueIDs(nodeTrackList.addUserAssignedMSIIDs)
	removeUserAssignedMSIIDs := c.getUniqueIDs(nodeTrackList.removeUserAssignedMSIIDs)
	createOrUpdateList := append([]aadpodid.AzureAssignedIdentity{}, nodeTrackList.assignedIDsToCreate...)
	createOrUpdateList = append(createOrUpdateList, nodeTrackList.assignedIDsToUpdate...)

	err := c.CloudClient.UpdateUserMSI(addUserAssignedMSIIDs, removeUserAssignedMSIIDs, nodeOrVMSSName, nodeTrackList.isvmss)
	if err != nil {
		klog.Errorf("failed to update user-assigned identities on node %s (add [%d], del [%d], update[%d]), error: %+v", nodeOrVMSSName, len(nodeTrackList.assignedIDsToCreate), len(nodeTrackList.assignedIDsToDelete), len(nodeTrackList.assignedIDsToUpdate), err)
		// The bulk call failed; look at what is on the node to settle every assignment on its own.
		idList, getErr := c.getUserMSIListForNode(nodeOrVMSSName, nodeTrackList.isvmss)
		if getErr != nil {
			klog.Errorf("failed to get a list of user-assigned identites from node %s, error: %+v", nodeOrVMSSName, getErr)
			return
		}

		for _, createID := range createOrUpdateList {
			createID := createID // avoid implicit memory aliasing in for loop
			id := createID.Spec.AzureIdentityRef
			binding := createID.Spec.AzureBindingRef

			isUserAssignedMSI := c.checkIfUserAssignedMSI(*id)
			idExistsOnNode := c.checkIfMSIExistsOnNode(id, createID.Spec.NodeName, idList)

			if isUserAssignedMSI && !idExistsOnNode {
				message := fmt.Sprintf("failed to apply binding %s/%s node %s for pod %s/%s, error: %+v", binding.Namespace, binding.Name, createID.Spec.NodeName, createID.Spec.PodNamespace, createID.Spec.Pod, err)
				c.EventRecorder.Event(binding, corev1.EventTypeWarning, "binding apply error", message)
				klog.Error(message)
				continue
			}
			// the identity was successfully assigned to the node
			c.EventRecorder.Event(binding, corev1.EventTypeNormal, "binding applied",
				fmt.Sprintf("binding %s applied on node %s for pod %s", binding.Name, createID.Spec.NodeName, createID.Name))

			klog.Infof("identity %s/%s has successfully been assigned to node %s", id.Namespace, id.Name, createID.Spec.NodeName)

			// Identity is successfully assigned to node, so update the status of assigned identity to assigned
			if updateErr := c.updateAssignedIdentityStatus(&createID, aadpodid.AssignedIDAssigned); updateErr != nil {
				message := fmt.Sprintf("failed to update AzureAssignedIdentity %s/%s status to %s for pod %s/%s, error: %+v", createID.Namespace, createID.Name, aadpodid.AssignedIDAssigned, createID.Spec.PodNamespace, createID.Spec.Pod, updateErr)
				c.EventRecorder.Event(&createID, corev1.EventTypeWarning, "status update error", message)
				klog.Error(message)
			}

			isCreateOperation := false
			for _, i := range nodeTrackList.assignedIDsToCreate {
				if reflect.DeepEqual(createID, i) {
					isCreateOperation = true
					break
				}
			}
			if isCreateOperation {
				stats.Increment(stats.TotalAzureAssignedIdentitiesCreated, 1)
			} else {
				stats.Increment(stats.TotalAzureAssignedIdentitiesUpdated, 1)
			}
		}

		for _, delID := range nodeTrackList.assignedIDsToDelete {
			delID := delID // avoid implicit memory aliasing in for loop
			id := delID.Spec.AzureIdentityRef
			removedBinding := delID.Spec.AzureBindingRef
			isUserAssignedMSI := c.checkIfUserAssignedMSI(*id)
			idExistsOnNode := c.checkIfMSIExistsOnNode(id, delID.Spec.NodeName, idList)
			vmssGroups, getErr := getVMSSGroups(c.NodeClient, nodeRefs)
			if getErr != nil {
				klog.Errorf("failed to get VMSS groups, error: %+v", getErr)
				continue
			}
			inUse, checkErr := c.checkIfInUse(delID, newAssignedIDs, vmssGroups)
			if checkErr != nil {
				// report the error of the failed call; getErr is always nil at this point
				klog.Errorf("failed to check if identity is in use, error: %+v", checkErr)
				continue
			}
			// the identity still exists on node, which means removing the identity from the node failed
			if isUserAssignedMSI && !inUse && idExistsOnNode {
				klog.Errorf("failed to remove AzureIdentityBinding %s from node %s for pod %s/%s, error: %+v", removedBinding.Name, delID.Spec.NodeName, delID.Spec.PodNamespace, delID.Spec.Pod, err)
				continue
			}

			klog.Infof("updating msis on node %s failed, but identity %s/%s has successfully been removed from node", delID.Spec.NodeName, id.Namespace, id.Name)

			// remove assigned identity crd from cluster as the identity has successfully been removed from the node.
			// A dedicated variable keeps err (the UpdateUserMSI failure) intact for the
			// messages of the following iterations.
			if removeErr := c.removeAssignedIdentity(&delID); removeErr != nil {
				klog.Errorf("failed to remove AzureAssignedIdentity %s, error: %+v", delID.Name, removeErr)
				continue
			}
			klog.Infof("deleted assigned identity %s/%s", delID.Namespace, delID.Name)
			stats.Increment(stats.TotalAzureAssignedIdentitiesDeleted, 1)
		}
		stats.Put(stats.TotalAzureAssignedIdentitiesCreateOrUpdate, time.Since(beginAdding))
		return
	}

	semUpdate := semaphore.NewWeighted(c.createDeleteBatch)

	for _, createID := range createOrUpdateList {
		if err := semUpdate.Acquire(ctx, 1); err != nil {
			klog.Errorf("failed to acquire semaphore in the update loop, error: %+v", err)
			return
		}
		go func(assignedID aadpodid.AzureAssignedIdentity) {
			defer semUpdate.Release(1)
			binding := assignedID.Spec.AzureBindingRef
			// update the status to assigned for assigned identity as identity was successfully assigned to node.
			err := c.updateAssignedIdentityStatus(&assignedID, aadpodid.AssignedIDAssigned)
			if err != nil {
				message := fmt.Sprintf("failed to update AzureAssignedIdentity %s/%s status to %s for pod %s, error: %+v", assignedID.Namespace, assignedID.Name, aadpodid.AssignedIDAssigned, assignedID.Spec.Pod, err.Error())
				c.EventRecorder.Event(&assignedID, corev1.EventTypeWarning, "status update error", message)
				klog.Error(message)
				return
			}
			c.EventRecorder.Event(binding, corev1.EventTypeNormal, "binding applied",
				fmt.Sprintf("Binding %s applied on node %s for pod %s", binding.Name, assignedID.Spec.NodeName, assignedID.Name))
		}(createID)
	}

	// Ensure that all updates are complete
	if err := semUpdate.Acquire(ctx, c.createDeleteBatch); err != nil {
		klog.Errorf("failed to acquire semaphore at the end of updates, error: %+v", err)
		return
	}

	semDel := semaphore.NewWeighted(c.createDeleteBatch)

	for _, delID := range nodeTrackList.assignedIDsToDelete {
		if err := semDel.Acquire(ctx, 1); err != nil {
			klog.Errorf("failed to acquire semaphore in the delete loop, error: %+v", err)
			return
		}
		go func(assignedID aadpodid.AzureAssignedIdentity) {
			defer semDel.Release(1)
			// update the status for the assigned identity to Unassigned as the identity has been successfully removed from node.
			// this will ensure on next sync loop we only try to delete the assigned identity instead of doing everything.
			err := c.updateAssignedIdentityStatus(&assignedID, aadpodid.AssignedIDUnAssigned)
			if err != nil {
				message := fmt.Sprintf("failed to update AzureAssignedIdentity %s/%s status to %s for pod %s/%s, error: %+v", assignedID.Namespace, assignedID.Name, aadpodid.AssignedIDUnAssigned, assignedID.Spec.PodNamespace, assignedID.Spec.Pod, err)
				c.EventRecorder.Event(&assignedID, corev1.EventTypeWarning, "status update error", message)
				klog.Error(message)
				return
			}
			// remove assigned identity crd from cluster as the identity has successfully been removed from the node
			err = c.removeAssignedIdentity(&assignedID)
			if err != nil {
				klog.Errorf("failed to remove AzureAssignedIdentity %s/%s, error: %+v", assignedID.Namespace, assignedID.Name, err)
				return
			}
			klog.V(1).Infof("deleted assigned identity %s/%s", assignedID.Namespace, assignedID.Name)
		}(delID)
	}

	// Ensure that all deletes are complete
	if err := semDel.Acquire(ctx, c.createDeleteBatch); err != nil {
		klog.Errorf("failed to acquire semaphore at the end of deletes, error: %+v", err)
		return
	}

	stats.Increment(stats.TotalAzureAssignedIdentitiesCreated, len(nodeTrackList.assignedIDsToCreate))
	stats.Increment(stats.TotalAzureAssignedIdentitiesUpdated, len(nodeTrackList.assignedIDsToUpdate))
	stats.Increment(stats.TotalAzureAssignedIdentitiesDeleted, len(nodeTrackList.assignedIDsToDelete))
	stats.Put(stats.TotalAzureAssignedIdentitiesCreateOrUpdate, time.Since(beginAdding))
}

// cleanUpAllAssignedIdentitiesOnNode removes every AzureAssignedIdentity listed in
// nodeTrackList.assignedIDsToDelete for a node that is no longer found, recording an
// event on the binding for each success or failure. wg.Done is called on return.
func (c *Client) cleanUpAllAssignedIdentitiesOnNode(node string, nodeTrackList trackUserAssignedMSIIds, wg *sync.WaitGroup) {
	defer wg.Done()
	klog.Infof("deleting all assigned identites for %s as node not found", node)
	for _, deleteID := range nodeTrackList.assignedIDsToDelete {
		deleteID := deleteID // avoid implicit memory aliasing in for loop
		binding := deleteID.Spec.AzureBindingRef

		if err := c.removeAssignedIdentity(&deleteID); err != nil {
			message := fmt.Sprintf("failed to remove AzureIdentityBinding %s/%s from node %s for pod %s/%s, error: %v", binding.Namespace, binding.Name, deleteID.Spec.NodeName, deleteID.Spec.PodNamespace, deleteID.Spec.Pod, err)
			c.EventRecorder.Event(binding, corev1.EventTypeWarning, "binding remove error", message)
			klog.Error(message)
			continue
		}
		c.EventRecorder.Event(binding, corev1.EventTypeNormal, "binding removed",
			fmt.Sprintf("Binding %s removed from node %s for pod %s", binding.Name, deleteID.Spec.NodeName, deleteID.Spec.Pod))
	}
}

// consolidateVMSSNodes takes a list of all nodes that are part of the current sync cycle, checks if the nodes are
// part of vmss and combines the vmss nodes into vmss name. This consolidation is needed because vmss identities
// currently operate on all nodes in the vmss not just a single node.
//
// nodeMap is modified in place: entries of VMSS nodes are merged into a single entry
// keyed by getVMSSName(vmssName), and entries of nodes that are "not found" are removed
// after a clean-up goroutine (registered on wg) has been started for them.
func (c *Client) consolidateVMSSNodes(nodeMap map[string]trackUserAssignedMSIIds, wg *sync.WaitGroup) {
	vmssMap := make(map[string][]string)

	for nodeName, nodeTrackList := range nodeMap {
		node, err := c.NodeClient.Get(nodeName)
		if err != nil {
			if !strings.Contains(err.Error(), "not found") {
				klog.Errorf("failed to get node %s, error: %+v", nodeName, err)
				continue
			}
			klog.Warningf("failed to get node %s while updating user-assigned identities, error: %+v", nodeName, err)
			wg.Add(1)
			// node is no longer found in the cluster, all the assigned identities that were created in this sync loop
			// and those that already exist for this node need to be deleted.
			go c.cleanUpAllAssignedIdentitiesOnNode(nodeName, nodeTrackList, wg)
			delete(nodeMap, nodeName)
			continue
		}
		vmssName, isvmss, err := isVMSS(node)
		if err != nil {
			klog.Errorf("failed to check if node %s is VMSS, error: %+v", nodeName, err)
			continue
		}
		if isvmss {
			// appending to the nil slice of a missing key creates the entry
			vmssMap[vmssName] = append(vmssMap[vmssName], nodeName)
		}
	}

	// aggregate vmss nodes into vmss name
	for vmssName, vmssNodes := range vmssMap {
		if len(vmssNodes) < 1 {
			continue
		}

		vmssTrackList := trackUserAssignedMSIIds{isvmss: true}
		for _, vmssNode := range vmssNodes {
			nodeTrackList := nodeMap[vmssNode]
			vmssTrackList.addUserAssignedMSIIDs = append(vmssTrackList.addUserAssignedMSIIDs, nodeTrackList.addUserAssignedMSIIDs...)
			vmssTrackList.removeUserAssignedMSIIDs = append(vmssTrackList.removeUserAssignedMSIIDs, nodeTrackList.removeUserAssignedMSIIDs...)
			vmssTrackList.assignedIDsToCreate = append(vmssTrackList.assignedIDsToCreate, nodeTrackList.assignedIDsToCreate...)
			vmssTrackList.assignedIDsToDelete = append(vmssTrackList.assignedIDsToDelete, nodeTrackList.assignedIDsToDelete...)
			vmssTrackList.assignedIDsToUpdate = append(vmssTrackList.assignedIDsToUpdate, nodeTrackList.assignedIDsToUpdate...)

			delete(nodeMap, vmssNode)
		}
		nodeMap[getVMSSName(vmssName)] = vmssTrackList
	}
}

// checkIfIdentityImmutable checks if the identity is immutable
// if identity is immutable, then it will not be removed from underlying node/vmss
// returns true if identity is immutable
func (c *Client) checkIfIdentityImmutable(id string) bool {
	// Presence of the key is what marks an identity as immutable, the stored value is
	// not consulted. A nil map (no immutable list defined) simply reports false.
	_, exists := c.ImmutableUserMSIsMap[id]
	return exists
}

// generateIdentityAssignmentState generates the current and desired state of each node's identity
// assignments based on an existing list of AzureAssignedIdentity as the source of truth.
//
// It returns, in order: the current state (node or VMSS name -> resource IDs reported by
// the cloud client), the desired state (node or VMSS name -> resource IDs of ASSIGNED
// user-assigned identities) and a map telling whether each name is a VMSS. Any failure
// to list, to get a node, to classify it or to query the cloud aborts with an error.
func (c *Client) generateIdentityAssignmentState() (map[string]map[string]bool, map[string]map[string]bool, map[string]bool, error) {
	type nodeMetadata struct {
		nodeName string
		isVMSS   bool
	}

	assignedIDs, err := c.CRDClient.ListAssignedIDs()
	if err != nil {
		return nil, nil, nil, fmt.Errorf("failed to list AzureAssignedIdentities, error: %+v", err)
	}

	nodeMetadataCache := make(map[string]nodeMetadata)
	isVMSSMap := make(map[string]bool)
	currentState := make(map[string]map[string]bool)
	desiredState := make(map[string]map[string]bool)
	// guard against a lister that reports success without a list
	if assignedIDs == nil {
		return currentState, desiredState, isVMSSMap, nil
	}

	for _, assignedID := range *assignedIDs {
		meta, cached := nodeMetadataCache[assignedID.Spec.NodeName]
		if !cached {
			node, err := c.NodeClient.Get(assignedID.Spec.NodeName)
			if err != nil {
				return nil, nil, nil, fmt.Errorf("failed to get node %s, error: %+v", assignedID.Spec.NodeName, err)
			}

			vmssName, onVMSS, err := isVMSS(node)
			if err != nil {
				return nil, nil, nil, fmt.Errorf("failed to check if node %s is VMSS, error: %+v", assignedID.Spec.NodeName, err)
			}

			// VM node name does not require conversion, a VMSS node is tracked under the VMSS name
			meta = nodeMetadata{nodeName: assignedID.Spec.NodeName, isVMSS: onVMSS}
			if onVMSS {
				meta.nodeName = getVMSSName(vmssName)
			}

			// cache node metadata to avoid excessive GET calls
			nodeMetadataCache[assignedID.Spec.NodeName] = meta
		}

		nodeName := meta.nodeName
		isVMSSMap[nodeName] = meta.isVMSS

		// only consider AzureAssignedIdentities in ASSIGNED state
		// do not consider AzureAssignedIdentities in CREATED state because they are either:
		// 1. in the process of assigning the identities on Azure or
		// 2. encountering errors when assigning identities on Azure
		// An assignment without an identity reference cannot contribute a resource ID.
		identity := assignedID.Spec.AzureIdentityRef
		if assignedID.Status.Status == aadpodid.AssignedIDAssigned && identity != nil && identity.Spec.Type == aadpodid.UserAssignedMSI {
			if _, ok := desiredState[nodeName]; !ok {
				desiredState[nodeName] = make(map[string]bool)
			}
			desiredState[nodeName][identity.Spec.ResourceID] = true
		}

		if _, ok := currentState[nodeName]; !ok {
			currentState[nodeName] = make(map[string]bool)
			idList, err := c.getUserMSIListForNode(nodeName, meta.isVMSS)
			if err != nil {
				return nil, nil, nil, fmt.Errorf("failed to get a list of user-assigned identites from node %s, error: %+v", nodeName, err)
			}

			for _, identityResourceID := range idList {
				currentState[nodeName][identityResourceID] = true
			}
		}
	}

	return currentState, desiredState, isVMSSMap, nil
}

// generateIdentityAssignmentDiff performs a diff between current
// and desired state of identity assignment on Azure and returns
// a map with the node name as the key and a list of user-assigned
// identities we should assign to the node as the value. 1433 func generateIdentityAssignmentDiff(currentState map[string]map[string]bool, desiredState map[string]map[string]bool) map[string][]string { 1434 diff := make(map[string][]string) 1435 for nodeName, identityResourceIDs := range desiredState { 1436 var identitiesToAssign []string 1437 for identityResourceID := range identityResourceIDs { 1438 if _, ok := currentState[nodeName]; ok && currentState[nodeName][identityResourceID] { 1439 continue 1440 } 1441 identitiesToAssign = append(identitiesToAssign, identityResourceID) 1442 } 1443 1444 if len(identitiesToAssign) > 0 { 1445 diff[nodeName] = identitiesToAssign 1446 } 1447 } 1448 1449 return diff 1450 } 1451 1452 // reconcileIdentityAssignment uses the existing list of AzureAssignedIdentities 1453 // as the single source of truth and reconciles identity assignment on Azure. 1454 func (c *Client) reconcileIdentityAssignment() { 1455 currentState, desiredState, isVMSSMap, err := c.generateIdentityAssignmentState() 1456 if err != nil { 1457 klog.Errorf("failed to generate identity assignment state, error: %+v", err) 1458 return 1459 } 1460 1461 klog.V(6).Infof("current state of identity assignment on Azure: %+v", currentState) 1462 klog.V(6).Infof("desired state of identity assignment on Azure: %+v", desiredState) 1463 1464 diff := generateIdentityAssignmentDiff(currentState, desiredState) 1465 for nodeNameOnAzure, identitiesToAssign := range diff { 1466 klog.Infof("reconciling identity assignment for %v on node %s", identitiesToAssign, nodeNameOnAzure) 1467 if err := c.CloudClient.UpdateUserMSI(identitiesToAssign, nil, nodeNameOnAzure, isVMSSMap[nodeNameOnAzure]); err != nil { 1468 klog.Errorf("failed to update user-assigned identities on node %s, error: %+v", nodeNameOnAzure, err) 1469 } 1470 } 1471 }