package sharding

import (
	"context"
	"encoding/json"
	stderrors "errors"
	"fmt"
	"hash/fnv"
	"math"
	"os"
	"slices"
	"sort"
	"strconv"
	"strings"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"

	"github.com/argoproj/argo-cd/v3/common"
	"github.com/argoproj/argo-cd/v3/controller/sharding/consistent"
	"github.com/argoproj/argo-cd/v3/pkg/apis/application/v1alpha1"

	log "github.com/sirupsen/logrus"
	apierrors "k8s.io/apimachinery/pkg/api/errors"

	"github.com/argoproj/argo-cd/v3/util/db"
	"github.com/argoproj/argo-cd/v3/util/env"
	"github.com/argoproj/argo-cd/v3/util/errors"
	"github.com/argoproj/argo-cd/v3/util/settings"
)

// osHostnameFunction resolves the pod hostname.
// Declared as a package variable so tests can override it.
var osHostnameFunction = os.Hostname

// heartbeatCurrentTime returns the current time for heartbeat stamps.
// Declared as a package variable so tests can override it.
var heartbeatCurrentTime = metav1.Now

var (
	// HeartbeatDuration is the heartbeat interval in seconds, read from the
	// environment (bounded to 10-60, default 10).
	HeartbeatDuration = env.ParseNumFromEnv(common.EnvControllerHeartbeatTime, 10, 10, 60)
	// HeartbeatTimeout is the number of seconds after which a controller's
	// heartbeat is considered stale (three missed heartbeats).
	HeartbeatTimeout = 3 * HeartbeatDuration
)

// ShardControllerMappingKey is the ConfigMap data key under which the
// shard-to-controller mapping JSON is stored.
const ShardControllerMappingKey = "shardControllerMapping"

type (
	// DistributionFunction maps a cluster to the shard number that should
	// process it; -1 means "no shard assigned".
	DistributionFunction func(c *v1alpha1.Cluster) int
	// ClusterFilterFunction reports whether a cluster should be processed by
	// the current shard.
	ClusterFilterFunction func(c *v1alpha1.Cluster) bool
	// clusterAccessor lazily supplies the current list of clusters.
	clusterAccessor func() []*v1alpha1.Cluster
	// appAccessor lazily supplies the current list of applications.
	appAccessor func() []*v1alpha1.Application
)

// shardApplicationControllerMapping stores the mapping of Shard Number to Application Controller in ConfigMap.
// It also stores the heartbeat of last synced time of the application controller.
type shardApplicationControllerMapping struct {
	ShardNumber    int
	ControllerName string
	HeartbeatTime  metav1.Time
}
It calls the distributionFunction 64 // to determine which shard will process the cluster, and if the given shard is equal to the calculated shard 65 // the function will return true. 66 func GetClusterFilter(_ db.ArgoDB, distributionFunction DistributionFunction, replicas, shard int) ClusterFilterFunction { 67 return func(c *v1alpha1.Cluster) bool { 68 clusterShard := 0 69 if c != nil && c.Shard != nil { 70 requestedShard := int(*c.Shard) 71 if requestedShard < replicas { 72 clusterShard = requestedShard 73 } else { 74 log.Warnf("Specified cluster shard (%d) for cluster: %s is greater than the number of available shard. Assigning automatically.", requestedShard, c.Name) 75 } 76 } else { 77 clusterShard = distributionFunction(c) 78 } 79 return clusterShard == shard 80 } 81 } 82 83 // GetDistributionFunction returns which DistributionFunction should be used based on the passed algorithm and 84 // the current datas. 85 func GetDistributionFunction(clusters clusterAccessor, apps appAccessor, shardingAlgorithm string, replicasCount int) DistributionFunction { 86 log.Debugf("Using filter function: %s", shardingAlgorithm) 87 distributionFunction := LegacyDistributionFunction(replicasCount) 88 switch shardingAlgorithm { 89 case common.RoundRobinShardingAlgorithm: 90 distributionFunction = RoundRobinDistributionFunction(clusters, replicasCount) 91 case common.LegacyShardingAlgorithm: 92 distributionFunction = LegacyDistributionFunction(replicasCount) 93 case common.ConsistentHashingWithBoundedLoadsAlgorithm: 94 distributionFunction = ConsistentHashingWithBoundedLoadsDistributionFunction(clusters, apps, replicasCount) 95 default: 96 log.Warnf("distribution type %s is not supported, defaulting to %s", shardingAlgorithm, common.DefaultShardingAlgorithm) 97 } 98 return distributionFunction 99 } 100 101 // LegacyDistributionFunction returns a DistributionFunction using a stable distribution algorithm: 102 // for a given cluster the function will return the shard number based on 
the cluster id. This function 103 // is lightweight and can be distributed easily, however, it does not ensure an homogenous distribution as 104 // some shards may get assigned more clusters than others. It is the legacy function distribution that is 105 // kept for compatibility reasons 106 func LegacyDistributionFunction(replicas int) DistributionFunction { 107 return func(c *v1alpha1.Cluster) int { 108 if replicas == 0 { 109 log.Debugf("Replicas count is : %d, returning -1", replicas) 110 return -1 111 } 112 if c == nil { 113 log.Debug("In-cluster: returning 0") 114 return 0 115 } 116 // if Shard is manually set and the assigned value is lower than the number of replicas, 117 // then its value is returned otherwise it is the default calculated value 118 if c.Shard != nil && int(*c.Shard) < replicas { 119 return int(*c.Shard) 120 } 121 id := c.ID 122 log.Debugf("Calculating cluster shard for cluster id: %s", id) 123 if id == "" { 124 return 0 125 } 126 h := fnv.New32a() 127 _, _ = h.Write([]byte(id)) 128 shard := int32(h.Sum32() % uint32(replicas)) 129 log.Debugf("Cluster with id=%s will be processed by shard %d", id, shard) 130 return int(shard) 131 } 132 } 133 134 // RoundRobinDistributionFunction returns a DistributionFunction using an homogeneous distribution algorithm: 135 // for a given cluster the function will return the shard number based on the modulo of the cluster rank in 136 // the cluster's list sorted by uid on the shard number. 137 // This function ensures an homogenous distribution: each shards got assigned the same number of 138 // clusters +/-1 , but with the drawback of a reshuffling of clusters across shards in case of some changes 139 // in the cluster list 140 141 func RoundRobinDistributionFunction(clusters clusterAccessor, replicas int) DistributionFunction { 142 return func(c *v1alpha1.Cluster) int { 143 if replicas > 0 { 144 if c == nil { // in-cluster does not necessarily have a secret assigned. So we are receiving a nil cluster here. 
145 return 0 146 } 147 // if Shard is manually set and the assigned value is lower than the number of replicas, 148 // then its value is returned otherwise it is the default calculated value 149 if c.Shard != nil && int(*c.Shard) < replicas { 150 return int(*c.Shard) 151 } 152 clusterIndexdByClusterIdMap := createClusterIndexByClusterIdMap(clusters) 153 clusterIndex, ok := clusterIndexdByClusterIdMap[c.ID] 154 if !ok { 155 log.Warnf("Cluster with id=%s not found in cluster map.", c.ID) 156 return -1 157 } 158 shard := int(clusterIndex % replicas) 159 log.Debugf("Cluster with id=%s will be processed by shard %d", c.ID, shard) 160 return shard 161 } 162 log.Warnf("The number of replicas (%d) is lower than 1", replicas) 163 return -1 164 } 165 } 166 167 // ConsistentHashingWithBoundedLoadsDistributionFunction returns a DistributionFunction using an almost homogeneous distribution algorithm: 168 // for a given cluster the function will return the shard number based on a consistent hashing with bounded loads algorithm. 169 // This function ensures an almost homogenous distribution: each shards got assigned the fairly similar number of 170 // clusters +/-10% , but with it is resilient to sharding and/or number of clusters changes. 171 func ConsistentHashingWithBoundedLoadsDistributionFunction(clusters clusterAccessor, apps appAccessor, replicas int) DistributionFunction { 172 return func(c *v1alpha1.Cluster) int { 173 if replicas > 0 { 174 if c == nil { // in-cluster does not necessarily have a secret assigned. So we are receiving a nil cluster here. 
175 return 0 176 } 177 178 // if Shard is manually set and the assigned value is lower than the number of replicas, 179 // then its value is returned otherwise it is the default calculated value 180 if c.Shard != nil && int(*c.Shard) < replicas { 181 return int(*c.Shard) 182 } 183 // if the cluster is not in the clusters list anymore, we should unassign it from any shard, so we 184 // return the reserved value of -1 185 if !slices.Contains(clusters(), c) { 186 log.Warnf("Cluster with id=%s not found in cluster map.", c.ID) 187 return -1 188 } 189 shardIndexedByCluster := createConsistentHashingWithBoundLoads(replicas, clusters, apps) 190 shard, ok := shardIndexedByCluster[c.ID] 191 if !ok { 192 log.Warnf("Cluster with id=%s not found in cluster map.", c.ID) 193 return -1 194 } 195 log.Debugf("Cluster with id=%s will be processed by shard %d", c.ID, shard) 196 return shard 197 } 198 log.Warnf("The number of replicas (%d) is lower than 1", replicas) 199 return -1 200 } 201 } 202 203 func createConsistentHashingWithBoundLoads(replicas int, getCluster clusterAccessor, getApp appAccessor) map[string]int { 204 clusters := getSortedClustersList(getCluster) 205 appDistribution := getAppDistribution(getCluster, getApp) 206 shardIndexedByCluster := make(map[string]int) 207 appsIndexedByShard := make(map[string]int64) 208 consistentHashing := consistent.New() 209 // Adding a shard with id "-1" as a reserved value for clusters that does not have an assigned shard 210 // this happens for clusters that are removed for the clusters list 211 // consistentHashing.Add("-1") 212 for i := 0; i < replicas; i++ { 213 shard := strconv.Itoa(i) 214 consistentHashing.Add(shard) 215 appsIndexedByShard[shard] = 0 216 } 217 218 for _, c := range clusters { 219 clusterIndex, err := consistentHashing.GetLeast(c.ID) 220 if err != nil { 221 log.Warnf("Cluster with id=%s not found in cluster map.", c.ID) 222 } 223 shardIndexedByCluster[c.ID], err = strconv.Atoi(clusterIndex) 224 if err != nil { 225 
log.Errorf("Consistent Hashing was supposed to return a shard index but it returned %d", err) 226 } 227 numApps, ok := appDistribution[c.Server] 228 if !ok { 229 numApps = 0 230 } 231 appsIndexedByShard[clusterIndex] += numApps 232 consistentHashing.UpdateLoad(clusterIndex, appsIndexedByShard[clusterIndex]) 233 } 234 235 return shardIndexedByCluster 236 } 237 238 func getAppDistribution(getCluster clusterAccessor, getApps appAccessor) map[string]int64 { 239 apps := getApps() 240 clusters := getCluster() 241 appDistribution := make(map[string]int64, len(clusters)) 242 243 for _, a := range apps { 244 if _, ok := appDistribution[a.Spec.Destination.Server]; !ok { 245 appDistribution[a.Spec.Destination.Server] = 0 246 } 247 appDistribution[a.Spec.Destination.Server]++ 248 } 249 return appDistribution 250 } 251 252 // NoShardingDistributionFunction returns a DistributionFunction that will process all cluster by shard 0 253 // the function is created for API compatibility purposes and is not supposed to be activated. 254 func NoShardingDistributionFunction() DistributionFunction { 255 return func(_ *v1alpha1.Cluster) int { return 0 } 256 } 257 258 // InferShard extracts the shard index based on its hostname. 
259 func InferShard() (int, error) { 260 hostname, err := osHostnameFunction() 261 if err != nil { 262 return -1, err 263 } 264 parts := strings.Split(hostname, "-") 265 if len(parts) == 0 { 266 log.Warnf("hostname should end with shard number separated by '-' but got: %s", hostname) 267 return 0, nil 268 } 269 shard, err := strconv.Atoi(parts[len(parts)-1]) 270 if err != nil { 271 log.Warnf("hostname should end with shard number separated by '-' but got: %s", hostname) 272 return 0, nil 273 } 274 return int(shard), nil 275 } 276 277 func getSortedClustersList(getCluster clusterAccessor) []*v1alpha1.Cluster { 278 clusters := getCluster() 279 sort.Slice(clusters, func(i, j int) bool { 280 return clusters[i].ID < clusters[j].ID 281 }) 282 return clusters 283 } 284 285 func createClusterIndexByClusterIdMap(getCluster clusterAccessor) map[string]int { 286 clusters := getSortedClustersList(getCluster) 287 log.Debugf("ClustersList has %d items", len(clusters)) 288 clusterById := make(map[string]*v1alpha1.Cluster) 289 clusterIndexedByClusterId := make(map[string]int) 290 for i, cluster := range clusters { 291 log.Debugf("Adding cluster with id=%s and name=%s to cluster's map", cluster.ID, cluster.Name) 292 clusterById[cluster.ID] = cluster 293 clusterIndexedByClusterId[cluster.ID] = i 294 } 295 return clusterIndexedByClusterId 296 } 297 298 // GetOrUpdateShardFromConfigMap finds the shard number from the shard mapping configmap. If the shard mapping configmap does not exist, 299 // the function creates the shard mapping configmap. 300 // The function takes the shard number from the environment variable (default value -1, if not set) and passes it to this function. 301 // If the shard value passed to this function is -1, that is, the shard was not set as an environment variable, 302 // we default the shard number to 0 for computing the default config map. 
303 func GetOrUpdateShardFromConfigMap(kubeClient kubernetes.Interface, settingsMgr *settings.SettingsManager, replicas, shard int) (int, error) { 304 hostname, err := osHostnameFunction() 305 if err != nil { 306 return -1, err 307 } 308 309 // fetch the shard mapping configMap 310 shardMappingCM, err := kubeClient.CoreV1().ConfigMaps(settingsMgr.GetNamespace()).Get(context.Background(), common.ArgoCDAppControllerShardConfigMapName, metav1.GetOptions{}) 311 if err != nil { 312 if !apierrors.IsNotFound(err) { 313 return -1, fmt.Errorf("error getting sharding config map: %w", err) 314 } 315 log.Infof("shard mapping configmap %s not found. Creating default shard mapping configmap.", common.ArgoCDAppControllerShardConfigMapName) 316 317 // if the shard is not set as an environment variable, set the default value of shard to 0 for generating default CM 318 if shard == -1 { 319 shard = 0 320 } 321 shardMappingCM, err = generateDefaultShardMappingCM(settingsMgr.GetNamespace(), hostname, replicas, shard) 322 if err != nil { 323 return -1, fmt.Errorf("error generating default shard mapping configmap %w", err) 324 } 325 if _, err = kubeClient.CoreV1().ConfigMaps(settingsMgr.GetNamespace()).Create(context.Background(), shardMappingCM, metav1.CreateOptions{}); err != nil { 326 return -1, fmt.Errorf("error creating shard mapping configmap %w", err) 327 } 328 // return 0 as the controller is assigned to shard 0 while generating default shard mapping ConfigMap 329 return shard, nil 330 } 331 // Identify the available shard and update the ConfigMap 332 data := shardMappingCM.Data[ShardControllerMappingKey] 333 var shardMappingData []shardApplicationControllerMapping 334 err = json.Unmarshal([]byte(data), &shardMappingData) 335 if err != nil { 336 return -1, fmt.Errorf("error unmarshalling shard config map data: %w", err) 337 } 338 339 shard, shardMappingData = getOrUpdateShardNumberForController(shardMappingData, hostname, replicas, shard) 340 updatedShardMappingData, err := 
json.Marshal(shardMappingData) 341 if err != nil { 342 return -1, fmt.Errorf("error marshalling data of shard mapping ConfigMap: %w", err) 343 } 344 shardMappingCM.Data[ShardControllerMappingKey] = string(updatedShardMappingData) 345 346 _, err = kubeClient.CoreV1().ConfigMaps(settingsMgr.GetNamespace()).Update(context.Background(), shardMappingCM, metav1.UpdateOptions{}) 347 if err != nil { 348 return -1, err 349 } 350 return shard, nil 351 } 352 353 // getOrUpdateShardNumberForController takes list of shardApplicationControllerMapping and performs computation to find the matching or empty shard number 354 func getOrUpdateShardNumberForController(shardMappingData []shardApplicationControllerMapping, hostname string, replicas, shard int) (int, []shardApplicationControllerMapping) { 355 // if current length of shardMappingData in shard mapping configMap is less than the number of replicas, 356 // create additional empty entries for missing shard numbers in shardMappingDataconfigMap 357 if len(shardMappingData) < replicas { 358 // generate extra default mappings 359 for currentShard := len(shardMappingData); currentShard < replicas; currentShard++ { 360 shardMappingData = append(shardMappingData, shardApplicationControllerMapping{ 361 ShardNumber: currentShard, 362 }) 363 } 364 } 365 366 // if current length of shardMappingData in shard mapping configMap is more than the number of replicas, 367 // we replace the config map with default config map and let controllers self assign the new shard to itself 368 if len(shardMappingData) > replicas { 369 shardMappingData = getDefaultShardMappingData(replicas) 370 } 371 372 if shard != -1 && shard < replicas { 373 log.Debugf("update heartbeat for shard %d", shard) 374 for i := range shardMappingData { 375 shardMapping := shardMappingData[i] 376 if shardMapping.ShardNumber == shard { 377 log.Debugf("Shard found. 
Updating heartbeat!!") 378 shardMapping.ControllerName = hostname 379 shardMapping.HeartbeatTime = heartbeatCurrentTime() 380 shardMappingData[i] = shardMapping 381 break 382 } 383 } 384 } else { 385 // find the matching shard with assigned controllerName 386 for i := range shardMappingData { 387 shardMapping := shardMappingData[i] 388 if shardMapping.ControllerName == hostname { 389 log.Debugf("Shard matched. Updating heartbeat!!") 390 shard = int(shardMapping.ShardNumber) 391 shardMapping.HeartbeatTime = heartbeatCurrentTime() 392 shardMappingData[i] = shardMapping 393 break 394 } 395 } 396 } 397 398 // at this point, we have still not found a shard with matching hostname. 399 // So, find a shard with either no controller assigned or assigned controller 400 // with heartbeat past threshold 401 if shard == -1 { 402 for i := range shardMappingData { 403 shardMapping := shardMappingData[i] 404 if (shardMapping.ControllerName == "") || (metav1.Now().After(shardMapping.HeartbeatTime.Add(time.Duration(HeartbeatTimeout) * time.Second))) { 405 shard = int(shardMapping.ShardNumber) 406 log.Debugf("Empty shard found %d", shard) 407 shardMapping.ControllerName = hostname 408 shardMapping.HeartbeatTime = heartbeatCurrentTime() 409 shardMappingData[i] = shardMapping 410 break 411 } 412 } 413 } 414 return shard, shardMappingData 415 } 416 417 // generateDefaultShardMappingCM creates a default shard mapping configMap. Assigns current controller to shard 0. 
418 func generateDefaultShardMappingCM(namespace, hostname string, replicas, shard int) (*corev1.ConfigMap, error) { 419 shardingCM := &corev1.ConfigMap{ 420 ObjectMeta: metav1.ObjectMeta{ 421 Name: common.ArgoCDAppControllerShardConfigMapName, 422 Namespace: namespace, 423 }, 424 Data: map[string]string{}, 425 } 426 427 shardMappingData := getDefaultShardMappingData(replicas) 428 429 // if shard is not assigned to a controller, we use shard 0 430 if shard == -1 || shard > replicas { 431 shard = 0 432 } 433 shardMappingData[shard].ControllerName = hostname 434 shardMappingData[shard].HeartbeatTime = heartbeatCurrentTime() 435 436 data, err := json.Marshal(shardMappingData) 437 if err != nil { 438 return nil, fmt.Errorf("error generating default ConfigMap: %w", err) 439 } 440 shardingCM.Data[ShardControllerMappingKey] = string(data) 441 442 return shardingCM, nil 443 } 444 445 func getDefaultShardMappingData(replicas int) []shardApplicationControllerMapping { 446 shardMappingData := make([]shardApplicationControllerMapping, 0) 447 448 for i := 0; i < replicas; i++ { 449 mapping := shardApplicationControllerMapping{ 450 ShardNumber: i, 451 } 452 shardMappingData = append(shardMappingData, mapping) 453 } 454 return shardMappingData 455 } 456 457 func GetClusterSharding(kubeClient kubernetes.Interface, settingsMgr *settings.SettingsManager, shardingAlgorithm string, enableDynamicClusterDistribution bool) (ClusterShardingCache, error) { 458 var replicasCount int 459 if enableDynamicClusterDistribution { 460 applicationControllerName := env.StringFromEnv(common.EnvAppControllerName, common.DefaultApplicationControllerName) 461 appControllerDeployment, err := kubeClient.AppsV1().Deployments(settingsMgr.GetNamespace()).Get(context.Background(), applicationControllerName, metav1.GetOptions{}) 462 // if app controller deployment is not found when dynamic cluster distribution is enabled error out 463 if err != nil { 464 return nil, fmt.Errorf("(dynamic cluster distribution) 
failed to get app controller deployment: %w", err) 465 } 466 467 if appControllerDeployment == nil || appControllerDeployment.Spec.Replicas == nil { 468 return nil, stderrors.New("(dynamic cluster distribution) failed to get app controller deployment replica count") 469 } 470 replicasCount = int(*appControllerDeployment.Spec.Replicas) 471 } else { 472 replicasCount = env.ParseNumFromEnv(common.EnvControllerReplicas, 0, 0, math.MaxInt32) 473 } 474 shardNumber := env.ParseNumFromEnv(common.EnvControllerShard, -1, -math.MaxInt32, math.MaxInt32) 475 if replicasCount > 1 { 476 // check for shard mapping using configmap if application-controller is a deployment 477 // else use existing logic to infer shard from pod name if application-controller is a statefulset 478 if enableDynamicClusterDistribution { 479 var err error 480 // retry 3 times if we find a conflict while updating shard mapping configMap. 481 // If we still see conflicts after the retries, wait for next iteration of heartbeat process. 482 for i := 0; i <= common.AppControllerHeartbeatUpdateRetryCount; i++ { 483 shardNumber, err = GetOrUpdateShardFromConfigMap(kubeClient, settingsMgr, replicasCount, shardNumber) 484 if err != nil && !apierrors.IsConflict(err) { 485 err = fmt.Errorf("unable to get shard due to error updating the sharding config map: %w", err) 486 break 487 } 488 // if `err == nil`, should not log the following warning message 489 if err != nil { 490 log.Warnf("conflict when getting shard from shard mapping configMap. Retrying (%d/3)", i) 491 } 492 } 493 errors.CheckError(err) 494 } else { 495 if shardNumber < 0 { 496 var err error 497 shardNumber, err = InferShard() 498 errors.CheckError(err) 499 } 500 if shardNumber > replicasCount { 501 log.Warnf("Calculated shard number %d is greated than the number of replicas count. 
Defaulting to 0", shardNumber) 502 shardNumber = 0 503 } 504 } 505 } else { 506 log.Info("Processing all cluster shards") 507 shardNumber = 0 508 } 509 db := db.NewDB(settingsMgr.GetNamespace(), settingsMgr, kubeClient) 510 return NewClusterSharding(db, shardNumber, replicasCount, shardingAlgorithm), nil 511 }