github.com/argoproj/argo-cd/v2@v2.10.9/controller/sharding/sharding.go

package sharding

import (
	"context"
	"encoding/json"
	"fmt"
	"hash/fnv"
	"math"
	"os"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/argoproj/argo-cd/v2/common"
	"github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"

	"github.com/argoproj/argo-cd/v2/util/db"
	"github.com/argoproj/argo-cd/v2/util/env"
	"github.com/argoproj/argo-cd/v2/util/errors"
	"github.com/argoproj/argo-cd/v2/util/settings"
	log "github.com/sirupsen/logrus"
	kubeerrors "k8s.io/apimachinery/pkg/api/errors"
)

// Make it overridable for testing
var osHostnameFunction = os.Hostname

// Make it overridable for testing
var heartbeatCurrentTime = metav1.Now

var (
	HeartbeatDuration = env.ParseNumFromEnv(common.EnvControllerHeartbeatTime, 10, 10, 60)
	HeartbeatTimeout  = 3 * HeartbeatDuration
)

const ShardControllerMappingKey = "shardControllerMapping"

type DistributionFunction func(c *v1alpha1.Cluster) int
type ClusterFilterFunction func(c *v1alpha1.Cluster) bool
type clusterAccessor func() []*v1alpha1.Cluster

// shardApplicationControllerMapping stores the mapping of shard number to application controller in a ConfigMap.
// It also stores the heartbeat recording the last time the application controller synced.
type shardApplicationControllerMapping struct {
	ShardNumber    int
	ControllerName string
	HeartbeatTime  metav1.Time
}

// GetClusterFilter returns a ClusterFilterFunction, which takes a cluster as a parameter
// and returns whether or not the cluster should be processed by a given shard. It calls the distributionFunction
// to determine which shard will process the cluster; if the given shard is equal to the calculated shard,
// the function returns true.
func GetClusterFilter(db db.ArgoDB, distributionFunction DistributionFunction, replicas, shard int) ClusterFilterFunction {
	return func(c *v1alpha1.Cluster) bool {
		clusterShard := 0
		if c != nil && c.Shard != nil {
			requestedShard := int(*c.Shard)
			if requestedShard < replicas {
				clusterShard = requestedShard
			} else {
				log.Warnf("Specified cluster shard (%d) for cluster: %s is greater than the number of available shards. Assigning automatically.", requestedShard, c.Name)
			}
		} else {
			clusterShard = distributionFunction(c)
		}
		return clusterShard == shard
	}
}
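
// The following is an illustrative sketch, not part of the upstream file: a
// replica running as shard 1 of 3 uses the returned filter to decide whether
// it owns a given cluster. The db handle and cluster value are assumed to be
// supplied by the caller.
func exampleClusterFilter(argoDB db.ArgoDB, c *v1alpha1.Cluster) {
	filter := GetClusterFilter(argoDB, LegacyDistributionFunction(3), 3, 1)
	if filter(c) {
		log.Infof("cluster %s is handled by this shard", c.Name)
	}
}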

// GetDistributionFunction returns the DistributionFunction to use based on the passed algorithm and
// the current data.
func GetDistributionFunction(clusters clusterAccessor, shardingAlgorithm string, replicasCount int) DistributionFunction {
	log.Debugf("Using filter function: %s", shardingAlgorithm)
	distributionFunction := LegacyDistributionFunction(replicasCount)
	switch shardingAlgorithm {
	case common.RoundRobinShardingAlgorithm:
		distributionFunction = RoundRobinDistributionFunction(clusters, replicasCount)
	case common.LegacyShardingAlgorithm:
		distributionFunction = LegacyDistributionFunction(replicasCount)
	default:
		log.Warnf("distribution type %s is not supported, defaulting to %s", shardingAlgorithm, common.DefaultShardingAlgorithm)
	}
	return distributionFunction
}

// LegacyDistributionFunction returns a DistributionFunction using a stable distribution algorithm:
// for a given cluster, the function returns the shard number based on the cluster ID. This function
// is lightweight and can be distributed easily; however, it does not ensure a homogeneous distribution, as
// some shards may get assigned more clusters than others. It is the legacy distribution function,
// kept for compatibility reasons.
func LegacyDistributionFunction(replicas int) DistributionFunction {
	return func(c *v1alpha1.Cluster) int {
		if replicas == 0 {
			log.Debugf("Replicas count is: %d, returning -1", replicas)
			return -1
		}
		if c == nil {
			log.Debug("In-cluster: returning 0")
			return 0
		}
		// If Shard is manually set and the assigned value is lower than the number of replicas,
		// then its value is returned; otherwise the default calculated value is used.
		if c.Shard != nil && int(*c.Shard) < replicas {
			return int(*c.Shard)
		}
		id := c.ID
		log.Debugf("Calculating cluster shard for cluster id: %s", id)
		if id == "" {
			return 0
		}
		h := fnv.New32a()
		_, _ = h.Write([]byte(id))
		shard := int32(h.Sum32() % uint32(replicas))
		log.Debugf("Cluster with id=%s will be processed by shard %d", id, shard)
		return int(shard)
	}
}
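
// Illustrative sketch, not part of the upstream file: with 3 replicas the
// legacy algorithm derives the shard purely from the FNV-32a hash of the
// cluster ID, so the assignment is stable across restarts but not guaranteed
// to be balanced. The cluster ID below is hypothetical.
func exampleLegacyDistribution() {
	f := LegacyDistributionFunction(3)
	c := &v1alpha1.Cluster{ID: "f81d4fae-7dec-11d0-a765-00a0c91e6bf6"} // hypothetical ID
	log.Infof("cluster assigned to shard %d", f(c))                    // fnv32a(ID) % 3
}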

// RoundRobinDistributionFunction returns a DistributionFunction using a homogeneous distribution algorithm:
// for a given cluster, the function returns the shard number based on the modulo of the cluster's rank in
// the cluster list (sorted by ID) and the number of shards.
// This function ensures a homogeneous distribution: each shard gets assigned the same number of
// clusters +/-1, but with the drawback of reshuffling clusters across shards when the cluster list changes.
func RoundRobinDistributionFunction(clusters clusterAccessor, replicas int) DistributionFunction {
	return func(c *v1alpha1.Cluster) int {
		if replicas > 0 {
			if c == nil { // The in-cluster cluster does not necessarily have a secret assigned, so we may receive a nil cluster here.
				return 0
			}
			// If Shard is manually set and the assigned value is lower than the number of replicas,
			// then its value is returned; otherwise the default calculated value is used.
			if c.Shard != nil && int(*c.Shard) < replicas {
				return int(*c.Shard)
			}
			clusterIndexedByClusterIdMap := createClusterIndexByClusterIdMap(clusters)
			clusterIndex, ok := clusterIndexedByClusterIdMap[c.ID]
			if !ok {
				log.Warnf("Cluster with id=%s not found in cluster map.", c.ID)
				return -1
			}
			shard := int(clusterIndex % replicas)
			log.Debugf("Cluster with id=%s will be processed by shard %d", c.ID, shard)
			return shard
		}
		log.Warnf("The number of replicas (%d) is lower than 1", replicas)
		return -1
	}
}

// NoShardingDistributionFunction returns a DistributionFunction that processes all clusters on shard 0.
// The function exists for API compatibility purposes and is not supposed to be activated.
func NoShardingDistributionFunction() DistributionFunction {
	return func(c *v1alpha1.Cluster) int { return 0 }
}

// InferShard extracts the shard index from the pod's hostname.
func InferShard() (int, error) {
	hostname, err := osHostnameFunction()
	if err != nil {
		return -1, err
	}
	parts := strings.Split(hostname, "-")
	if len(parts) == 0 {
		log.Warnf("hostname should end with shard number separated by '-' but got: %s", hostname)
		return 0, nil
	}
	shard, err := strconv.Atoi(parts[len(parts)-1])
	if err != nil {
		log.Warnf("hostname should end with shard number separated by '-' but got: %s", hostname)
		return 0, nil
	}
	return shard, nil
}

func getSortedClustersList(getCluster clusterAccessor) []*v1alpha1.Cluster {
	clusters := getCluster()
	sort.Slice(clusters, func(i, j int) bool {
		return clusters[i].ID < clusters[j].ID
	})
	return clusters
}

func createClusterIndexByClusterIdMap(getCluster clusterAccessor) map[string]int {
	clusters := getSortedClustersList(getCluster)
	log.Debugf("ClustersList has %d items", len(clusters))
	clusterById := make(map[string]*v1alpha1.Cluster)
	clusterIndexedByClusterId := make(map[string]int)
	for i, cluster := range clusters {
		log.Debugf("Adding cluster with id=%s and name=%s to cluster's map", cluster.ID, cluster.Name)
		clusterById[cluster.ID] = cluster
		clusterIndexedByClusterId[cluster.ID] = i
	}
	return clusterIndexedByClusterId
}
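
// Illustrative sketch, not part of the upstream file: the round-robin
// algorithm ranks clusters by ID and takes the rank modulo the replica count,
// so three clusters spread over two shards as 0, 1, 0. The IDs are hypothetical.
func exampleRoundRobinDistribution() {
	accessor := func() []*v1alpha1.Cluster {
		return []*v1alpha1.Cluster{{ID: "a"}, {ID: "b"}, {ID: "c"}}
	}
	f := RoundRobinDistributionFunction(accessor, 2)
	for _, c := range accessor() {
		log.Infof("cluster %s -> shard %d", c.ID, f(c))
	}
}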

// GetOrUpdateShardFromConfigMap finds the shard number for this controller in the shard mapping ConfigMap.
// If the shard mapping ConfigMap does not exist, the function creates it.
// The caller takes the shard number from the environment variable (default value -1, if not set) and passes it to this function.
// If the shard value passed to this function is -1, that is, the shard was not set as an environment variable,
// we default the shard number to 0 for computing the default ConfigMap.
func GetOrUpdateShardFromConfigMap(kubeClient kubernetes.Interface, settingsMgr *settings.SettingsManager, replicas, shard int) (int, error) {
	hostname, err := osHostnameFunction()
	if err != nil {
		return -1, err
	}

	// Fetch the shard mapping ConfigMap.
	shardMappingCM, err := kubeClient.CoreV1().ConfigMaps(settingsMgr.GetNamespace()).Get(context.Background(), common.ArgoCDAppControllerShardConfigMapName, metav1.GetOptions{})
	if err != nil {
		if !kubeerrors.IsNotFound(err) {
			return -1, fmt.Errorf("error getting sharding config map: %s", err)
		}
		log.Infof("shard mapping configmap %s not found. Creating default shard mapping configmap.", common.ArgoCDAppControllerShardConfigMapName)

		// If the shard is not set as an environment variable, default it to 0 for generating the default ConfigMap.
		if shard == -1 {
			shard = 0
		}
		shardMappingCM, err = generateDefaultShardMappingCM(settingsMgr.GetNamespace(), hostname, replicas, shard)
		if err != nil {
			return -1, fmt.Errorf("error generating default shard mapping configmap: %s", err)
		}
		if _, err = kubeClient.CoreV1().ConfigMaps(settingsMgr.GetNamespace()).Create(context.Background(), shardMappingCM, metav1.CreateOptions{}); err != nil {
			return -1, fmt.Errorf("error creating shard mapping configmap: %s", err)
		}
		// Return the shard this controller was assigned while generating the default shard mapping ConfigMap.
		return shard, nil
	}

	// Identify the available shard and update the ConfigMap.
	data := shardMappingCM.Data[ShardControllerMappingKey]
	var shardMappingData []shardApplicationControllerMapping
	err = json.Unmarshal([]byte(data), &shardMappingData)
	if err != nil {
		return -1, fmt.Errorf("error unmarshalling shard config map data: %s", err)
	}

	shard, shardMappingData = getOrUpdateShardNumberForController(shardMappingData, hostname, replicas, shard)
	updatedShardMappingData, err := json.Marshal(shardMappingData)
	if err != nil {
		return -1, fmt.Errorf("error marshalling data of shard mapping ConfigMap: %s", err)
	}
	shardMappingCM.Data[ShardControllerMappingKey] = string(updatedShardMappingData)

	_, err = kubeClient.CoreV1().ConfigMaps(settingsMgr.GetNamespace()).Update(context.Background(), shardMappingCM, metav1.UpdateOptions{})
	if err != nil {
		return -1, err
	}
	return shard, nil
}
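
// For reference, a hedged reconstruction of the ConfigMap payload from the
// types above: the data stored under the "shardControllerMapping" key is a
// JSON array of shardApplicationControllerMapping entries (the hostnames here
// are hypothetical), e.g.:
//
//	[
//	  {"ShardNumber": 0, "ControllerName": "argocd-application-controller-0", "HeartbeatTime": "2024-01-01T00:00:00Z"},
//	  {"ShardNumber": 1, "ControllerName": "", "HeartbeatTime": null}
//	]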

// getOrUpdateShardNumberForController takes a list of shardApplicationControllerMapping and finds the matching or first available shard number.
func getOrUpdateShardNumberForController(shardMappingData []shardApplicationControllerMapping, hostname string, replicas, shard int) (int, []shardApplicationControllerMapping) {
	// If the current length of shardMappingData in the shard mapping ConfigMap is less than the number of replicas,
	// create additional empty entries for the missing shard numbers.
	if len(shardMappingData) < replicas {
		// Generate extra default mappings.
		for currentShard := len(shardMappingData); currentShard < replicas; currentShard++ {
			shardMappingData = append(shardMappingData, shardApplicationControllerMapping{
				ShardNumber: currentShard,
			})
		}
	}

	// If the current length of shardMappingData in the shard mapping ConfigMap is more than the number of replicas,
	// replace the mapping with the default one and let the controllers self-assign new shards.
	if len(shardMappingData) > replicas {
		shardMappingData = getDefaultShardMappingData(replicas)
	}

	if shard != -1 && shard < replicas {
		log.Debugf("update heartbeat for shard %d", shard)
		for i := range shardMappingData {
			shardMapping := shardMappingData[i]
			if shardMapping.ShardNumber == shard {
				log.Debugf("Shard found. Updating heartbeat!!")
				shardMapping.ControllerName = hostname
				shardMapping.HeartbeatTime = heartbeatCurrentTime()
				shardMappingData[i] = shardMapping
				break
			}
		}
	} else {
		// Find the shard with the matching assigned controllerName.
		for i := range shardMappingData {
			shardMapping := shardMappingData[i]
			if shardMapping.ControllerName == hostname {
				log.Debugf("Shard matched. Updating heartbeat!!")
				shard = shardMapping.ShardNumber
				shardMapping.HeartbeatTime = heartbeatCurrentTime()
				shardMappingData[i] = shardMapping
				break
			}
		}
	}

	// At this point, we have still not found a shard with a matching hostname.
	// So, find a shard with either no controller assigned or an assigned controller
	// whose heartbeat is past the threshold.
	if shard == -1 {
		for i := range shardMappingData {
			shardMapping := shardMappingData[i]
			if (shardMapping.ControllerName == "") || (metav1.Now().After(shardMapping.HeartbeatTime.Add(time.Duration(HeartbeatTimeout) * time.Second))) {
				shard = shardMapping.ShardNumber
				log.Debugf("Empty shard found %d", shard)
				shardMapping.ControllerName = hostname
				shardMapping.HeartbeatTime = heartbeatCurrentTime()
				shardMappingData[i] = shardMapping
				break
			}
		}
	}
	return shard, shardMappingData
}
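
// Illustrative sketch, not part of the upstream file: a controller whose
// hostname is absent from the mapping claims the first shard whose heartbeat
// is older than HeartbeatTimeout seconds. The controller names are hypothetical.
func exampleShardTakeover() {
	stale := metav1.NewTime(time.Now().Add(-time.Duration(HeartbeatTimeout+1) * time.Second))
	mappings := []shardApplicationControllerMapping{
		{ShardNumber: 0, ControllerName: "dead-controller", HeartbeatTime: stale},
	}
	shard, updated := getOrUpdateShardNumberForController(mappings, "new-controller", 1, -1)
	log.Infof("claimed shard %d, now owned by %s", shard, updated[shard].ControllerName)
}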

// generateDefaultShardMappingCM creates the default shard mapping ConfigMap and assigns the current controller to the given shard (0 by default).
func generateDefaultShardMappingCM(namespace, hostname string, replicas, shard int) (*v1.ConfigMap, error) {
	shardingCM := &v1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{
			Name:      common.ArgoCDAppControllerShardConfigMapName,
			Namespace: namespace,
		},
		Data: map[string]string{},
	}

	shardMappingData := getDefaultShardMappingData(replicas)

	// If a valid shard is not assigned to this controller, use shard 0.
	if shard == -1 || shard >= replicas {
		shard = 0
	}
	shardMappingData[shard].ControllerName = hostname
	shardMappingData[shard].HeartbeatTime = heartbeatCurrentTime()

	data, err := json.Marshal(shardMappingData)
	if err != nil {
		return nil, fmt.Errorf("error generating default ConfigMap: %s", err)
	}
	shardingCM.Data[ShardControllerMappingKey] = string(data)

	return shardingCM, nil
}

func getDefaultShardMappingData(replicas int) []shardApplicationControllerMapping {
	shardMappingData := make([]shardApplicationControllerMapping, 0)

	for i := 0; i < replicas; i++ {
		mapping := shardApplicationControllerMapping{
			ShardNumber: i,
		}
		shardMappingData = append(shardMappingData, mapping)
	}
	return shardMappingData
}
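
// Illustrative sketch, not part of the upstream file: the default mapping for
// two replicas contains two unclaimed entries; generateDefaultShardMappingCM
// then stamps the current hostname and heartbeat onto the requested shard.
// The namespace and hostname are hypothetical.
func exampleDefaultMapping() {
	cm, err := generateDefaultShardMappingCM("argocd", "argocd-application-controller-0", 2, 0)
	if err != nil {
		log.Fatal(err)
	}
	log.Infof("default mapping: %s", cm.Data[ShardControllerMappingKey])
}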

func GetClusterSharding(kubeClient kubernetes.Interface, settingsMgr *settings.SettingsManager, shardingAlgorithm string, enableDynamicClusterDistribution bool) (ClusterShardingCache, error) {
	var replicasCount int
	if enableDynamicClusterDistribution {
		applicationControllerName := env.StringFromEnv(common.EnvAppControllerName, common.DefaultApplicationControllerName)
		appControllerDeployment, err := kubeClient.AppsV1().Deployments(settingsMgr.GetNamespace()).Get(context.Background(), applicationControllerName, metav1.GetOptions{})

		// If the app controller deployment is not found while dynamic cluster distribution is enabled, error out.
		if err != nil {
			return nil, fmt.Errorf("(dynamic cluster distribution) failed to get app controller deployment: %v", err)
		}

		if appControllerDeployment != nil && appControllerDeployment.Spec.Replicas != nil {
			replicasCount = int(*appControllerDeployment.Spec.Replicas)
		} else {
			return nil, fmt.Errorf("(dynamic cluster distribution) failed to get app controller deployment replica count")
		}
	} else {
		replicasCount = env.ParseNumFromEnv(common.EnvControllerReplicas, 0, 0, math.MaxInt32)
	}
	shardNumber := env.ParseNumFromEnv(common.EnvControllerShard, -1, -math.MaxInt32, math.MaxInt32)
	if replicasCount > 1 {
		// Check for the shard mapping in the ConfigMap if the application controller is a Deployment;
		// otherwise use the existing logic to infer the shard from the pod name if it is a StatefulSet.
		if enableDynamicClusterDistribution {
			var err error
			// Retry if we hit a conflict while updating the shard mapping ConfigMap.
			// If we still see conflicts after the retries, wait for the next iteration of the heartbeat process.
			for i := 0; i <= common.AppControllerHeartbeatUpdateRetryCount; i++ {
				shardNumber, err = GetOrUpdateShardFromConfigMap(kubeClient, settingsMgr, replicasCount, shardNumber)
				if err == nil {
					break
				}
				if !kubeerrors.IsConflict(err) {
					err = fmt.Errorf("unable to get shard due to error updating the sharding config map: %s", err)
					break
				}
				log.Warnf("conflict when getting shard from shard mapping configMap. Retrying (%d/3)", i)
			}
			errors.CheckError(err)
		} else {
			if shardNumber < 0 {
				var err error
				shardNumber, err = InferShard()
				errors.CheckError(err)
			}
			if shardNumber >= replicasCount {
				log.Warnf("Calculated shard number %d is greater than or equal to the number of replicas. Defaulting to 0", shardNumber)
				shardNumber = 0
			}
		}
	} else {
		log.Info("Processing all cluster shards")
		shardNumber = 0
	}
	db := db.NewDB(settingsMgr.GetNamespace(), settingsMgr, kubeClient)
	return NewClusterSharding(db, shardNumber, replicasCount, shardingAlgorithm), nil
}
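
// Illustrative sketch, not part of the upstream file: a caller resolves the
// sharding cache with the round-robin algorithm and static replica discovery,
// where the replica count is read from the ARGOCD_CONTROLLER_REPLICAS
// environment variable.
func exampleGetClusterSharding(kubeClient kubernetes.Interface, settingsMgr *settings.SettingsManager) {
	sharding, err := GetClusterSharding(kubeClient, settingsMgr, common.RoundRobinShardingAlgorithm, false)
	if err != nil {
		log.Fatal(err)
	}
	_ = sharding // used by the application controller to filter clusters per shard
}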