agones.dev/agones@v1.53.0/pkg/fleets/controller_rollingupdatefix.go (about)

     1  package fleets
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  
     7  	agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
     8  	"github.com/pkg/errors"
     9  	corev1 "k8s.io/api/core/v1"
    10  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    11  	"k8s.io/apimachinery/pkg/util/intstr"
    12  	"k8s.io/utils/integer"
    13  )
    14  
    15  func (c *Controller) cleanupUnhealthyReplicasRollingUpdateFix(ctx context.Context, rest []*agonesv1.GameServerSet,
    16  	fleet *agonesv1.Fleet, maxCleanupCount int32) ([]*agonesv1.GameServerSet, int32, error) {
    17  
    18  	// Safely scale down all old GameServerSets with unhealthy replicas.
    19  	totalScaledDown := int32(0)
    20  	for i, gsSet := range rest {
    21  		healthy := gsSet.Status.ReadyReplicas + gsSet.Status.AllocatedReplicas + gsSet.Status.ReservedReplicas
    22  
    23  		if totalScaledDown >= maxCleanupCount {
    24  			break
    25  		}
    26  		if gsSet.Spec.Replicas == 0 {
    27  			// cannot scale down this replica set.
    28  			continue
    29  		}
    30  		if gsSet.Spec.Replicas <= healthy {
    31  			// no unhealthy replicas found, no scaling required.
    32  			continue
    33  		}
    34  
    35  		scaledDownCount := int32(integer.IntMin(int(maxCleanupCount-totalScaledDown), int(gsSet.Spec.Replicas-healthy)))
    36  		newReplicasCount := gsSet.Spec.Replicas - scaledDownCount
    37  		if newReplicasCount > gsSet.Spec.Replicas {
    38  			return nil, 0, fmt.Errorf("when cleaning up unhealthy replicas, got invalid request to scale down %s/%s %d -> %d", gsSet.Namespace, gsSet.Name, gsSet.Spec.Replicas, newReplicasCount)
    39  		}
    40  
    41  		gsSetCopy := gsSet.DeepCopy()
    42  		gsSetCopy.Spec.Replicas = newReplicasCount
    43  		totalScaledDown += scaledDownCount
    44  		if _, err := c.gameServerSetGetter.GameServerSets(gsSetCopy.ObjectMeta.Namespace).Update(ctx, gsSetCopy, metav1.UpdateOptions{}); err != nil {
    45  			return nil, totalScaledDown, errors.Wrapf(err, "error updating gameserverset %s", gsSetCopy.ObjectMeta.Name)
    46  		}
    47  		c.recorder.Eventf(fleet, corev1.EventTypeNormal, "ScalingGameServerSet",
    48  			"Scaling inactive GameServerSet %s from %d to %d", gsSetCopy.ObjectMeta.Name, gsSet.Spec.Replicas, gsSetCopy.Spec.Replicas)
    49  
    50  		rest[i] = gsSetCopy
    51  	}
    52  	return rest, totalScaledDown, nil
    53  }
    54  
    55  func (c *Controller) rollingUpdateRestFixedOnReadyRollingUpdateFix(ctx context.Context, fleet *agonesv1.Fleet, active *agonesv1.GameServerSet, rest []*agonesv1.GameServerSet) error {
    56  	if len(rest) == 0 {
    57  		return nil
    58  	}
    59  
    60  	// Look at Kubernetes Deployment util ResolveFenceposts() function
    61  	r, err := intstr.GetValueFromIntOrPercent(fleet.Spec.Strategy.RollingUpdate.MaxUnavailable, int(fleet.Status.ReadyReplicas), false)
    62  	if err != nil {
    63  		return errors.Wrapf(err, "error parsing MaxUnavailable value: %s", fleet.ObjectMeta.Name)
    64  	}
    65  	if r == 0 {
    66  		r = 1
    67  	}
    68  	if r > int(fleet.Spec.Replicas) {
    69  		r = int(fleet.Spec.Replicas)
    70  	}
    71  	unavailable := int32(r)
    72  
    73  	totalAlreadyScaledDown := int32(0)
    74  
    75  	totalScaleDownCount := int32(0)
    76  	// Check if we can scale down.
    77  	allGSS := rest
    78  	allGSS = append(allGSS, active)
    79  	readyReplicasCount := agonesv1.GetReadyReplicaCountForGameServerSets(allGSS)
    80  	minAvailable := fleet.Status.ReadyReplicas - unavailable
    81  	if minAvailable > fleet.Spec.Replicas {
    82  		minAvailable = fleet.Spec.Replicas
    83  	}
    84  
    85  	// Check if we are ready to scale down
    86  	newGSSUnavailablePodCount := active.Spec.Replicas - active.Status.ReadyReplicas - active.Status.ReservedReplicas -
    87  		active.Status.AllocatedReplicas - active.Status.ShutdownReplicas
    88  	maxScaledDown := readyReplicasCount - minAvailable - newGSSUnavailablePodCount
    89  
    90  	if maxScaledDown <= 0 {
    91  		return nil
    92  	}
    93  	rest, _, err = c.cleanupUnhealthyReplicasRollingUpdateFix(ctx, rest, fleet, maxScaledDown)
    94  	if err != nil {
    95  		loggerForFleet(fleet, c.baseLogger).WithField("fleet", fleet.ObjectMeta.Name).WithField("maxScaledDown", maxScaledDown).
    96  			Debug("Can not cleanup Unhealth Replicas")
    97  		// There could be the case when GameServerSet would be updated from another place, say Status or Spec would be updated
    98  		// We don't want to propagate such errors further
    99  		// And this set in sync with reconcileOldReplicaSets() Kubernetes code
   100  		return nil
   101  	}
   102  	// Resulting value is readyReplicasCount + unavailable - fleet.Spec.Replicas
   103  	totalScaleDownCount = readyReplicasCount - minAvailable
   104  	if readyReplicasCount <= minAvailable {
   105  		// Cannot scale down.
   106  		return nil
   107  	}
   108  	for _, gsSet := range rest {
   109  		if totalAlreadyScaledDown >= totalScaleDownCount {
   110  			// No further scaling required.
   111  			break
   112  		}
   113  
   114  		// Crucial fix if we are using wrong configuration of a fleet,
   115  		// that would lead to Status.Replicas being 0 but number of GameServers would be in a Scheduled or Unhealthy state.
   116  		// Compare with scaleDownOldReplicaSetsForRollingUpdate() for loop.
   117  		// if the Spec.Replicas are less than or equal to 0, then that means we are done
   118  		// scaling this GameServerSet down, and can therefore exit/move to the next one.
   119  		if gsSet.Spec.Replicas <= 0 {
   120  			continue
   121  		}
   122  
   123  		// If the Spec.Replicas does not equal the Status.Replicas for this GameServerSet, this means
   124  		// that the rolling down process is currently ongoing, and we should therefore exit so we can wait for it to finish
   125  		if gsSet.Spec.Replicas != gsSet.Status.Replicas {
   126  			break
   127  		}
   128  		gsSetCopy := gsSet.DeepCopy()
   129  		if gsSet.Status.ShutdownReplicas == 0 {
   130  			// Wait for new GameServers to become Ready before scaling down Inactive GameServerset
   131  			// Scale down.
   132  			scaleDownCount := int32(integer.IntMin(int(gsSet.Spec.Replicas), int(totalScaleDownCount-totalAlreadyScaledDown)))
   133  
   134  			newReplicasCount := gsSet.Spec.Replicas - scaleDownCount
   135  			if newReplicasCount > gsSet.Spec.Replicas {
   136  				return fmt.Errorf("when scaling down old GameServerSet, got invalid request to scale down %s/%s %d -> %d", gsSet.Namespace, gsSet.Name, gsSet.Spec.Replicas, newReplicasCount)
   137  			}
   138  
   139  			switch {
   140  			case gsSet.Status.Replicas == gsSet.Status.AllocatedReplicas:
   141  				gsSetCopy.Spec.Replicas = 0
   142  			case newReplicasCount == gsSet.Spec.Replicas:
   143  				// No updates on GameServerSet
   144  				continue
   145  			default:
   146  				gsSetCopy.Spec.Replicas = newReplicasCount
   147  			}
   148  			loggerForFleet(fleet, c.baseLogger).WithField("gameserverset", gsSet.ObjectMeta.Name).WithField("replicas", gsSetCopy.Spec.Replicas).
   149  				Debug("applying rolling update to inactive gameserverset")
   150  
   151  			if _, err := c.gameServerSetGetter.GameServerSets(gsSetCopy.ObjectMeta.Namespace).Update(ctx, gsSetCopy, metav1.UpdateOptions{}); err != nil {
   152  				return errors.Wrapf(err, "error updating gameserverset %s", gsSetCopy.ObjectMeta.Name)
   153  			}
   154  			c.recorder.Eventf(fleet, corev1.EventTypeNormal, "ScalingGameServerSet",
   155  				"Scaling inactive GameServerSet %s from %d to %d", gsSetCopy.ObjectMeta.Name, gsSet.Spec.Replicas, gsSetCopy.Spec.Replicas)
   156  
   157  			totalAlreadyScaledDown += scaleDownCount
   158  		}
   159  	}
   160  	return nil
   161  }