github.com/gocrane/crane@v0.11.0/pkg/recommendation/recommender/hpa/recommend.go

github.com/gocrane/crane@v0.11.0/pkg/recommendation/recommender/hpa/recommend.go (about)

     1  package hpa
     2  
     3  import (
     4  	"encoding/json"
     5  	"fmt"
     6  	"math"
     7  	"strconv"
     8  	"time"
     9  
    10  	"github.com/montanaflynn/stats"
    11  	autoscalingv2 "k8s.io/api/autoscaling/v2beta2"
    12  	corev1 "k8s.io/api/core/v1"
    13  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    14  	"k8s.io/klog/v2"
    15  	"sigs.k8s.io/yaml"
    16  
    17  	autoscalingapi "github.com/gocrane/api/autoscaling/v1alpha1"
    18  	predictionapi "github.com/gocrane/api/prediction/v1alpha1"
    19  
    20  	"github.com/gocrane/crane/pkg/common"
    21  	"github.com/gocrane/crane/pkg/metricnaming"
    22  	"github.com/gocrane/crane/pkg/prediction/config"
    23  	"github.com/gocrane/crane/pkg/recommend/types"
    24  	"github.com/gocrane/crane/pkg/recommendation/framework"
    25  	"github.com/gocrane/crane/pkg/utils"
    26  )
    27  
// callerFormat is the fmt template for the caller identifier handed to the
// predictor. Its two %s verbs are filled with the klog.KObj reference of the
// Recommendation and the Recommendation's UID, in that order.
const callerFormat = "HPARecommendationCaller-%s-%s"
    29  
    30  func (rr *HPARecommender) PreRecommend(ctx *framework.RecommendationContext) error {
    31  	return rr.ReplicasRecommender.PreRecommend(ctx)
    32  }
    33  
    34  func (rr *HPARecommender) Recommend(ctx *framework.RecommendationContext) error {
    35  	return rr.ReplicasRecommender.Recommend(ctx)
    36  }
    37  
    38  // Policy add some logic for result of recommend phase.
    39  func (rr *HPARecommender) Policy(ctx *framework.RecommendationContext) error {
    40  	predictable := true
    41  
    42  	if len(ctx.ResultValues) != 1 {
    43  		klog.Warningf("%s: prediction metrics data is unexpected, List length is %d ", ctx.String(), len(ctx.ResultValues))
    44  		predictable = false
    45  	}
    46  
    47  	if rr.PredictableEnabled && !predictable {
    48  		return fmt.Errorf("cannot predict target")
    49  	}
    50  
    51  	minReplicas, cpuMax, percentileCpu, err := rr.GetMinReplicas(ctx)
    52  	if err != nil {
    53  		return err
    54  	}
    55  
    56  	err = rr.checkMinCpuUsageThreshold(cpuMax)
    57  	if err != nil {
    58  		return fmt.Errorf("checkMinCpuUsageThreshold failed: %v", err)
    59  	}
    60  
    61  	medianMin, medianMax, err := rr.minMaxMedians(ctx.InputValue(string(corev1.ResourceCPU)))
    62  	if err != nil {
    63  		return fmt.Errorf("minMaxMedians failed: %v", err)
    64  	}
    65  
    66  	err = rr.checkFluctuation(medianMin, medianMax)
    67  	if err != nil {
    68  		return fmt.Errorf("%s checkFluctuation failed: %v", rr.Name(), err)
    69  	}
    70  
    71  	targetUtilization, _, err := rr.proposeTargetUtilization(ctx)
    72  	if err != nil {
    73  		return fmt.Errorf("proposeTargetUtilization failed: %v", err)
    74  	}
    75  
    76  	maxReplicas, err := rr.proposeMaxReplicas(&ctx.PodTemplate, percentileCpu, targetUtilization, minReplicas)
    77  	if err != nil {
    78  		return fmt.Errorf("proposeMaxReplicas failed: %v", err)
    79  	}
    80  
    81  	defaultPredictionWindow := int32(3600)
    82  	resourceCpu := corev1.ResourceCPU
    83  
    84  	proposedEHPA := &types.EffectiveHorizontalPodAutoscalerRecommendation{
    85  		MaxReplicas: &maxReplicas,
    86  		MinReplicas: &minReplicas,
    87  		Metrics: []autoscalingv2.MetricSpec{
    88  			{
    89  				Type: autoscalingv2.ResourceMetricSourceType,
    90  				Resource: &autoscalingv2.ResourceMetricSource{
    91  					Name: resourceCpu,
    92  					Target: autoscalingv2.MetricTarget{
    93  						Type:               autoscalingv2.UtilizationMetricType,
    94  						AverageUtilization: &targetUtilization,
    95  					},
    96  				},
    97  			},
    98  		},
    99  	}
   100  
   101  	if predictable {
   102  		proposedEHPA.Prediction = &autoscalingapi.Prediction{
   103  			PredictionWindowSeconds: &defaultPredictionWindow,
   104  			PredictionAlgorithm: &autoscalingapi.PredictionAlgorithm{
   105  				AlgorithmType: predictionapi.AlgorithmTypeDSP,
   106  				DSP:           ctx.AlgorithmConfig.DSP,
   107  			},
   108  		}
   109  	}
   110  
   111  	// get metric spec from existing hpa and use them
   112  	if rr.ReferenceHpaEnabled && ctx.HPA != nil {
   113  		for _, metricSpec := range ctx.HPA.Spec.Metrics {
   114  			// don't use resource cpu, since we already configuration it before
   115  			if metricSpec.Type == autoscalingv2.ResourceMetricSourceType && metricSpec.Resource != nil && metricSpec.Resource.Name == resourceCpu {
   116  				continue
   117  			}
   118  
   119  			proposedEHPA.Metrics = append(proposedEHPA.Metrics, metricSpec)
   120  		}
   121  	}
   122  
   123  	result := types.ProposedRecommendation{
   124  		EffectiveHPA: proposedEHPA,
   125  	}
   126  
   127  	resultBytes, err := yaml.Marshal(result)
   128  	if err != nil {
   129  		return fmt.Errorf("%s marshal result failed: %v", rr.Name(), err)
   130  	}
   131  
   132  	ctx.Recommendation.Status.RecommendedValue = string(resultBytes)
   133  	if ctx.EHPA == nil {
   134  		ctx.Recommendation.Status.Action = "Create"
   135  
   136  		newEhpa := &autoscalingapi.EffectiveHorizontalPodAutoscaler{
   137  			TypeMeta: metav1.TypeMeta{
   138  				Kind:       "EffectiveHorizontalPodAutoscaler",
   139  				APIVersion: autoscalingapi.GroupVersion.String(),
   140  			},
   141  			ObjectMeta: metav1.ObjectMeta{
   142  				Namespace: ctx.Recommendation.Spec.TargetRef.Namespace,
   143  				Name:      ctx.Recommendation.Spec.TargetRef.Name,
   144  			},
   145  			Spec: autoscalingapi.EffectiveHorizontalPodAutoscalerSpec{
   146  				MinReplicas:   proposedEHPA.MinReplicas,
   147  				MaxReplicas:   *proposedEHPA.MaxReplicas,
   148  				Metrics:       proposedEHPA.Metrics,
   149  				ScaleStrategy: autoscalingapi.ScaleStrategyPreview,
   150  				Prediction:    proposedEHPA.Prediction,
   151  				ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{
   152  					Kind:       ctx.Recommendation.Spec.TargetRef.Kind,
   153  					APIVersion: ctx.Recommendation.Spec.TargetRef.APIVersion,
   154  					Name:       ctx.Recommendation.Spec.TargetRef.Name,
   155  				},
   156  			},
   157  		}
   158  
   159  		newEhpaBytes, err := json.Marshal(newEhpa)
   160  		if err != nil {
   161  			return fmt.Errorf("marshal ehpa failed %s. ", err)
   162  		}
   163  		ctx.Recommendation.Status.RecommendedInfo = string(newEhpaBytes)
   164  	} else {
   165  		ctx.Recommendation.Status.Action = "Patch"
   166  
   167  		patchEhpa := &autoscalingapi.EffectiveHorizontalPodAutoscaler{
   168  			Spec: autoscalingapi.EffectiveHorizontalPodAutoscalerSpec{
   169  				MinReplicas: proposedEHPA.MinReplicas,
   170  				MaxReplicas: *proposedEHPA.MaxReplicas,
   171  				Metrics:     proposedEHPA.Metrics,
   172  			},
   173  		}
   174  
   175  		patchEhpaBytes, err := json.Marshal(patchEhpa)
   176  		if err != nil {
   177  			return fmt.Errorf("marshal ehpa failed %s. ", err)
   178  		}
   179  		ctx.Recommendation.Status.RecommendedInfo = string(patchEhpaBytes)
   180  		ctx.Recommendation.Status.TargetRef = corev1.ObjectReference{
   181  			Namespace:  ctx.Recommendation.Spec.TargetRef.Namespace,
   182  			Name:       ctx.Recommendation.Spec.TargetRef.Name,
   183  			Kind:       "EffectiveHorizontalPodAutoscaler",
   184  			APIVersion: autoscalingapi.GroupVersion.String(),
   185  		}
   186  	}
   187  
   188  	return nil
   189  }
   190  
   191  // checkMinCpuUsageThreshold check if the max cpu for target is reach to replicas.min-cpu-usage-threshold
   192  func (rr *HPARecommender) checkMinCpuUsageThreshold(cpuMax float64) error {
   193  	klog.V(4).Infof("%s checkMinCpuUsageThreshold, cpuMax %f threshold %f", rr.Name(), cpuMax, rr.MinCpuUsageThreshold)
   194  	if cpuMax < rr.MinCpuUsageThreshold {
   195  		return fmt.Errorf("target cpuusage %f is under replicas.min-cpu-usage-threshold %f. ", cpuMax, rr.MinCpuUsageThreshold)
   196  	}
   197  
   198  	return nil
   199  }
   200  
   201  func (rr *HPARecommender) minMaxMedians(predictionTs []*common.TimeSeries) (float64, float64, error) {
   202  	// aggregate with time's hour
   203  	cpuUsagePredictionMap := make(map[int][]float64)
   204  	for _, sample := range predictionTs[0].Samples {
   205  		sampleTime := time.Unix(sample.Timestamp, 0)
   206  		if _, exist := cpuUsagePredictionMap[sampleTime.Hour()]; exist {
   207  			cpuUsagePredictionMap[sampleTime.Hour()] = append(cpuUsagePredictionMap[sampleTime.Hour()], sample.Value)
   208  		} else {
   209  			newUsageInHour := make([]float64, 0)
   210  			newUsageInHour = append(newUsageInHour, sample.Value)
   211  			cpuUsagePredictionMap[sampleTime.Hour()] = newUsageInHour
   212  		}
   213  	}
   214  
   215  	// use median to deburring data
   216  	var medianUsages []float64
   217  	for _, usageInHour := range cpuUsagePredictionMap {
   218  		medianUsage, err := stats.Median(usageInHour)
   219  		if err != nil {
   220  			return 0., 0., err
   221  		}
   222  		medianUsages = append(medianUsages, medianUsage)
   223  	}
   224  
   225  	medianMax := math.SmallestNonzeroFloat64
   226  	medianMin := math.MaxFloat64
   227  	for _, value := range medianUsages {
   228  		if value > medianMax {
   229  			medianMax = value
   230  		}
   231  
   232  		if value < medianMin {
   233  			medianMin = value
   234  		}
   235  	}
   236  
   237  	klog.V(4).Infof("%s minMaxMedians medianMax %f, medianMin %f, medianUsages %v", rr.Name(), medianMax, medianMin, medianUsages)
   238  
   239  	return medianMin, medianMax, nil
   240  }
   241  
   242  // checkFluctuation check if the time series fluctuation is reach to replicas.fluctuation-threshold
   243  func (rr *HPARecommender) checkFluctuation(medianMin, medianMax float64) error {
   244  	fluctuationThreshold, err := strconv.ParseFloat(rr.Config["fluctuation-threshold"], 64)
   245  	if err != nil {
   246  		return err
   247  	}
   248  
   249  	if medianMin == 0 {
   250  		medianMin = 0.1 // use a small value to continue calculate
   251  	}
   252  
   253  	fluctuation := medianMax / medianMin
   254  	if fluctuation < fluctuationThreshold {
   255  		return fmt.Errorf("target cpu fluctuation %f is under replicas.fluctuation-threshold %f. ", fluctuation, fluctuationThreshold)
   256  	}
   257  
   258  	return nil
   259  }
   260  
   261  // proposeTargetUtilization use the 99 percentile cpu usage to propose target utilization,
   262  // since we think if pod have reach the top usage before, maybe this is a suitable target to running.
   263  // Considering too high or too low utilization are both invalid, we will be capping target utilization finally.
   264  func (rr *HPARecommender) proposeTargetUtilization(ctx *framework.RecommendationContext) (int32, int64, error) {
   265  	percentilePredictor := ctx.PredictorMgr.GetPredictor(predictionapi.AlgorithmTypePercentile)
   266  
   267  	var cpuUsage float64
   268  	// use percentile algo to get the 99 percentile cpu usage for this target
   269  	for _, container := range ctx.PodTemplate.Spec.Containers {
   270  		caller := fmt.Sprintf(callerFormat, klog.KObj(ctx.Recommendation), ctx.Recommendation.UID)
   271  		metricNamer := metricnaming.ResourceToContainerMetricNamer(ctx.Recommendation.Spec.TargetRef.Namespace, ctx.Recommendation.Spec.TargetRef.APIVersion,
   272  			ctx.Recommendation.Spec.TargetRef.Kind, ctx.Recommendation.Spec.TargetRef.Name, container.Name, corev1.ResourceCPU, caller)
   273  		cpuConfig := &config.Config{
   274  			Percentile: &predictionapi.Percentile{
   275  				Aggregated:        true,
   276  				HistoryLength:     "168h",
   277  				SampleInterval:    "1m",
   278  				MarginFraction:    "0.15",
   279  				TargetUtilization: "1.0",
   280  				Percentile:        "0.99",
   281  				Histogram: predictionapi.HistogramConfig{
   282  					HalfLife:   "24h",
   283  					BucketSize: "0.1",
   284  					MaxValue:   "100",
   285  				},
   286  			},
   287  		}
   288  		tsList, err := utils.QueryPredictedValuesOnce(ctx.Recommendation,
   289  			percentilePredictor,
   290  			caller,
   291  			cpuConfig,
   292  			metricNamer)
   293  		if err != nil {
   294  			return 0, 0, err
   295  		}
   296  		if len(tsList) < 1 || len(tsList[0].Samples) < 1 {
   297  			return 0, 0, fmt.Errorf("no value retured for queryExpr: %s", metricNamer.BuildUniqueKey())
   298  		}
   299  		cpuUsage += tsList[0].Samples[0].Value
   300  	}
   301  
   302  	requestTotal, err := utils.CalculatePodTemplateRequests(&ctx.PodTemplate, corev1.ResourceCPU)
   303  	if err != nil {
   304  		return 0, 0, err
   305  	}
   306  
   307  	klog.V(4).Infof("propose targetUtilization, cpuUsage %f requestsPod %d", cpuUsage, requestTotal)
   308  	targetUtilization := int32(math.Ceil((cpuUsage * 1000 / float64(requestTotal)) * 100))
   309  
   310  	// capping
   311  	if targetUtilization < int32(rr.MinCpuTargetUtilization) {
   312  		targetUtilization = int32(rr.MinCpuTargetUtilization)
   313  	}
   314  
   315  	// capping
   316  	if targetUtilization > int32(rr.MaxCpuTargetUtilization) {
   317  		targetUtilization = int32(rr.MaxCpuTargetUtilization)
   318  	}
   319  
   320  	return targetUtilization, requestTotal, nil
   321  }
   322  
   323  // proposeMaxReplicas use max cpu usage to compare with target pod cpu usage to get the max replicas.
   324  func (rr *HPARecommender) proposeMaxReplicas(podTemplate *corev1.PodTemplateSpec, percentileCpu float64, targetUtilization int32, minReplicas int32) (int32, error) {
   325  	requestsPod, err := utils.CalculatePodTemplateRequests(podTemplate, corev1.ResourceCPU)
   326  	if err != nil {
   327  		return 0, err
   328  	}
   329  
   330  	klog.V(4).Infof("proposeMaxReplicas, percentileCpu %f requestsPod %d targetUtilization %d", percentileCpu, requestsPod, targetUtilization)
   331  
   332  	// request * targetUtilization is the target average cpu usage, use total p95thCpu to divide, we can get the expect max replicas.
   333  	calcMaxReplicas := (percentileCpu * 100 * 1000 * rr.MaxReplicasFactor) / float64(int32(requestsPod)*targetUtilization)
   334  	maxReplicas := int32(math.Ceil(calcMaxReplicas))
   335  
   336  	// maxReplicas should be always larger than minReplicas
   337  	if maxReplicas < minReplicas {
   338  		maxReplicas = minReplicas
   339  	}
   340  
   341  	return maxReplicas, nil
   342  }