github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/resource-recommend/processor/percentile/processor.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package percentile
    18  
    19  import (
    20  	"context"
    21  	"runtime/debug"
    22  	"sync"
    23  	"time"
    24  
    25  	"github.com/pkg/errors"
    26  	"golang.org/x/time/rate"
    27  	"k8s.io/apimachinery/pkg/types"
    28  	"k8s.io/client-go/util/workqueue"
    29  	"k8s.io/klog/v2"
    30  	"sigs.k8s.io/controller-runtime/pkg/client"
    31  
    32  	"github.com/kubewharf/katalyst-core/pkg/controller/resource-recommend/datasource"
    33  	"github.com/kubewharf/katalyst-core/pkg/controller/resource-recommend/processor"
    34  	"github.com/kubewharf/katalyst-core/pkg/controller/resource-recommend/processor/percentile/task"
    35  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    36  	"github.com/kubewharf/katalyst-core/pkg/util/resource-recommend/log"
    37  	datasourcetypes "github.com/kubewharf/katalyst-core/pkg/util/resource-recommend/types/datasource"
    38  	errortypes "github.com/kubewharf/katalyst-core/pkg/util/resource-recommend/types/error"
    39  	processortypes "github.com/kubewharf/katalyst-core/pkg/util/resource-recommend/types/processor"
    40  )
    41  
    42  const (
    43  	ProcessorName = "percentile"
    44  	// DefaultConcurrentTaskNum is num of default concurrent task
    45  	DefaultConcurrentTaskNum      = 100
    46  	DefaultPercentile             = 0.9
    47  	DefaultGarbageCollectInterval = 1 * time.Hour
    48  	ExceptionRequeueBaseDelay     = time.Minute
    49  	ExceptionRequeueMaxDelay      = 30 * time.Minute
    50  )
    51  
// Processor is the percentile processor. It keeps the registered percentile
// tasks and the bookkeeping that ties each task to the ResourceRecommend and
// Metric it was registered for.
type Processor struct {
	// mutex is held by Register and Cancel while they read or modify
	// ResourceRecommendTaskIDsMap.
	mutex sync.Mutex

	client.Client

	// DatasourceProxy is the datasource proxy handed to NewProcessor.
	// NOTE(review): presumably used when tasks fetch metric samples; it is not
	// used in the code visible here — confirm against the task runner.
	DatasourceProxy *datasource.Proxy

	// TaskQueue receives the ID of every task newly stored by Register.
	TaskQueue workqueue.RateLimitingInterface

	// AggregateTasks maps a task ID to the task created for it by Register.
	AggregateTasks *sync.Map

	// Stores taskID corresponding to Metrics in the ResourceRecommend
	ResourceRecommendTaskIDsMap map[types.NamespacedName]*map[datasourcetypes.Metric]processortypes.TaskID
}
    66  
    67  var DefaultQueueRateLimiter = workqueue.NewMaxOfRateLimiter(
    68  	workqueue.NewItemExponentialFailureRateLimiter(ExceptionRequeueBaseDelay, ExceptionRequeueMaxDelay),
    69  	// 10 qps, 100 bucket size.  This is only for retry speed and its only the overall factor (not per item)
    70  	&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
    71  )
    72  
    73  func NewProcessor(datasourceProxy *datasource.Proxy, c client.Client) processor.Processor {
    74  	return &Processor{
    75  		DatasourceProxy:             datasourceProxy,
    76  		TaskQueue:                   workqueue.NewNamedRateLimitingQueue(DefaultQueueRateLimiter, ProcessorName),
    77  		Client:                      c,
    78  		AggregateTasks:              &sync.Map{},
    79  		ResourceRecommendTaskIDsMap: make(map[types.NamespacedName]*map[datasourcetypes.Metric]processortypes.TaskID),
    80  	}
    81  }
    82  
    83  func (p *Processor) Register(processConfig *processortypes.ProcessConfig) (cErr *errortypes.CustomError) {
    84  	defer func() {
    85  		if cErr != nil {
    86  			klog.ErrorS(cErr, "Percentile task register failed", "ResourceRecommend", processConfig.ResourceRecommendNamespacedName)
    87  		}
    88  		if r := recover(); r != nil {
    89  			errMsg := "percentile process register panic"
    90  			klog.ErrorS(r.(error), errMsg, "stack", string(debug.Stack()))
    91  			cErr = errortypes.RegisterProcessTaskPanic()
    92  		}
    93  	}()
    94  
    95  	if err := processConfig.Validate(); err != nil {
    96  		return errortypes.RegisterProcessTaskValidateError(err)
    97  	}
    98  
    99  	taskID := processConfig.GenerateTaskID()
   100  
   101  	// Check whether a task has been registered and avoid repeated registration
   102  	_, ok := p.AggregateTasks.Load(taskID)
   103  	if ok {
   104  		klog.V(4).InfoS("The Percentile Processor task already registered", "processConfig", general.StructToString(processConfig))
   105  		return nil
   106  	}
   107  
   108  	p.mutex.Lock()
   109  	defer p.mutex.Unlock()
   110  
   111  	klog.InfoS("Register Percentile Processor Task", "processConfig", general.StructToString(processConfig))
   112  
   113  	metric := *processConfig.Metric
   114  
   115  	t, err := task.NewTask(metric, processConfig.Config)
   116  	if err != nil {
   117  		cErr := errortypes.NewProcessTaskError(err)
   118  		return cErr
   119  	}
   120  
   121  	_, loaded := p.AggregateTasks.LoadOrStore(taskID, t)
   122  	if !loaded {
   123  		p.TaskQueue.Add(taskID)
   124  	}
   125  
   126  	// Record the taskID corresponding to the Metric with the same ResourceRecommendID into ResourceRecommendTaskIDsMap
   127  	// To get the taskID from the ResourceRecommendID and Metric
   128  	if tasks, ok := p.ResourceRecommendTaskIDsMap[processConfig.ResourceRecommendNamespacedName]; !ok {
   129  		p.ResourceRecommendTaskIDsMap[processConfig.ResourceRecommendNamespacedName] = &map[datasourcetypes.Metric]processortypes.TaskID{
   130  			metric: taskID,
   131  		}
   132  	} else {
   133  		if existingTaskID, exist := (*tasks)[metric]; exist {
   134  			if existingTaskID == taskID {
   135  				return nil
   136  			}
   137  			// existingTaskID != taskID means that the config of the task has changed.
   138  			// Need to delete the old task of config
   139  			p.AggregateTasks.Delete(existingTaskID)
   140  		}
   141  		(*tasks)[metric] = taskID
   142  	}
   143  
   144  	return nil
   145  }
   146  
   147  func (p *Processor) Cancel(processKey *processortypes.ProcessKey) (cErr *errortypes.CustomError) {
   148  	if processKey == nil {
   149  		return nil
   150  	}
   151  
   152  	defer func() {
   153  		if cErr != nil {
   154  			klog.ErrorS(cErr, "Percentile task cancel failed", "ResourceRecommend", processKey.ResourceRecommendNamespacedName)
   155  		}
   156  		if r := recover(); r != nil {
   157  			errMsg := "percentile process cancel panic"
   158  			klog.ErrorS(r.(error), errMsg, "stack", string(debug.Stack()))
   159  			cErr = errortypes.CancelProcessTaskPanic()
   160  		}
   161  	}()
   162  
   163  	p.mutex.Lock()
   164  	defer p.mutex.Unlock()
   165  
   166  	tasks, ok := p.ResourceRecommendTaskIDsMap[processKey.ResourceRecommendNamespacedName]
   167  	if !ok {
   168  		klog.InfoS("Cancel task failed, percentile process task not found", "ResourceRecommend",
   169  			processKey.ResourceRecommendNamespacedName)
   170  		return errortypes.NotFoundTasksError(processKey.ResourceRecommendNamespacedName)
   171  	}
   172  	if processKey.Metric == nil {
   173  		klog.InfoS("delete percentile process tasks", "processConfig", processKey)
   174  		for _, taskID := range *tasks {
   175  			p.AggregateTasks.Delete(taskID)
   176  		}
   177  		delete(p.ResourceRecommendTaskIDsMap, processKey.ResourceRecommendNamespacedName)
   178  	} else {
   179  		if taskID, exist := (*tasks)[*processKey.Metric]; exist {
   180  			klog.InfoS("percentile process task delete for", "processConfig", processKey)
   181  			p.AggregateTasks.Delete(taskID)
   182  			delete(*tasks, *processKey.Metric)
   183  		} else {
   184  			klog.InfoS("task for metric cannot be found, don't deleted", "processConfig", processKey, "metric", processKey.Metric)
   185  		}
   186  		if tasks == nil || len(*tasks) == 0 {
   187  			delete(p.ResourceRecommendTaskIDsMap, processKey.ResourceRecommendNamespacedName)
   188  		}
   189  	}
   190  
   191  	return nil
   192  }
   193  
   194  func (p *Processor) Run(ctx context.Context) {
   195  	log.InfoS(ctx, "percentile processor starting")
   196  
   197  	// Get task from queue and run it
   198  	go p.ProcessTasks(ctx)
   199  
   200  	// Garbage collect every hour. Clearing timeout or no attribution task
   201  	go p.GarbageCollector(ctx)
   202  
   203  	log.InfoS(ctx, "percentile processor running")
   204  
   205  	<-ctx.Done()
   206  
   207  	log.InfoS(ctx, "percentile processor end")
   208  }
   209  
   210  func (p *Processor) QueryProcessedValues(processKey *processortypes.ProcessKey) (float64, error) {
   211  	t, err := p.getTaskForProcessKey(processKey)
   212  	if err != nil {
   213  		return 0, errors.Wrapf(err, "internal err, process task not found")
   214  	}
   215  	percentileValue, err := t.QueryPercentileValue(NewContext(), DefaultPercentile)
   216  	if err != nil {
   217  		return 0, err
   218  	}
   219  	return percentileValue, nil
   220  }