github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/resource-recommend/processor/percentile/processor.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package percentile 18 19 import ( 20 "context" 21 "runtime/debug" 22 "sync" 23 "time" 24 25 "github.com/pkg/errors" 26 "golang.org/x/time/rate" 27 "k8s.io/apimachinery/pkg/types" 28 "k8s.io/client-go/util/workqueue" 29 "k8s.io/klog/v2" 30 "sigs.k8s.io/controller-runtime/pkg/client" 31 32 "github.com/kubewharf/katalyst-core/pkg/controller/resource-recommend/datasource" 33 "github.com/kubewharf/katalyst-core/pkg/controller/resource-recommend/processor" 34 "github.com/kubewharf/katalyst-core/pkg/controller/resource-recommend/processor/percentile/task" 35 "github.com/kubewharf/katalyst-core/pkg/util/general" 36 "github.com/kubewharf/katalyst-core/pkg/util/resource-recommend/log" 37 datasourcetypes "github.com/kubewharf/katalyst-core/pkg/util/resource-recommend/types/datasource" 38 errortypes "github.com/kubewharf/katalyst-core/pkg/util/resource-recommend/types/error" 39 processortypes "github.com/kubewharf/katalyst-core/pkg/util/resource-recommend/types/processor" 40 ) 41 42 const ( 43 ProcessorName = "percentile" 44 // DefaultConcurrentTaskNum is num of default concurrent task 45 DefaultConcurrentTaskNum = 100 46 DefaultPercentile = 0.9 47 DefaultGarbageCollectInterval = 1 * time.Hour 48 ExceptionRequeueBaseDelay = time.Minute 49 ExceptionRequeueMaxDelay = 30 * time.Minute 50 ) 51 52 type Processor struct { 53 mutex sync.Mutex 54 55 client.Client 56 57 DatasourceProxy *datasource.Proxy 58 59 TaskQueue workqueue.RateLimitingInterface 60 61 AggregateTasks *sync.Map 62 63 // Stores taskID corresponding to Metrics in the ResourceRecommend 64 ResourceRecommendTaskIDsMap map[types.NamespacedName]*map[datasourcetypes.Metric]processortypes.TaskID 65 } 66 67 var DefaultQueueRateLimiter = workqueue.NewMaxOfRateLimiter( 68 workqueue.NewItemExponentialFailureRateLimiter(ExceptionRequeueBaseDelay, ExceptionRequeueMaxDelay), 69 // 10 qps, 100 bucket size. This is only for retry speed and its only the overall factor (not per item) 70 &workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, 71 ) 72 73 func NewProcessor(datasourceProxy *datasource.Proxy, c client.Client) processor.Processor { 74 return &Processor{ 75 DatasourceProxy: datasourceProxy, 76 TaskQueue: workqueue.NewNamedRateLimitingQueue(DefaultQueueRateLimiter, ProcessorName), 77 Client: c, 78 AggregateTasks: &sync.Map{}, 79 ResourceRecommendTaskIDsMap: make(map[types.NamespacedName]*map[datasourcetypes.Metric]processortypes.TaskID), 80 } 81 } 82 83 func (p *Processor) Register(processConfig *processortypes.ProcessConfig) (cErr *errortypes.CustomError) { 84 defer func() { 85 if cErr != nil { 86 klog.ErrorS(cErr, "Percentile task register failed", "ResourceRecommend", processConfig.ResourceRecommendNamespacedName) 87 } 88 if r := recover(); r != nil { 89 errMsg := "percentile process register panic" 90 klog.ErrorS(r.(error), errMsg, "stack", string(debug.Stack())) 91 cErr = errortypes.RegisterProcessTaskPanic() 92 } 93 }() 94 95 if err := processConfig.Validate(); err != nil { 96 return errortypes.RegisterProcessTaskValidateError(err) 97 } 98 99 taskID := processConfig.GenerateTaskID() 100 101 // Check whether a task has been registered and avoid repeated registration 102 _, ok := p.AggregateTasks.Load(taskID) 103 if ok { 104 klog.V(4).InfoS("The Percentile Processor task already registered", "processConfig", general.StructToString(processConfig)) 105 return nil 106 } 107 108 p.mutex.Lock() 109 defer p.mutex.Unlock() 110 111 klog.InfoS("Register Percentile Processor Task", "processConfig", general.StructToString(processConfig)) 112 113 metric := *processConfig.Metric 114 115 t, err := task.NewTask(metric, processConfig.Config) 116 if err != nil { 117 cErr := errortypes.NewProcessTaskError(err) 118 return cErr 119 } 120 121 _, loaded := p.AggregateTasks.LoadOrStore(taskID, t) 122 if !loaded { 123 p.TaskQueue.Add(taskID) 124 } 125 126 // Record the taskID corresponding to the Metric with the same ResourceRecommendID into ResourceRecommendTaskIDsMap 127 // To get the taskID from the ResourceRecommendID and Metric 128 if tasks, ok := p.ResourceRecommendTaskIDsMap[processConfig.ResourceRecommendNamespacedName]; !ok { 129 p.ResourceRecommendTaskIDsMap[processConfig.ResourceRecommendNamespacedName] = &map[datasourcetypes.Metric]processortypes.TaskID{ 130 metric: taskID, 131 } 132 } else { 133 if existingTaskID, exist := (*tasks)[metric]; exist { 134 if existingTaskID == taskID { 135 return nil 136 } 137 // existingTaskID != taskID means that the config of the task has changed. 138 // Need to delete the old task of config 139 p.AggregateTasks.Delete(existingTaskID) 140 } 141 (*tasks)[metric] = taskID 142 } 143 144 return nil 145 } 146 147 func (p *Processor) Cancel(processKey *processortypes.ProcessKey) (cErr *errortypes.CustomError) { 148 if processKey == nil { 149 return nil 150 } 151 152 defer func() { 153 if cErr != nil { 154 klog.ErrorS(cErr, "Percentile task cancel failed", "ResourceRecommend", processKey.ResourceRecommendNamespacedName) 155 } 156 if r := recover(); r != nil { 157 errMsg := "percentile process cancel panic" 158 klog.ErrorS(r.(error), errMsg, "stack", string(debug.Stack())) 159 cErr = errortypes.CancelProcessTaskPanic() 160 } 161 }() 162 163 p.mutex.Lock() 164 defer p.mutex.Unlock() 165 166 tasks, ok := p.ResourceRecommendTaskIDsMap[processKey.ResourceRecommendNamespacedName] 167 if !ok { 168 klog.InfoS("Cancel task failed, percentile process task not found", "ResourceRecommend", 169 processKey.ResourceRecommendNamespacedName) 170 return errortypes.NotFoundTasksError(processKey.ResourceRecommendNamespacedName) 171 } 172 if processKey.Metric == nil { 173 klog.InfoS("delete percentile process tasks", "processConfig", processKey) 174 for _, taskID := range *tasks { 175 p.AggregateTasks.Delete(taskID) 176 } 177 delete(p.ResourceRecommendTaskIDsMap, processKey.ResourceRecommendNamespacedName) 178 } else { 179 if taskID, exist := (*tasks)[*processKey.Metric]; exist { 180 klog.InfoS("percentile process task delete for", "processConfig", processKey) 181 p.AggregateTasks.Delete(taskID) 182 delete(*tasks, *processKey.Metric) 183 } else { 184 klog.InfoS("task for metric cannot be found, don't deleted", "processConfig", processKey, "metric", processKey.Metric) 185 } 186 if tasks == nil || len(*tasks) == 0 { 187 delete(p.ResourceRecommendTaskIDsMap, processKey.ResourceRecommendNamespacedName) 188 } 189 } 190 191 return nil 192 } 193 194 func (p *Processor) Run(ctx context.Context) { 195 log.InfoS(ctx, "percentile processor starting") 196 197 // Get task from queue and run it 198 go p.ProcessTasks(ctx) 199 200 // Garbage collect every hour. Clearing timeout or no attribution task 201 go p.GarbageCollector(ctx) 202 203 log.InfoS(ctx, "percentile processor running") 204 205 <-ctx.Done() 206 207 log.InfoS(ctx, "percentile processor end") 208 } 209 210 func (p *Processor) QueryProcessedValues(processKey *processortypes.ProcessKey) (float64, error) { 211 t, err := p.getTaskForProcessKey(processKey) 212 if err != nil { 213 return 0, errors.Wrapf(err, "internal err, process task not found") 214 } 215 percentileValue, err := t.QueryPercentileValue(NewContext(), DefaultPercentile) 216 if err != nil { 217 return 0, err 218 } 219 return percentileValue, nil 220 }