github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package kubelet 18 19 import ( 20 "context" 21 "encoding/json" 22 "fmt" 23 "sync" 24 "time" 25 26 info "github.com/google/cadvisor/info/v1" 27 "github.com/pkg/errors" 28 "go.uber.org/atomic" 29 "k8s.io/klog/v2" 30 apiconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" 31 32 nodev1alpha1 "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" 33 "github.com/kubewharf/katalyst-api/pkg/protocol/reporterplugin/v1alpha1" 34 "github.com/kubewharf/katalyst-api/pkg/utils" 35 "github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/fetcher/kubelet/topology" 36 "github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/fetcher/plugin" 37 "github.com/kubewharf/katalyst-core/pkg/config" 38 "github.com/kubewharf/katalyst-core/pkg/metaserver" 39 "github.com/kubewharf/katalyst-core/pkg/metrics" 40 "github.com/kubewharf/katalyst-core/pkg/util" 41 "github.com/kubewharf/katalyst-core/pkg/util/kubelet/podresources" 42 "github.com/kubewharf/katalyst-core/pkg/util/process" 43 ) 44 45 const ( 46 // PluginName is name of kubelet reporter plugin 47 PluginName = "kubelet-reporter-plugin" 48 ) 49 50 // kubeletPlugin implements the endpoint interface, and it's an in-tree reporter plugin 51 type kubeletPlugin struct { 52 mutex sync.RWMutex 53 54 ctx context.Context 55 cancel context.CancelFunc 56 57 // conf is used to indicate the file path and name for system data in the future 58 // currently, it's not used todo: implement this logic 59 conf *config.Configuration 60 61 topologyStatusAdapter topology.Adapter 62 63 // cb since kubeletPlugin needs to call updateContent whenever the topology changes, 64 // it needs a corresponding callback function 65 cb plugin.ListAndWatchCallback 66 67 // notifierCh channel sent by topology adapter to trigger ListAndWatch send to 68 // manager 69 notifierCh chan struct{} 70 71 latestReportContentResponse atomic.Value 72 73 *process.StopControl 74 emitter metrics.MetricEmitter 75 metaServer *metaserver.MetaServer 76 } 77 78 // NewKubeletReporterPlugin creates a kubelet reporter plugin 79 func NewKubeletReporterPlugin(emitter metrics.MetricEmitter, metaServer *metaserver.MetaServer, 80 conf *config.Configuration, callback plugin.ListAndWatchCallback, 81 ) (plugin.ReporterPlugin, error) { 82 ctx, cancel := context.WithCancel(context.Background()) 83 p := &kubeletPlugin{ 84 emitter: emitter, 85 metaServer: metaServer, 86 conf: conf, 87 notifierCh: make(chan struct{}, 10), 88 ctx: ctx, 89 cancel: cancel, 90 cb: callback, 91 StopControl: process.NewStopControl(time.Time{}), 92 } 93 94 topologyStatusAdapter, err := topology.NewPodResourcesServerTopologyAdapter(metaServer, conf.QoSConfiguration, 95 conf.PodResourcesServerEndpoints, conf.KubeletResourcePluginPaths, conf.ResourceNameToZoneTypeMap, 96 nil, p.getNumaInfo, topology.GenericPodResourcesFilter(conf.QoSConfiguration), podresources.GetV1Client, 97 conf.NeedValidationResources) 98 if err != nil { 99 return nil, err 100 } 101 102 p.topologyStatusAdapter = topologyStatusAdapter 103 104 return p, nil 105 } 106 107 func (p *kubeletPlugin) Name() string { 108 return PluginName 109 } 110 111 func (p *kubeletPlugin) Run(success chan<- bool) { 112 err := p.topologyStatusAdapter.Run(p.ctx, p.topologyStatusChangeHandler) 113 if err != nil { 114 klog.Fatalf("run topology status adapter failed: %v", err) 115 return 116 } 117 success <- true 118 119 for { 120 select { 121 case _, ok := <-p.notifierCh: 122 if !ok { 123 klog.Infof("plugin %s has been stopped", PluginName) 124 return 125 } 126 127 resp, err := p.getReportContent(p.ctx) 128 if err != nil { 129 klog.Errorf("plugin %s failed to get report content with error %v", PluginName, err) 130 continue 131 } 132 133 p.ListAndWatchReportContentCallback(PluginName, resp) 134 case <-p.ctx.Done(): 135 klog.Infof("plugin %s has been stopped", PluginName) 136 return 137 } 138 } 139 } 140 141 func (p *kubeletPlugin) GetReportContent(ctx context.Context) (*v1alpha1.GetReportContentResponse, error) { 142 return p.getReportContent(ctx) 143 } 144 145 func (p *kubeletPlugin) ListAndWatchReportContentCallback(pluginName string, response *v1alpha1.GetReportContentResponse) { 146 p.setCache(response) 147 148 p.cb(pluginName, response) 149 } 150 151 func (p *kubeletPlugin) GetCache() *v1alpha1.GetReportContentResponse { 152 resp := p.latestReportContentResponse.Load() 153 if resp == nil { 154 return nil 155 } 156 157 return resp.(*v1alpha1.GetReportContentResponse) 158 } 159 160 // Stop to cancel all context and close notifierCh 161 func (p *kubeletPlugin) Stop() { 162 p.mutex.Lock() 163 defer p.mutex.Unlock() 164 165 p.cancel() 166 close(p.notifierCh) 167 168 p.StopControl.Stop() 169 } 170 171 // topologyStatusChangeHandler is called by topology adapter when topology status changes 172 func (p *kubeletPlugin) topologyStatusChangeHandler() { 173 p.mutex.RLock() 174 defer p.mutex.RUnlock() 175 176 select { 177 case p.notifierCh <- struct{}{}: 178 klog.Infof("send topology change notification to plugin %s", PluginName) 179 default: 180 klog.Warningf("plugin %s is busy, skip topology change notification", PluginName) 181 } 182 } 183 184 func (p *kubeletPlugin) setCache(resp *v1alpha1.GetReportContentResponse) { 185 p.latestReportContentResponse.Store(resp) 186 } 187 188 // getReportContent get report content from all collectors 189 func (p *kubeletPlugin) getReportContent(ctx context.Context) (*v1alpha1.GetReportContentResponse, error) { 190 reportContent, err := p.getTopologyStatusContent(ctx) 191 if err != nil { 192 return nil, err 193 } 194 195 return &v1alpha1.GetReportContentResponse{ 196 Content: reportContent, 197 }, nil 198 } 199 200 // getTopologyStatusContent get topology status content from topologyStatusAdapter 201 func (p *kubeletPlugin) getTopologyStatusContent(ctx context.Context) ([]*v1alpha1.ReportContent, error) { 202 topologyStatus, err := p.topologyStatusAdapter.GetTopologyZones(ctx) 203 if err != nil { 204 return nil, errors.Wrap(err, "get numa topology status from adapter failed") 205 } 206 207 value, err := json.Marshal(&topologyStatus) 208 if err != nil { 209 return nil, errors.Wrap(err, "marshal topology status failed") 210 } 211 212 topologyStatusContent := []*v1alpha1.ReportContent{ 213 { 214 GroupVersionKind: &util.CNRGroupVersionKind, 215 Field: []*v1alpha1.ReportField{ 216 { 217 FieldType: v1alpha1.FieldType_Status, 218 FieldName: util.CNRFieldNameTopologyZone, 219 Value: value, 220 }, 221 }, 222 }, 223 } 224 225 if p.conf.EnableReportTopologyPolicy { 226 content, err := p.getTopologyPolicyReportContent(ctx) 227 if err != nil { 228 return nil, errors.Wrap(err, "get topology policy report content failed") 229 } 230 topologyStatusContent = append(topologyStatusContent, content) 231 } 232 233 return topologyStatusContent, nil 234 } 235 236 func (p *kubeletPlugin) getNumaInfo() ([]info.Node, error) { 237 if p.metaServer == nil || p.metaServer.MachineInfo == nil { 238 return nil, fmt.Errorf("get metaserver machine info is nil") 239 } 240 return p.metaServer.MachineInfo.Topology, nil 241 } 242 243 func (p *kubeletPlugin) getTopologyPolicyReportContent(ctx context.Context) (*v1alpha1.ReportContent, error) { 244 var ( 245 topologyPolicy nodev1alpha1.TopologyPolicy 246 err error 247 ) 248 249 if p.reportOrmTopologyPolicy() { 250 // report orm topology policy only if orm is explicitly enabled in the configuration. 251 topologyPolicy = utils.GenerateTopologyPolicy(p.conf.TopologyPolicyName, apiconfig.ContainerTopologyManagerScope) 252 } else { 253 topologyPolicy, err = p.topologyStatusAdapter.GetTopologyPolicy(ctx) 254 if err != nil { 255 return nil, errors.Wrap(err, "get topology policy from adapter failed") 256 } 257 } 258 259 valueTopologyPolicy, err := json.Marshal(&topologyPolicy) 260 if err != nil { 261 return nil, errors.Wrap(err, "marshal topology policy failed") 262 } 263 264 return &v1alpha1.ReportContent{ 265 GroupVersionKind: &util.CNRGroupVersionKind, 266 Field: []*v1alpha1.ReportField{ 267 { 268 FieldType: v1alpha1.FieldType_Status, 269 FieldName: util.CNRFieldNameTopologyPolicy, 270 Value: valueTopologyPolicy, 271 }, 272 }, 273 }, nil 274 } 275 276 func (p *kubeletPlugin) reportOrmTopologyPolicy() bool { 277 if p.conf.TopologyPolicyName == "" { 278 return false 279 } 280 281 return true 282 }