github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/resourcemanager/fetcher/manager.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 // Package fetcher is a framework to collect resources from multiple plugins 18 // (both in-tree and out-of-tree implementations) and push contents to reporter 19 // manager to assemble and update thrugh APIServer. 20 package fetcher // import "github.com/kubewharf/katalyst-core/pkg/reportermanager/fetcher" 21 22 import ( 23 "context" 24 "fmt" 25 "sync" 26 "time" 27 28 "google.golang.org/grpc/metadata" 29 "google.golang.org/grpc/status" 30 "k8s.io/apimachinery/pkg/util/errors" 31 "k8s.io/apimachinery/pkg/util/sets" 32 "k8s.io/apimachinery/pkg/util/wait" 33 "k8s.io/klog/v2" 34 pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 35 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" 36 cpmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" 37 38 "github.com/kubewharf/katalyst-api/pkg/plugins/registration" 39 "github.com/kubewharf/katalyst-api/pkg/protocol/reporterplugin/v1alpha1" 40 "github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/fetcher/checkpoint" 41 "github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/fetcher/kubelet" 42 "github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/fetcher/plugin" 43 "github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/fetcher/system" 44 "github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/reporter" 45 "github.com/kubewharf/katalyst-core/pkg/config" 46 "github.com/kubewharf/katalyst-core/pkg/metaserver" 47 "github.com/kubewharf/katalyst-core/pkg/metrics" 48 "github.com/kubewharf/katalyst-core/pkg/util/general" 49 ) 50 51 const reporterManagerCheckpoint = "reporter_manager_checkpoint" 52 53 const ( 54 metricsNameGetContentCost = "reporter_get_content_cost" 55 metricsNameGetContentPluginCost = "reporter_get_content_plugin_cost" 56 metricsNameGenericSyncCost = "reporter_generic_sync_cost" 57 ) 58 59 // ReporterPluginManager is used to manage in-tree or out-tree reporter plugin registrations and 60 // get report content from these plugins to aggregate them into the Reporter Manager 61 type ReporterPluginManager struct { 62 // callback is used for reporting in one time call. 63 callback plugin.ListAndWatchCallback 64 65 // map pluginName to its corresponding endpoint implementation 66 mutex sync.Mutex 67 innerEndpoints sets.String 68 endpoints map[string]plugin.Endpoint 69 70 checkpointManager checkpointmanager.CheckpointManager 71 72 reporter reporter.Manager 73 emitter metrics.MetricEmitter 74 75 // reconcilePeriod is the duration between calls to sync. 76 reconcilePeriod time.Duration 77 syncFunc func(ctx context.Context) 78 79 // healthzState records last time that the corresponding module is determined as healthy. 80 healthzState sync.Map 81 } 82 83 var innerReporterPluginsDisabledByDefault = sets.NewString() 84 85 // NewReporterPluginManager creates a new reporter plugin manager. 86 func NewReporterPluginManager(reporterMgr reporter.Manager, emitter metrics.MetricEmitter, 87 metaServer *metaserver.MetaServer, conf *config.Configuration, 88 ) (*ReporterPluginManager, error) { 89 manager := &ReporterPluginManager{ 90 innerEndpoints: sets.NewString(), 91 endpoints: make(map[string]plugin.Endpoint), 92 reporter: reporterMgr, 93 emitter: emitter, 94 reconcilePeriod: conf.CollectInterval, 95 } 96 97 manager.syncFunc = manager.genericSync 98 manager.callback = manager.genericCallback 99 100 checkpointManager, err := checkpointmanager.NewCheckpointManager(conf.CheckpointManagerDir) 101 if err != nil { 102 return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err) 103 } 104 manager.checkpointManager = checkpointManager 105 106 // load remote endpoints report response information from disk. 107 err = manager.readCheckpoint() 108 if err != nil { 109 _ = emitter.StoreInt64("reporter_plugin_checkpoint_read_failed", 1, metrics.MetricTypeNameCount) 110 klog.Warningf("continue after failing to read checkpoint file. response info from reporter plugin may NOT be up-to-date. Err: %v", err) 111 } 112 113 // register inner reporter plugins 114 err = manager.registerInnerReporterPlugins(emitter, metaServer, conf, manager.genericCallback, newReporterPluginInitializers()) 115 if err != nil { 116 return nil, fmt.Errorf("get inner reporter plugin failed: %s", err) 117 } 118 119 return manager, nil 120 } 121 122 // newReporterPluginInitializers adds in-tree reporter plugins into init function list 123 func newReporterPluginInitializers() map[string]plugin.InitFunc { 124 innerReporterPluginInitializers := make(map[string]plugin.InitFunc) 125 innerReporterPluginInitializers[system.PluginName] = system.NewSystemReporterPlugin 126 innerReporterPluginInitializers[kubelet.PluginName] = kubelet.NewKubeletReporterPlugin 127 return innerReporterPluginInitializers 128 } 129 130 func (m *ReporterPluginManager) registerInnerReporterPlugins(emitter metrics.MetricEmitter, 131 metaServer *metaserver.MetaServer, conf *config.Configuration, callback plugin.ListAndWatchCallback, 132 innerReporterPluginInitializers map[string]plugin.InitFunc, 133 ) error { 134 var errList []error 135 136 for pluginName, initFn := range innerReporterPluginInitializers { 137 if !general.IsNameEnabled(pluginName, innerReporterPluginsDisabledByDefault, conf.GenericReporterConfiguration.InnerPlugins) { 138 klog.Infof("reporter plugin %s is disabled", pluginName) 139 continue 140 } 141 142 curPlugin, err := initFn(emitter, metaServer, conf, callback) 143 if err != nil { 144 errList = append(errList, err) 145 continue 146 } 147 148 err = m.registerPlugin(pluginName, curPlugin) 149 if err != nil { 150 errList = append(errList, err) 151 continue 152 } 153 154 m.innerEndpoints.Insert(pluginName) 155 } 156 157 if len(errList) > 0 { 158 return errors.NewAggregate(errList) 159 } 160 161 return nil 162 } 163 164 // GetHandlerType get manage plugin type 165 func (m *ReporterPluginManager) GetHandlerType() string { 166 return registration.ReporterPlugin 167 } 168 169 // ValidatePlugin is to validate the plugin info is supported 170 func (m *ReporterPluginManager) ValidatePlugin(pluginName string, endpoint string, versions []string) error { 171 klog.Infof("[reporter manager] get Plugin %s at Endpoint %s with versions %v", pluginName, endpoint, versions) 172 173 if !m.isVersionCompatibleWithPlugin(versions) { 174 return fmt.Errorf("reporter manager version, %s, is not among plugin supported versions %v", pluginapi.Version, versions) 175 } 176 177 return nil 178 } 179 180 // RegisterPlugin is to handle plugin register event 181 func (m *ReporterPluginManager) RegisterPlugin(pluginName, endpoint string, _ []string) error { 182 klog.Infof("[reporter manager] registering Plugin %s at Endpoint %s", pluginName, endpoint) 183 184 var cache *v1alpha1.GetReportContentResponse 185 // if the plugin is already registered, use the old cache to avoid data loss 186 // when the plugin is re-registered. 187 m.mutex.Lock() 188 old, ok := m.endpoints[pluginName] 189 m.mutex.Unlock() 190 if ok { 191 cache = old.GetCache() 192 } 193 194 e, err := plugin.NewRemoteEndpoint(endpoint, pluginName, cache, m.emitter, m.callback) 195 if err != nil { 196 return fmt.Errorf("failed to dial device plugin with socketPath %s: %v", endpoint, err) 197 } 198 199 return m.registerPlugin(pluginName, e) 200 } 201 202 // DeRegisterPlugin is to handler plugin de-register event 203 func (m *ReporterPluginManager) DeRegisterPlugin(pluginName string) { 204 m.mutex.Lock() 205 defer m.mutex.Unlock() 206 207 if e, ok := m.endpoints[pluginName]; ok { 208 e.Stop() 209 klog.Errorf("[reporter manager] reporter plugin %s has been deregistered", pluginName) 210 _ = m.emitter.StoreInt64("reporter_plugin_deregister", 1, metrics.MetricTypeNameCount, 211 metrics.ConvertMapToTags(map[string]string{ 212 "plugin": pluginName, 213 })...) 214 } 215 } 216 217 // Run start the reporter plugin manager 218 func (m *ReporterPluginManager) Run(ctx context.Context) { 219 go wait.UntilWithContext(ctx, m.syncFunc, m.reconcilePeriod) 220 221 klog.Infof("reporter plugin manager started") 222 m.reporter.Run(ctx) 223 } 224 225 func (m *ReporterPluginManager) isVersionCompatibleWithPlugin(versions []string) bool { 226 // todo: currently this is fine as we only have a single supported version. When we do need to support 227 // multiple versions in the future, we may need to extend this function to return a supported version. 228 // E.g., say kubelet supports v1beta1 and v1beta2, and we get v1alpha1 and v1beta1 from a device plugin, 229 // this function should return v1beta1 230 for _, version := range versions { 231 for _, supportedVersion := range v1alpha1.SupportedVersions { 232 if version == supportedVersion { 233 return true 234 } 235 } 236 } 237 238 return false 239 } 240 241 func (m *ReporterPluginManager) registerPlugin(pluginName string, e plugin.Endpoint) error { 242 m.registerEndpoint(pluginName, e) 243 244 success := make(chan bool) 245 246 go m.runEndpoint(pluginName, e, success) 247 248 select { 249 case pass := <-success: 250 if pass { 251 klog.Infof("plugin %s run success", pluginName) 252 return nil 253 } 254 return fmt.Errorf("failed to register plugin %s", pluginName) 255 } 256 } 257 258 func (m *ReporterPluginManager) registerEndpoint(pluginName string, e plugin.Endpoint) { 259 m.mutex.Lock() 260 defer m.mutex.Unlock() 261 262 old, ok := m.endpoints[pluginName] 263 264 if ok && !old.IsStopped() { 265 klog.Infof("stop old endpoint: %s", pluginName) 266 old.Stop() 267 } 268 269 m.endpoints[pluginName] = e 270 klog.Infof("registered plugin name %s", pluginName) 271 } 272 273 func (m *ReporterPluginManager) runEndpoint(pluginName string, e plugin.Endpoint, success chan<- bool) { 274 e.Run(success) 275 e.Stop() 276 277 _ = m.emitter.StoreInt64("reporter_plugin_unhealthy", 1, metrics.MetricTypeNameCount, 278 metrics.ConvertMapToTags(map[string]string{ 279 "plugin": pluginName, 280 })...) 281 klog.Infof("reporter plugin %s became unhealthy", pluginName) 282 } 283 284 // genericCallback is triggered by ListAndWatch of plugin implementations; 285 // the ListWatch function will store report content in Endpoint and send to manager, 286 // and the manager can read it from Endpoint cache to obtain content changes initiative 287 func (m *ReporterPluginManager) genericCallback(pluginName string, _ *v1alpha1.GetReportContentResponse) { 288 klog.Infof("genericCallback") 289 // get report content from each healthy Endpoint from cache, the last response 290 // from this plugin has been already stored to its Endpoint cache before this callback called 291 reportResponses, _ := m.getReportContent(true) 292 293 err := m.pushContents(context.Background(), reportResponses) 294 if err != nil { 295 _ = m.emitter.StoreInt64("reporter_plugin_lw_push_failed", 1, metrics.MetricTypeNameCount, []metrics.MetricTag{ 296 {Key: "plugin", Val: pluginName}, 297 }...) 298 klog.Errorf("report plugin %s in callback failed with error: %v", pluginName, err) 299 } 300 } 301 302 func (m *ReporterPluginManager) pushContents(ctx context.Context, reportResponses map[string]*v1alpha1.GetReportContentResponse) error { 303 if err := m.writeCheckpoint(reportResponses); err != nil { 304 klog.Errorf("writing checkpoint encountered %v", err) 305 } 306 307 return m.reporter.PushContents(ctx, reportResponses) 308 } 309 310 // genericSync periodically calls the Get function to obtain content changes 311 func (m *ReporterPluginManager) genericSync(ctx context.Context) { 312 klog.Infof("genericSync") 313 314 begin := time.Now() 315 defer func() { 316 costs := time.Since(begin) 317 klog.InfoS("finished genericSync", "costs", costs) 318 _ = m.emitter.StoreInt64(metricsNameGenericSyncCost, costs.Microseconds(), metrics.MetricTypeNameRaw) 319 }() 320 321 // clear unhealthy plugin periodically 322 m.clearUnhealthyPlugin() 323 324 // get report content from each healthy Endpoint directly 325 reportResponses, _ := m.getReportContent(false) 326 327 pushErr := m.pushContents(ctx, reportResponses) 328 if pushErr != nil { 329 _ = m.emitter.StoreInt64("reporter_plugin_sync_push_failed", 1, metrics.MetricTypeNameCount) 330 klog.Errorf("report plugin failed with error: %v", pushErr) 331 } 332 } 333 334 // clearUnhealthyPlugin is to clear stopped plugins from cache which exceeded grace period 335 func (m *ReporterPluginManager) clearUnhealthyPlugin() { 336 m.mutex.Lock() 337 defer m.mutex.Unlock() 338 339 for pluginName, e := range m.endpoints { 340 if e.StopGracePeriodExpired() { 341 delete(m.endpoints, pluginName) 342 343 klog.Warningf("plugin %s has been clear", pluginName) 344 _ = m.emitter.StoreInt64("reporter_plugin_clear", 1, metrics.MetricTypeNameCount, 345 metrics.ConvertMapToTags(map[string]string{ 346 "plugin": pluginName, 347 })...) 348 } 349 } 350 } 351 352 // getReportContent is to get reportContent from plugins. if cacheFirst is true, 353 // use plugin cache (when it is no nil), otherwise we call plugin directly. 354 func (m *ReporterPluginManager) getReportContent(cacheFirst bool) (map[string]*v1alpha1.GetReportContentResponse, error) { 355 reportResponses := make(map[string]*v1alpha1.GetReportContentResponse) 356 errList := make([]error, 0) 357 358 begin := time.Now() 359 m.mutex.Lock() 360 defer func() { 361 m.mutex.Unlock() 362 costs := time.Since(begin) 363 klog.InfoS("finished getReportContent cnr", "costs", costs) 364 _ = m.emitter.StoreInt64(metricsNameGetContentCost, costs.Microseconds(), metrics.MetricTypeNameRaw) 365 }() 366 367 // get report content from each Endpoint 368 for pluginName, e := range m.endpoints { 369 var ( 370 resp *v1alpha1.GetReportContentResponse 371 err error 372 ) 373 374 // if cacheFirst is false or cache response is nil, we will try to get report content directly from plugin 375 if cacheFirst { 376 cache := e.GetCache() 377 if cache != nil { 378 reportResponses[pluginName] = cache 379 continue 380 } 381 } 382 383 ctx := metadata.NewOutgoingContext(context.Background(), metadata.New(nil)) 384 epBegin := time.Now() 385 resp, err = e.GetReportContent(ctx) 386 epCosts := time.Since(epBegin) 387 klog.InfoS("GetReportContent", "costs", epCosts, "pluginName", pluginName) 388 _ = m.emitter.StoreInt64(metricsNameGetContentPluginCost, epCosts.Microseconds(), metrics.MetricTypeNameRaw, []metrics.MetricTag{{Key: "plugin", Val: pluginName}}...) 389 if err != nil { 390 errList = append(errList, err) 391 s, _ := status.FromError(err) 392 _ = m.emitter.StoreInt64("reporter_plugin_get_content_failed", 1, metrics.MetricTypeNameCount, []metrics.MetricTag{ 393 {Key: "code", Val: s.Code().String()}, 394 {Key: "plugin", Val: pluginName}, 395 }...) 396 397 klog.Errorf("GetReportContentResponse from %s Endpoint failed with error: %v", pluginName, err) 398 // if it gets report content failed, uses cached response 399 resp = e.GetCache() 400 } 401 402 reportResponses[pluginName] = resp 403 } 404 405 return reportResponses, errors.NewAggregate(errList) 406 } 407 408 func (m *ReporterPluginManager) writeCheckpoint(reportResponses map[string]*v1alpha1.GetReportContentResponse) error { 409 remoteResponses := make(map[string]*v1alpha1.GetReportContentResponse, 0) 410 // only write remote endpoint response to checkpoint 411 for name, response := range reportResponses { 412 if m.innerEndpoints.Has(name) { 413 continue 414 } 415 remoteResponses[name] = response 416 } 417 data := checkpoint.New(remoteResponses) 418 err := m.checkpointManager.CreateCheckpoint(reporterManagerCheckpoint, data) 419 if err != nil { 420 _ = m.emitter.StoreInt64("reporter_plugin_checkpoint_write_failed", 1, metrics.MetricTypeNameCount) 421 return fmt.Errorf("failed to write checkpoint file %q: %v", reporterManagerCheckpoint, err) 422 } 423 return nil 424 } 425 426 func (m *ReporterPluginManager) readCheckpoint() error { 427 reportResponses := make(map[string]*v1alpha1.GetReportContentResponse, 0) 428 cp := checkpoint.New(reportResponses) 429 err := m.checkpointManager.GetCheckpoint(reporterManagerCheckpoint, cp) 430 if err != nil { 431 if err == cpmerrors.ErrCheckpointNotFound { 432 klog.Warningf("failed to retrieve checkpoint for %q: %v", reporterManagerCheckpoint, err) 433 return nil 434 } 435 return err 436 } 437 reportResponses = cp.GetData() 438 m.mutex.Lock() 439 defer m.mutex.Unlock() 440 for name, response := range reportResponses { 441 // During start up, creates stopped remote endpoint so that the report content 442 // will stay zero till the corresponding device plugin re-registers. 443 m.endpoints[name] = plugin.NewStoppedRemoteEndpoint(name, response) 444 } 445 return nil 446 }