github.com/kubewharf/katalyst-core@v0.5.3/pkg/metaserver/kcc/manager.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package kcc 18 19 import ( 20 "context" 21 "fmt" 22 "reflect" 23 "sync" 24 "time" 25 26 apiequality "k8s.io/apimachinery/pkg/api/equality" 27 "k8s.io/apimachinery/pkg/api/meta" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 "k8s.io/apimachinery/pkg/runtime" 30 "k8s.io/apimachinery/pkg/runtime/schema" 31 "k8s.io/apimachinery/pkg/util/errors" 32 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 33 "k8s.io/apimachinery/pkg/util/wait" 34 "k8s.io/klog/v2" 35 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" 36 37 "github.com/kubewharf/katalyst-api/pkg/apis/config/v1alpha1" 38 "github.com/kubewharf/katalyst-core/pkg/client" 39 pkgconfig "github.com/kubewharf/katalyst-core/pkg/config" 40 "github.com/kubewharf/katalyst-core/pkg/config/agent" 41 "github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic" 42 "github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic/crd" 43 "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/cnc" 44 "github.com/kubewharf/katalyst-core/pkg/metrics" 45 "github.com/kubewharf/katalyst-core/pkg/util/native" 46 "github.com/kubewharf/katalyst-core/pkg/util/syntax" 47 ) 48 49 const ( 50 updateConfigInterval = 3 * time.Second 51 updateConfigJitterFactor = 0.5 52 ) 53 54 const ( 55 metricsNameUpdateConfig = "metaserver_update_config" 56 metricsNameLoadCheckpoint = "metaserver_load_checkpoint" 57 58 metricsValueStatusCheckpointNotFoundOrCorrupted = "notFoundOrCorrupted" 59 metricsValueStatusCheckpointInvalidOrExpired = "invalidOrExpired" 60 metricsValueStatusCheckpointSuccess = "success" 61 ) 62 63 const ( 64 configManagerCheckpoint = "config_manager_checkpoint" 65 ) 66 67 var ( 68 katalystConfigGVRToGVKMap = getGVRToGVKMap() 69 70 updateConfigBackoff = wait.Backoff{ 71 Duration: 5 * time.Second, 72 Factor: 2, 73 Jitter: 0.1, 74 Steps: 5, 75 Cap: 15 * time.Second, 76 } 77 ) 78 79 // ConfigurationManager is a user for ConfigurationLoader working for dynamic configuration manager 80 type ConfigurationManager interface { 81 // InitializeConfig trigger dynamic configuration initialize directly 82 InitializeConfig(ctx context.Context) error 83 // AddConfigWatcher add gvr to list which will be watched to get dynamic configuration 84 AddConfigWatcher(gvrs ...metav1.GroupVersionResource) error 85 // Run starts the main loop 86 Run(ctx context.Context) 87 } 88 89 type DummyConfigurationManager struct{} 90 91 func (d *DummyConfigurationManager) InitializeConfig(_ context.Context) error { 92 return nil 93 } 94 95 func (d *DummyConfigurationManager) AddConfigWatcher(_ ...metav1.GroupVersionResource) error { 96 return nil 97 } 98 99 func (d *DummyConfigurationManager) Run(_ context.Context) {} 100 101 var _ ConfigurationManager = &DynamicConfigManager{} 102 103 // DynamicConfigManager is to fetch dynamic config from remote 104 type DynamicConfigManager struct { 105 // defaultConfig is used to store the static configuration parsed from flags 106 // currentConfig merges default conf with dynamic conf (defined in kcc); and 107 // the dynamic conf is used as an incremental way. 108 conf *agent.AgentConfiguration 109 defaultConfig *dynamic.Configuration 110 111 // lastDynamicConfigCRD is used to record the last dynamic config CRD 112 // to avoid unnecessary update 113 lastDynamicConfigCRD *crd.DynamicConfigCRD 114 115 configLoader ConfigurationLoader 116 emitter metrics.MetricEmitter 117 118 // resourceGVRMap records those GVR that should be interested 119 // gvrToKind maps from GVR to GVK (only kind can be used to reflect objects) 120 mux sync.RWMutex 121 resourceGVRMap map[string]metav1.GroupVersionResource 122 123 // checkpoint stores recent dynamic config 124 checkpointManager checkpointmanager.CheckpointManager 125 checkpointGraceTime time.Duration 126 } 127 128 // NewDynamicConfigManager new a dynamic config manager use katalyst custom config sdk. 129 func NewDynamicConfigManager(clientSet *client.GenericClientSet, emitter metrics.MetricEmitter, 130 cncFetcher cnc.CNCFetcher, conf *pkgconfig.Configuration, 131 ) (ConfigurationManager, error) { 132 configLoader := NewKatalystCustomConfigLoader(clientSet, conf.ConfigCacheTTL, cncFetcher) 133 134 checkpointManager, err := checkpointmanager.NewCheckpointManager(conf.CheckpointManagerDir) 135 if err != nil { 136 return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err) 137 } 138 139 return &DynamicConfigManager{ 140 conf: conf.AgentConfiguration, 141 defaultConfig: deepCopy(conf.GetDynamicConfiguration()), 142 configLoader: configLoader, 143 emitter: emitter, 144 resourceGVRMap: make(map[string]metav1.GroupVersionResource), 145 checkpointManager: checkpointManager, 146 checkpointGraceTime: conf.ConfigCheckpointGraceTime, 147 }, nil 148 } 149 150 // AddConfigWatcher add gvr to list which will be watched to get dynamic configuration 151 func (c *DynamicConfigManager) AddConfigWatcher(gvrs ...metav1.GroupVersionResource) error { 152 c.mux.Lock() 153 defer c.mux.Unlock() 154 155 for _, gvr := range gvrs { 156 if oldGVR, ok := c.resourceGVRMap[gvr.Resource]; ok && gvr != oldGVR { 157 return fmt.Errorf("resource %s already reggistered by gvrs %s which is different with %s", 158 gvr.Resource, oldGVR.String(), gvr.String()) 159 } 160 161 c.resourceGVRMap[gvr.Resource] = gvr 162 } 163 164 return nil 165 } 166 167 // Run is to start update config loops until the context is done 168 func (c *DynamicConfigManager) Run(ctx context.Context) { 169 go wait.JitterUntilWithContext(ctx, func(context.Context) { 170 if err := c.tryUpdateConfig(ctx, true); err != nil { 171 klog.Errorf("try update config error: %v", err) 172 } 173 }, updateConfigInterval, updateConfigJitterFactor, true) 174 <-ctx.Done() 175 } 176 177 // InitializeConfig will try to initialize dynamic config 178 func (c *DynamicConfigManager) InitializeConfig(ctx context.Context) error { 179 err := wait.ExponentialBackoff(updateConfigBackoff, func() (bool, error) { 180 err := c.tryUpdateConfig(ctx, false) 181 if err == nil { 182 return true, nil 183 } 184 185 if c.conf.ConfigSkipFailedInitialization { 186 klog.Warningf("unable to update dynamic config: %v, fallback to default config", err) 187 return true, nil 188 } 189 190 klog.Errorf("unable to update dynamic config: %v, back off to retry", err) 191 return false, nil 192 }) 193 194 return err 195 } 196 197 func (c *DynamicConfigManager) tryUpdateConfig(ctx context.Context, skipError bool) error { 198 c.mux.RLock() 199 defer c.mux.RUnlock() 200 201 err := c.updateConfig(ctx) 202 if err != nil { 203 _ = c.emitter.StoreInt64(metricsNameUpdateConfig, 1, metrics.MetricTypeNameCount, metrics.MetricTag{ 204 Key: "status", Val: "failed", 205 }) 206 207 // return an error if skipError is false to make sure the config is correct at startup 208 if !skipError { 209 return err 210 } 211 } else { 212 _ = c.emitter.StoreInt64(metricsNameUpdateConfig, 1, metrics.MetricTypeNameCount, metrics.MetricTag{ 213 Key: "status", Val: "success", 214 }) 215 } 216 217 return nil 218 } 219 220 // updateConfig is used to get dynamic agent config from remote 221 func (c *DynamicConfigManager) updateConfig(ctx context.Context) error { 222 dynamicConfigCRD, success, err := c.updateDynamicConfig(c.resourceGVRMap, katalystConfigGVRToGVKMap, 223 func(gvr metav1.GroupVersionResource, conf interface{}) error { 224 return c.configLoader.LoadConfig(ctx, gvr, conf) 225 }, 226 ) 227 if !success { 228 return err 229 } else if apiequality.Semantic.DeepEqual(c.lastDynamicConfigCRD, dynamicConfigCRD) { 230 klog.V(4).Infof("dynamic config is not changed") 231 return nil 232 } 233 234 klog.Infof("dynamic config crd is changed from %v to %v", c.lastDynamicConfigCRD, dynamicConfigCRD) 235 currentConfig := deepCopy(c.defaultConfig) 236 applyDynamicConfig(currentConfig, dynamicConfigCRD) 237 c.conf.SetDynamicConfiguration(currentConfig) 238 c.lastDynamicConfigCRD = dynamicConfigCRD 239 return err 240 } 241 242 func (c *DynamicConfigManager) writeCheckpoint(kind string, configData reflect.Value) { 243 // read checkpoint to get config data related to other gvr 244 data, err := c.readCheckpoint() 245 if err != nil { 246 klog.Errorf("load checkpoint from %q failed: %v, try to overwrite it", configManagerCheckpoint, err) 247 _ = c.emitter.StoreInt64(metricsNameLoadCheckpoint, 1, metrics.MetricTypeNameCount, []metrics.MetricTag{ 248 {Key: "status", Val: metricsValueStatusCheckpointNotFoundOrCorrupted}, 249 {Key: "kind", Val: kind}, 250 }...) 251 } 252 253 // checkpoint doesn't exist or became corrupted, make a new checkpoint 254 if data == nil { 255 data = NewCheckpoint(make(map[string]TargetConfigData)) 256 } 257 258 // set config value and timestamp for kind 259 data.SetData(kind, configData, metav1.Now()) 260 err = c.checkpointManager.CreateCheckpoint(configManagerCheckpoint, data) 261 if err != nil { 262 klog.Errorf("failed to write checkpoint file %q: %v", configManagerCheckpoint, err) 263 } 264 } 265 266 func (c *DynamicConfigManager) readCheckpoint() (ConfigManagerCheckpoint, error) { 267 configResponses := make(map[string]TargetConfigData) 268 cp := NewCheckpoint(configResponses) 269 err := c.checkpointManager.GetCheckpoint(configManagerCheckpoint, cp) 270 if err != nil { 271 return nil, err 272 } 273 274 return cp, nil 275 } 276 277 func (c *DynamicConfigManager) updateDynamicConfig(resourceGVRMap map[string]metav1.GroupVersionResource, 278 gvrToKind map[schema.GroupVersionResource]schema.GroupVersionKind, 279 loader func(gvr metav1.GroupVersionResource, conf interface{}) error, 280 ) (*crd.DynamicConfigCRD, bool, error) { 281 dynamicConfiguration := &crd.DynamicConfigCRD{} 282 success := false 283 284 var errList []error 285 for _, gvr := range resourceGVRMap { 286 schemaGVR := native.ToSchemaGVR(gvr.Group, gvr.Version, gvr.Resource) 287 kind, ok := gvrToKind[schemaGVR] 288 if !ok { 289 errList = append(errList, fmt.Errorf("gvk of gvr %s is not found", gvr)) 290 continue 291 } 292 293 // get target dynamic config configField by kind 294 configField := reflect.ValueOf(dynamicConfiguration).Elem().FieldByName(kind.Kind) 295 296 // create a new instance of this configField type 297 newConfigData := reflect.New(configField.Type().Elem()) 298 err := loader(gvr, newConfigData.Interface()) 299 if err != nil { 300 klog.Warningf("failed to load targetConfigMeta from targetConfigMeta fetcher: %s", err) 301 // get target dynamic configField value from checkpoint 302 data, err := c.readCheckpoint() 303 if err != nil { 304 _ = c.emitter.StoreInt64(metricsNameLoadCheckpoint, 1, metrics.MetricTypeNameRaw, []metrics.MetricTag{ 305 {Key: "status", Val: metricsValueStatusCheckpointNotFoundOrCorrupted}, 306 {Key: "kind", Val: kind.Kind}, 307 }...) 308 errList = append(errList, fmt.Errorf("failed to get targetConfigMeta from checkpoint")) 309 continue 310 } else { 311 configData, timestamp := data.GetData(kind.Kind) 312 if configData.Kind() == reflect.Ptr && !configData.IsNil() && 313 time.Now().Before(timestamp.Add(c.checkpointGraceTime)) { 314 newConfigData = configData 315 klog.Infof("failed to load targetConfigMeta from remote, use local checkpoint instead") 316 _ = c.emitter.StoreInt64(metricsNameLoadCheckpoint, 1, metrics.MetricTypeNameRaw, []metrics.MetricTag{ 317 {Key: "status", Val: metricsValueStatusCheckpointSuccess}, 318 {Key: "kind", Val: kind.Kind}, 319 }...) 320 } else { 321 _ = c.emitter.StoreInt64(metricsNameLoadCheckpoint, 1, metrics.MetricTypeNameRaw, []metrics.MetricTag{ 322 {Key: "status", Val: metricsValueStatusCheckpointInvalidOrExpired}, 323 {Key: "kind", Val: kind.Kind}, 324 }...) 325 errList = append(errList, fmt.Errorf("checkpoint data for gvr %v is empty or out of date", gvr.String())) 326 continue 327 } 328 } 329 } 330 331 // set target dynamic configField by new config field 332 configField.Set(newConfigData) 333 success = true 334 c.writeCheckpoint(kind.Kind, newConfigData) 335 } 336 337 return dynamicConfiguration, success, errors.NewAggregate(errList) 338 } 339 340 func getGVRToGVKMap() map[schema.GroupVersionResource]schema.GroupVersionKind { 341 scheme := runtime.NewScheme() 342 utilruntime.Must(v1alpha1.AddToScheme(scheme)) 343 344 knownTypes := scheme.AllKnownTypes() 345 gvrToKind := make(map[schema.GroupVersionResource]schema.GroupVersionKind) 346 for kind := range knownTypes { 347 plural, singular := meta.UnsafeGuessKindToResource(kind) 348 gvrToKind[plural] = kind 349 gvrToKind[singular] = kind 350 } 351 return gvrToKind 352 } 353 354 func applyDynamicConfig(config *dynamic.Configuration, 355 dynamicConfigCRD *crd.DynamicConfigCRD, 356 ) { 357 config.ApplyConfiguration(dynamicConfigCRD) 358 } 359 360 func deepCopy(src *dynamic.Configuration) *dynamic.Configuration { 361 return syntax.DeepCopy(src).(*dynamic.Configuration) 362 }