github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/resourcemanager/reporter/cnr/cnrreporter.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package cnr 18 19 import ( 20 "context" 21 "fmt" 22 "reflect" 23 "sync" 24 "time" 25 26 apiequality "k8s.io/apimachinery/pkg/api/equality" 27 apierrors "k8s.io/apimachinery/pkg/api/errors" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 "k8s.io/apimachinery/pkg/util/errors" 30 "k8s.io/apimachinery/pkg/util/sets" 31 "k8s.io/apimachinery/pkg/util/wait" 32 "k8s.io/klog/v2" 33 34 nodev1alpha1 "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" 35 clientset "github.com/kubewharf/katalyst-api/pkg/client/clientset/versioned" 36 "github.com/kubewharf/katalyst-api/pkg/protocol/reporterplugin/v1alpha1" 37 "github.com/kubewharf/katalyst-core/pkg/agent/resourcemanager/reporter" 38 "github.com/kubewharf/katalyst-core/pkg/client" 39 "github.com/kubewharf/katalyst-core/pkg/client/control" 40 "github.com/kubewharf/katalyst-core/pkg/config" 41 "github.com/kubewharf/katalyst-core/pkg/metaserver" 42 metaservercnr "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/cnr" 43 "github.com/kubewharf/katalyst-core/pkg/metrics" 44 "github.com/kubewharf/katalyst-core/pkg/util" 45 "github.com/kubewharf/katalyst-core/pkg/util/general" 46 "github.com/kubewharf/katalyst-core/pkg/util/syntax" 47 ) 48 49 const ( 50 cnrReporterName = "cnr-reporter" 51 52 // cnrUpdateMaxRetryTimes update cnr retry time. 53 cnrUpdateMaxRetryTimes = 3 54 ) 55 56 const ( 57 refreshLatestCNRJitterFactor = 0.5 58 ) 59 60 const ( 61 metricsNameRefreshCNRCost = "refresh_cnr_cost" 62 metricsNameUpdateCNRCost = "update_cnr_cost" 63 metricsNameUpdateCNRSpecMetadataCost = "update_cnr_spec_metadata_cost" 64 metricsNameUpdateCNRStatusCost = "update_cnr_status_cost" 65 ) 66 67 // cnrReporterImpl is to report cnr content to remote 68 type cnrReporterImpl struct { 69 cnrName string 70 71 // defaultLabels contains the default config for CNR created by reporter 72 defaultLabels map[string]string 73 // latestUpdatedCNR is used as an in-memory cache for CNR; 74 // whenever CNR info is needed, get from this cache firstly 75 latestUpdatedCNR *nodev1alpha1.CustomNodeResource 76 mux sync.Mutex 77 78 notifiers map[string]metaservercnr.CNRNotifier 79 80 client clientset.Interface 81 updater control.CNRControl 82 emitter metrics.MetricEmitter 83 84 mergeValueFunc syntax.MergeValueFunc 85 86 refreshLatestCNRPeriod time.Duration 87 } 88 89 // NewCNRReporter create a cnr reporter 90 func NewCNRReporter(genericClient *client.GenericClientSet, metaServer *metaserver.MetaServer, 91 emitter metrics.MetricEmitter, conf *config.Configuration, 92 ) (reporter.Reporter, error) { 93 c := &cnrReporterImpl{ 94 cnrName: conf.NodeName, 95 refreshLatestCNRPeriod: conf.RefreshLatestCNRPeriod, 96 defaultLabels: conf.DefaultCNRLabels, 97 notifiers: make(map[string]metaservercnr.CNRNotifier), 98 emitter: emitter, 99 client: genericClient.InternalClient, 100 updater: control.NewCNRControlImpl(genericClient.InternalClient), 101 } 102 // register itself as a resource reporter in meta-server 103 metaServer.SetCNRFetcher(c) 104 105 c.mergeValueFunc = syntax.SimpleMergeTwoValues 106 return c, nil 107 } 108 109 // Run start cnr reporter 110 func (c *cnrReporterImpl) Run(ctx context.Context) { 111 go wait.JitterUntilWithContext(ctx, c.refreshLatestCNR, c.refreshLatestCNRPeriod, refreshLatestCNRJitterFactor, true) 112 <-ctx.Done() 113 } 114 115 // GetCNR tries to return local cache if exists, otherwise get from APIServer 116 117 func (c *cnrReporterImpl) GetCNR(ctx context.Context) (*nodev1alpha1.CustomNodeResource, error) { 118 cnr := c.latestUpdatedCNR.DeepCopy() 119 if cnr != nil { 120 return cnr, nil 121 } 122 123 return c.client.NodeV1alpha1().CustomNodeResources().Get(ctx, c.cnrName, metav1.GetOptions{ResourceVersion: "0"}) 124 } 125 126 // Update is to update remote cnr according to reported fields 127 func (c *cnrReporterImpl) Update(ctx context.Context, fields []*v1alpha1.ReportField) error { 128 beginWithLock := time.Now() 129 c.mux.Lock() 130 beginWithoutLock := time.Now() 131 132 defer func() { 133 costs := time.Since(beginWithoutLock) 134 klog.InfoS("finished update cnr without lock", "costs", costs) 135 136 c.mux.Unlock() 137 138 costs = time.Since(beginWithLock) 139 klog.InfoS("finished update cnr with lock", "costs", costs) 140 _ = c.emitter.StoreInt64(metricsNameUpdateCNRCost, costs.Microseconds(), metrics.MetricTypeNameRaw) 141 }() 142 143 if klog.V(4).Enabled() { 144 for _, f := range fields { 145 klog.Infof("field name %s/%s with value %s", f.FieldType, f.FieldName, string(f.Value)) 146 } 147 } 148 149 for i := 0; i < cnrUpdateMaxRetryTimes; i++ { 150 if err := c.tryUpdateCNR(ctx, fields, i); err != nil { 151 klog.Errorf("error updating cnr, will retry: %v", err) 152 } else { 153 return nil 154 } 155 } 156 157 return fmt.Errorf("attempt to update cnr failed with total retries of %d", cnrUpdateMaxRetryTimes) 158 } 159 160 // RegisterNotifier register a notifier to cnr reporter 161 func (c *cnrReporterImpl) RegisterNotifier(name string, notifier metaservercnr.CNRNotifier) error { 162 c.mux.Lock() 163 defer c.mux.Unlock() 164 165 if _, ok := c.notifiers[name]; ok { 166 return fmt.Errorf("notifier %s already exists", name) 167 } 168 169 c.notifiers[name] = notifier 170 return nil 171 } 172 173 // UnregisterNotifier unregister a notifier from cnr reporter 174 func (c *cnrReporterImpl) UnregisterNotifier(name string) error { 175 c.mux.Lock() 176 defer c.mux.Unlock() 177 178 if _, ok := c.notifiers[name]; !ok { 179 return fmt.Errorf("notifier %s not exists", name) 180 } 181 182 delete(c.notifiers, name) 183 return nil 184 } 185 186 // refreshLatestCNR get latest cnr from remote, because cnr in cache may not have been updated. 187 func (c *cnrReporterImpl) refreshLatestCNR(ctx context.Context) { 188 c.mux.Lock() 189 defer c.mux.Unlock() 190 191 begin := time.Now() 192 defer func() { 193 costs := time.Since(begin) 194 klog.Infof("finished refresh cnr (%v)", costs) 195 _ = c.emitter.StoreInt64(metricsNameRefreshCNRCost, costs.Microseconds(), metrics.MetricTypeNameRaw) 196 }() 197 198 cnr, err := c.client.NodeV1alpha1().CustomNodeResources().Get(ctx, c.cnrName, metav1.GetOptions{ResourceVersion: "0"}) 199 if err == nil { 200 c.latestUpdatedCNR = cnr.DeepCopy() 201 } else if !c.resetCNRIfNeeded(err) { 202 klog.Errorf("refresh local cnr cache failed with error: %v", err) 203 } 204 } 205 206 // tryUpdateCNR update cnr according reported fields, first update cnr try will use cached latestUpdatedCNR, 207 // if there are some errors such as conflict happened, it will retry by getting cnr from api server 208 func (c *cnrReporterImpl) tryUpdateCNR(ctx context.Context, fields []*v1alpha1.ReportField, tryIdx int) error { 209 var ( 210 cnr *nodev1alpha1.CustomNodeResource 211 err error 212 ) 213 214 // only get cnr from api server iff latest updated cnr is nil or tryIdx > 0 215 if c.latestUpdatedCNR == nil || tryIdx > 0 { 216 c.countMetricsWithBaseTags("reporter_update_retry") 217 218 cnr, err = c.client.NodeV1alpha1().CustomNodeResources().Get(ctx, c.cnrName, metav1.GetOptions{ResourceVersion: "0"}) 219 if err != nil && !apierrors.IsNotFound(err) { 220 c.countMetricsWithBaseTags("reporter_update_get_failed") 221 if c.resetCNRIfNeeded(err) { 222 return nil 223 } 224 return err 225 } 226 227 // NotFound to create cnr 228 if err != nil { 229 cnr, err = c.createCNR(ctx, fields) 230 if err != nil { 231 c.countMetricsWithBaseTags("reporter_update_failed") 232 return fmt.Errorf("create cnr failed: %s", err) 233 } 234 } 235 236 c.latestUpdatedCNR = cnr.DeepCopy() 237 } else { 238 cnr = c.latestUpdatedCNR.DeepCopy() 239 } 240 241 if cnr == nil { 242 return fmt.Errorf("nil %q cnr object", c.cnrName) 243 } 244 245 originCNR := cnr.DeepCopy() 246 err = setCNR(cnr, fields, c.mergeValueFunc) 247 if err != nil { 248 return err 249 } 250 251 // todo: consider whether we need to handle update error automatically 252 // i.e. use queue to push and pop those failed items 253 254 // try patch spec and metadata first, because the update of cnr will change the ResourceVersion in ObjectMeta 255 originCNR, err = c.tryUpdateCNRSpecAndMetadata(ctx, originCNR, cnr) 256 if err != nil && !c.resetCNRIfNeeded(err) { 257 return err 258 } else if err != nil { 259 originCNR = c.latestUpdatedCNR.DeepCopy() 260 } 261 262 _, err = c.tryUpdateCNRStatus(ctx, originCNR, cnr) 263 if err != nil { 264 return err 265 } 266 267 return nil 268 } 269 270 func (c *cnrReporterImpl) tryUpdateCNRSpecAndMetadata(ctx context.Context, 271 originCNR, currentCNR *nodev1alpha1.CustomNodeResource, 272 ) (*nodev1alpha1.CustomNodeResource, error) { 273 var ( 274 cnr *nodev1alpha1.CustomNodeResource 275 err error 276 ) 277 278 if cnrSpecHasChanged(&originCNR.Spec, ¤tCNR.Spec) || cnrMetadataHasChanged(&originCNR.ObjectMeta, ¤tCNR.ObjectMeta) { 279 klog.Infof("cnr spec or metadata changed, try to patch it") 280 281 begin := time.Now() 282 defer func() { 283 costs := time.Since(begin) 284 klog.Infof("finished update cnr spec and metadata (%v)", costs) 285 _ = c.emitter.StoreInt64(metricsNameUpdateCNRSpecMetadataCost, costs.Microseconds(), metrics.MetricTypeNameRaw) 286 }() 287 288 // patch cnr spec and metadata 289 cnr, err = c.updater.PatchCNRSpecAndMetadata(ctx, c.cnrName, originCNR, currentCNR) 290 if err != nil { 291 c.countMetricsWithBaseTags("reporter_update", 292 metrics.ConvertMapToTags(map[string]string{ 293 "field": "spec", 294 "status": "failed", 295 })...) 296 return nil, err 297 } 298 299 c.countMetricsWithBaseTags("reporter_update", 300 metrics.ConvertMapToTags(map[string]string{ 301 "field": "spec", 302 "status": "success", 303 })...) 304 305 klog.Infof("patch cnr spec and metadata success\n old cnr spec: %#v, metadata: %#v,\n "+ 306 "new cnr spec: %#v, metadata: %#v", 307 originCNR.Spec, originCNR.ObjectMeta, cnr.Spec, cnr.ObjectMeta) 308 c.latestUpdatedCNR = cnr.DeepCopy() 309 310 // notify cnr spec and metadata update 311 for _, notifier := range c.notifiers { 312 notifier.OnCNRUpdate(cnr) 313 } 314 } else { 315 return originCNR, nil 316 } 317 318 return cnr, nil 319 } 320 321 func (c *cnrReporterImpl) tryUpdateCNRStatus(ctx context.Context, 322 originCNR, currentCNR *nodev1alpha1.CustomNodeResource, 323 ) (*nodev1alpha1.CustomNodeResource, error) { 324 var ( 325 cnr *nodev1alpha1.CustomNodeResource 326 err error 327 ) 328 329 if cnrStatusHasChanged(&originCNR.Status, ¤tCNR.Status) { 330 klog.Infof("cnr status changed, try to patch it") 331 332 begin := time.Now() 333 defer func() { 334 costs := time.Since(begin) 335 klog.Infof("finished update cnr status (%v)", costs) 336 _ = c.emitter.StoreInt64(metricsNameUpdateCNRStatusCost, costs.Microseconds(), metrics.MetricTypeNameRaw) 337 }() 338 339 // patch cnr status 340 cnr, err = c.updater.PatchCNRStatus(ctx, c.cnrName, originCNR, currentCNR) 341 if err != nil { 342 c.countMetricsWithBaseTags("reporter_update", 343 metrics.ConvertMapToTags(map[string]string{ 344 "field": "status", 345 "status": "failed", 346 })...) 347 return nil, err 348 } 349 350 c.countMetricsWithBaseTags("reporter_update", 351 metrics.ConvertMapToTags(map[string]string{ 352 "field": "status", 353 "status": "success", 354 })...) 355 356 klog.Infof("patch cnr status success old status: %#v,\n new status: %#v", originCNR.Status, cnr.Status) 357 c.latestUpdatedCNR = cnr.DeepCopy() 358 359 // notify cnr status update 360 for _, notifier := range c.notifiers { 361 notifier.OnCNRStatusUpdate(cnr) 362 } 363 } else { 364 return originCNR, nil 365 } 366 367 return cnr, nil 368 } 369 370 // resetCNRIfNeeded reset cnr if unmarshal type error, it will initialize 371 // local cnr cache to make sure the content of cnr always is true 372 // todo if $ref is supported in CRD, we can skip this since api-server will help with validations 373 func (c *cnrReporterImpl) resetCNRIfNeeded(err error) bool { 374 if general.IsUnmarshalTypeError(err) { 375 c.latestUpdatedCNR = c.defaultCNR() 376 klog.Infof("success re-initialize local cnr cache") 377 return true 378 } 379 380 return false 381 } 382 383 func (c *cnrReporterImpl) defaultCNR() *nodev1alpha1.CustomNodeResource { 384 return &nodev1alpha1.CustomNodeResource{ 385 ObjectMeta: metav1.ObjectMeta{ 386 Name: c.cnrName, 387 Labels: c.defaultLabels, 388 }, 389 } 390 } 391 392 func (c *cnrReporterImpl) createCNR(ctx context.Context, fields []*v1alpha1.ReportField) (*nodev1alpha1.CustomNodeResource, error) { 393 cnr := c.defaultCNR() 394 395 err := setCNR(cnr, fields, c.mergeValueFunc) 396 if err != nil { 397 return nil, fmt.Errorf("set cnr failed: %s", err) 398 } 399 400 klog.Infof("try to create cnr: %#v", cnr) 401 402 cnr, err = c.client.NodeV1alpha1().CustomNodeResources().Create(ctx, cnr, metav1.CreateOptions{}) 403 if err != nil { 404 return cnr, err 405 } 406 407 return cnr, nil 408 } 409 410 func setCNR(cnr *nodev1alpha1.CustomNodeResource, fields []*v1alpha1.ReportField, 411 mergeFunc func(src reflect.Value, dst reflect.Value) error, 412 ) error { 413 var errList []error 414 initializedFields := sets.String{} 415 for _, f := range fields { 416 if f == nil { 417 continue 418 } 419 420 // initialize need report cnr field first 421 if !initializedFields.Has(f.FieldName) { 422 err := initializeFieldToCNR(cnr, *f) 423 if err != nil { 424 errList = append(errList, err) 425 continue 426 } 427 428 initializedFields.Insert(f.FieldName) 429 } 430 431 // parse report field to cnr by merge function 432 _, err := parseReportFieldToCNR(cnr, *f, mergeFunc) 433 if err != nil { 434 errList = append(errList, err) 435 continue 436 } 437 } 438 439 if len(errList) > 0 { 440 return errors.NewAggregate(errList) 441 } 442 443 if err := reviseCNR(cnr); err != nil { 444 return err 445 } 446 447 return nil 448 } 449 450 // reviseCNR revises the field of cnr to make sure it is not redundant 451 func reviseCNR(cnr *nodev1alpha1.CustomNodeResource) error { 452 if cnr == nil { 453 return nil 454 } 455 456 // merge all topology zones 457 cnr.Status.TopologyZone = util.MergeTopologyZone(nil, cnr.Status.TopologyZone) 458 return nil 459 } 460 461 func (c *cnrReporterImpl) countMetricsWithBaseTags(key string, tags ...metrics.MetricTag) { 462 tags = append(tags, 463 metrics.ConvertMapToTags(map[string]string{ 464 "reporterName": cnrReporterName, 465 })...) 466 467 _ = c.emitter.StoreInt64(key, 1, metrics.MetricTypeNameCount, tags...) 468 } 469 470 // initializeFieldToCNR initialize cnr fields to nil 471 func initializeFieldToCNR(cnr *nodev1alpha1.CustomNodeResource, field v1alpha1.ReportField) error { 472 // get need report value of cnr 473 originValue, err := getCNRField(cnr, field) 474 if err != nil { 475 return err 476 } 477 478 originValue.Set(reflect.New(originValue.Type()).Elem()) 479 return nil 480 } 481 482 // parseReportFieldToCNR parse reportField and merge to origin cnr by mergeFunc 483 func parseReportFieldToCNR(cnr *nodev1alpha1.CustomNodeResource, reportField v1alpha1.ReportField, 484 mergeFunc func(src reflect.Value, dst reflect.Value) error, 485 ) (*nodev1alpha1.CustomNodeResource, error) { 486 if cnr == nil { 487 return nil, fmt.Errorf("cnr is nil") 488 } 489 490 // get need report value of cnr 491 originValue, err := getCNRField(cnr, reportField) 492 if err != nil { 493 return nil, err 494 } 495 496 // parse report value to base field type 497 reportValue, err := syntax.ParseBytesByType(reportField.Value, originValue.Type()) 498 if err != nil || !reportValue.IsValid() { 499 return nil, fmt.Errorf("report %s with value %s is invald with err: %s", reportField.FieldName, string(reportField.Value), err) 500 } 501 502 err = mergeFunc(reportValue, originValue) 503 if err != nil { 504 return nil, err 505 } 506 507 return cnr, nil 508 } 509 510 // getCNRField only support to parse first-level fields in cnr now; 511 // todo: support to parse nested fields in the future. 512 func getCNRField(cnr *nodev1alpha1.CustomNodeResource, reportField v1alpha1.ReportField) (reflect.Value, error) { 513 var el reflect.Value 514 switch reportField.FieldType { 515 case v1alpha1.FieldType_Status: 516 el = reflect.ValueOf(&cnr.Status) 517 case v1alpha1.FieldType_Spec: 518 el = reflect.ValueOf(&cnr.Spec) 519 case v1alpha1.FieldType_Metadata: 520 el = reflect.ValueOf(cnr) 521 default: 522 return reflect.Value{}, fmt.Errorf("not support field type %s", reportField.FieldType) 523 } 524 525 if el.Kind() == reflect.Ptr { 526 el = el.Elem() 527 } 528 529 // find origin value by field name 530 field := el.FieldByName(reportField.FieldName) 531 if !field.IsValid() { 532 return reflect.Value{}, fmt.Errorf("field %s is invalid", reportField.FieldName) 533 } 534 535 return field, nil 536 } 537 538 func cnrMetadataHasChanged(originMeta *metav1.ObjectMeta, meta *metav1.ObjectMeta) bool { 539 return !apiequality.Semantic.DeepEqual(originMeta, meta) 540 } 541 542 func cnrSpecHasChanged(originSpec *nodev1alpha1.CustomNodeResourceSpec, spec *nodev1alpha1.CustomNodeResourceSpec) bool { 543 return !apiequality.Semantic.DeepEqual(originSpec, spec) 544 } 545 546 func cnrStatusHasChanged(originStatus *nodev1alpha1.CustomNodeResourceStatus, status *nodev1alpha1.CustomNodeResourceStatus) bool { 547 return !apiequality.Semantic.DeepEqual(originStatus, status) 548 }