github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/pkg/lorry/dcs/k8s.go (about) 1 /* 2 Copyright (C) 2022-2023 ApeCloud Co., Ltd 3 4 This file is part of KubeBlocks project 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU Affero General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU Affero General Public License for more details. 15 16 You should have received a copy of the GNU Affero General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 package dcs 21 22 import ( 23 "context" 24 "encoding/json" 25 "fmt" 26 "os" 27 "strconv" 28 "time" 29 30 "github.com/go-logr/logr" 31 32 "github.com/pkg/errors" 33 corev1 "k8s.io/api/core/v1" 34 apierrors "k8s.io/apimachinery/pkg/api/errors" 35 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 36 "k8s.io/apimachinery/pkg/labels" 37 "k8s.io/client-go/kubernetes" 38 "k8s.io/client-go/kubernetes/scheme" 39 "k8s.io/client-go/rest" 40 ctrl "sigs.k8s.io/controller-runtime" 41 "sigs.k8s.io/controller-runtime/pkg/client/apiutil" 42 43 appsv1alpha1 "github.com/1aal/kubeblocks/apis/apps/v1alpha1" 44 "github.com/1aal/kubeblocks/pkg/constant" 45 k8s "github.com/1aal/kubeblocks/pkg/lorry/util/kubernetes" 46 viper "github.com/1aal/kubeblocks/pkg/viperx" 47 ) 48 49 type KubernetesStore struct { 50 ctx context.Context 51 clusterName string 52 componentName string 53 clusterCompName string 54 currentMemberName string 55 namespace string 56 cluster *Cluster 57 client *rest.RESTClient 58 clientset *kubernetes.Clientset 59 LeaderObservedTime int64 60 logger logr.Logger 61 } 62 63 func NewKubernetesStore() (*KubernetesStore, error) { 64 ctx := context.Background() 65 logger := ctrl.Log.WithName("DCS-K8S") 66 clientset, err := k8s.GetClientSet() 67 if err != nil { 68 err = errors.Wrap(err, "clientset init failed") 69 return nil, err 70 } 71 client, err := k8s.GetRESTClientForKB() 72 if err != nil { 73 err = errors.Wrap(err, "restclient init failed") 74 return nil, err 75 } 76 77 clusterName := os.Getenv(constant.KBEnvClusterName) 78 if clusterName == "" { 79 return nil, errors.New("KB_CLUSTER_NAME must be set") 80 } 81 82 componentName := os.Getenv(constant.KBEnvComponentName) 83 if componentName == "" { 84 return nil, errors.New("KB_CCMP_NAME must be set") 85 } 86 87 clusterCompName := os.Getenv(constant.KBEnvClusterCompName) 88 if clusterCompName == "" { 89 return nil, errors.New("KB_CLUSTER_COMP_NAME must be set") 90 } 91 92 currentMemberName := os.Getenv(constant.KBEnvPodName) 93 if clusterName == "" { 94 return nil, errors.New("KB_POD_NAME must be set") 95 } 96 97 namespace := os.Getenv(constant.KBEnvNamespace) 98 if namespace == "" { 99 return nil, errors.New("KB_NAMESPACE must be set") 100 } 101 102 store := &KubernetesStore{ 103 ctx: ctx, 104 clusterName: clusterName, 105 componentName: componentName, 106 clusterCompName: clusterCompName, 107 currentMemberName: currentMemberName, 108 namespace: namespace, 109 client: client, 110 clientset: clientset, 111 logger: logger, 112 } 113 return store, err 114 } 115 116 func (store *KubernetesStore) Initialize(cluster *Cluster) error { 117 store.logger.Info("k8s store initializing") 118 _, err := store.GetCluster() 119 if err != nil { 120 return err 121 } 122 123 err = store.CreateHaConfig(cluster) 124 if err != nil { 125 store.logger.Error(err, "Create Ha ConfigMap failed") 126 } 127 128 err = store.CreateLease() 129 if err != nil { 130 store.logger.Error(err, "Create Leader ConfigMap failed") 131 } 132 return err 133 } 134 135 func (store *KubernetesStore) GetClusterName() string { 136 return store.clusterName 137 } 138 139 func (store *KubernetesStore) GetClusterFromCache() *Cluster { 140 if store.cluster != nil { 141 return store.cluster 142 } 143 cluster, _ := store.GetCluster() 144 return cluster 145 } 146 147 func (store *KubernetesStore) GetCluster() (*Cluster, error) { 148 clusterResource := &appsv1alpha1.Cluster{} 149 err := store.client.Get(). 150 Namespace(store.namespace). 151 Resource("clusters"). 152 Name(store.clusterName). 153 VersionedParams(&metav1.GetOptions{}, scheme.ParameterCodec). 154 Do(store.ctx). 155 Into(clusterResource) 156 // store.logger.Debugf("cluster resource: %v", clusterResource) 157 if err != nil { 158 store.logger.Error(err, "k8s get cluster error") 159 return nil, err 160 } 161 162 var replicas int32 163 for _, component := range clusterResource.Spec.ComponentSpecs { 164 if component.Name == store.componentName { 165 replicas = component.Replicas 166 break 167 } 168 } 169 170 members, err := store.GetMembers() 171 if err != nil { 172 store.logger.Info("get members error", "error", err) 173 } 174 175 leader, err := store.GetLeader() 176 if err != nil { 177 store.logger.Info("get leader error", "error", err) 178 } 179 180 switchover, err := store.GetSwitchover() 181 if err != nil { 182 store.logger.Info("get switchover error", "error", err) 183 } 184 185 haConfig, err := store.GetHaConfig() 186 if err != nil { 187 store.logger.Info("get HaConfig error", "error", err) 188 } 189 190 cluster := &Cluster{ 191 ClusterCompName: store.clusterCompName, 192 Namespace: store.namespace, 193 Replicas: replicas, 194 Members: members, 195 Leader: leader, 196 Switchover: switchover, 197 HaConfig: haConfig, 198 resource: clusterResource, 199 } 200 201 store.cluster = cluster 202 return cluster, nil 203 } 204 205 func (store *KubernetesStore) GetMembers() ([]Member, error) { 206 labelsMap := map[string]string{ 207 constant.AppInstanceLabelKey: store.clusterName, 208 constant.AppManagedByLabelKey: "kubeblocks", 209 constant.KBAppComponentLabelKey: store.componentName, 210 } 211 212 selector := labels.SelectorFromSet(labelsMap) 213 store.logger.Info(fmt.Sprintf("pod selector: %s", selector.String())) 214 podList, err := store.clientset.CoreV1().Pods(store.namespace).List(context.TODO(), metav1.ListOptions{LabelSelector: selector.String()}) 215 if err != nil { 216 return nil, err 217 } 218 219 store.logger.Info(fmt.Sprintf("podlist: %d", len(podList.Items))) 220 members := make([]Member, len(podList.Items)) 221 for i, pod := range podList.Items { 222 member := &members[i] 223 member.Name = pod.Name 224 // member.Name = fmt.Sprintf("%s.%s-headless.%s.svc", pod.Name, store.clusterCompName, store.namespace) 225 member.Role = pod.Labels["app.kubernetes.io/role"] 226 member.PodIP = pod.Status.PodIP 227 member.DBPort = getDBPort(&pod) 228 member.LorryPort = getLorryPort(&pod) 229 member.UID = string(pod.UID) 230 member.resource = pod.DeepCopy() 231 } 232 233 return members, nil 234 } 235 236 func (store *KubernetesStore) ResetCluster() {} 237 func (store *KubernetesStore) DeleteCluster() {} 238 239 func (store *KubernetesStore) GetLeaderConfigMap() (*corev1.ConfigMap, error) { 240 leaderName := store.getLeaderName() 241 leaderConfigMap, err := store.clientset.CoreV1().ConfigMaps(store.namespace).Get(store.ctx, leaderName, metav1.GetOptions{}) 242 if err != nil { 243 if apierrors.IsNotFound(err) { 244 store.logger.Info("Leader configmap is not found", "configmap", leaderName) 245 return nil, nil 246 } 247 store.logger.Error(err, "Get Leader configmap failed") 248 } 249 return leaderConfigMap, err 250 } 251 252 func (store *KubernetesStore) IsLeaseExist() (bool, error) { 253 leaderConfigMap, err := store.GetLeaderConfigMap() 254 appCluster, ok := store.cluster.resource.(*appsv1alpha1.Cluster) 255 if leaderConfigMap != nil && ok && leaderConfigMap.CreationTimestamp.Before(&appCluster.CreationTimestamp) { 256 store.logger.Info("A previous leader configmap resource exists, delete it", "name", leaderConfigMap.Name) 257 _ = store.DeleteLeader() 258 return false, nil 259 } 260 return leaderConfigMap != nil, err 261 } 262 263 func (store *KubernetesStore) CreateLease() error { 264 isExist, err := store.IsLeaseExist() 265 if isExist || err != nil { 266 return err 267 } 268 269 leaderConfigMapName := store.getLeaderName() 270 leaderName := store.currentMemberName 271 now := time.Now().Unix() 272 nowStr := strconv.FormatInt(now, 10) 273 ttl := viper.GetString(constant.KBEnvTTL) 274 leaderConfigMap := &corev1.ConfigMap{ 275 ObjectMeta: metav1.ObjectMeta{ 276 Name: leaderConfigMapName, 277 Annotations: map[string]string{ 278 "leader": leaderName, 279 "acquire-time": nowStr, 280 "renew-time": nowStr, 281 "ttl": ttl, 282 "extra": "", 283 }, 284 }, 285 } 286 287 store.logger.Info(fmt.Sprintf("K8S store initializing, create leader ConfigMap: %s", leaderConfigMapName)) 288 err = store.createConfigMap(leaderConfigMap) 289 if err != nil { 290 store.logger.Error(err, "Create Leader ConfigMap failed") 291 return err 292 } 293 return nil 294 } 295 296 func (store *KubernetesStore) GetLeader() (*Leader, error) { 297 configmap, err := store.GetLeaderConfigMap() 298 if err != nil { 299 return nil, err 300 } 301 302 if configmap == nil { 303 return nil, nil 304 } 305 306 annotations := configmap.Annotations 307 acquireTime, err := strconv.ParseInt(annotations["acquire-time"], 10, 64) 308 if err != nil { 309 acquireTime = 0 310 } 311 renewTime, err := strconv.ParseInt(annotations["renew-time"], 10, 64) 312 if err != nil { 313 renewTime = 0 314 } 315 ttl, err := strconv.Atoi(annotations["ttl"]) 316 if err != nil { 317 ttl = viper.GetInt("KB_TTL") 318 } 319 leader := annotations["leader"] 320 stateStr, ok := annotations["dbstate"] 321 var dbState *DBState 322 if ok { 323 dbState = new(DBState) 324 err = json.Unmarshal([]byte(stateStr), &dbState) 325 if err != nil { 326 store.logger.Error(err, fmt.Sprintf("get leader dbstate failed, annotations: %v", annotations)) 327 } 328 } 329 330 if ttl > 0 && time.Now().Unix()-renewTime > int64(ttl) { 331 store.logger.Info(fmt.Sprintf("lock expired: %v, now: %d", annotations, time.Now().Unix())) 332 leader = "" 333 } 334 335 return &Leader{ 336 Index: configmap.ResourceVersion, 337 Name: leader, 338 AcquireTime: acquireTime, 339 RenewTime: renewTime, 340 TTL: ttl, 341 Resource: configmap, 342 DBState: dbState, 343 }, nil 344 } 345 346 func (store *KubernetesStore) DeleteLeader() error { 347 leaderName := store.getLeaderName() 348 err := store.clientset.CoreV1().ConfigMaps(store.namespace).Delete(store.ctx, leaderName, metav1.DeleteOptions{}) 349 if err != nil { 350 store.logger.Error(err, "Delete leader configmap failed") 351 } 352 return err 353 } 354 355 func (store *KubernetesStore) AttempAcquireLease() error { 356 now := strconv.FormatInt(time.Now().Unix(), 10) 357 ttl := store.cluster.HaConfig.ttl 358 leaderName := store.currentMemberName 359 annotation := map[string]string{ 360 "leader": leaderName, 361 "ttl": strconv.Itoa(ttl), 362 "renew-time": now, 363 "acquire-time": now, 364 } 365 366 configMap := store.cluster.Leader.Resource.(*corev1.ConfigMap) 367 configMap.SetAnnotations(annotation) 368 if store.cluster.Leader.DBState != nil { 369 str, _ := json.Marshal(store.cluster.Leader.DBState) 370 configMap.Annotations["dbstate"] = string(str) 371 } 372 cm, err := store.clientset.CoreV1().ConfigMaps(store.namespace).Update(context.TODO(), configMap, metav1.UpdateOptions{}) 373 if err != nil { 374 store.logger.Error(err, "Acquire lease failed") 375 } else { 376 store.cluster.Leader.Resource = cm 377 } 378 379 return err 380 } 381 382 func (store *KubernetesStore) HasLease() bool { 383 return store.cluster != nil && store.cluster.Leader != nil && store.cluster.Leader.Name == store.currentMemberName 384 } 385 386 func (store *KubernetesStore) UpdateLease() error { 387 configMap := store.cluster.Leader.Resource.(*corev1.ConfigMap) 388 389 annotations := configMap.GetAnnotations() 390 if annotations["leader"] != store.currentMemberName { 391 return errors.Errorf("lost lease") 392 } 393 ttl := store.cluster.HaConfig.ttl 394 annotations["ttl"] = strconv.Itoa(ttl) 395 annotations["renew-time"] = strconv.FormatInt(time.Now().Unix(), 10) 396 397 if store.cluster.Leader.DBState != nil { 398 str, _ := json.Marshal(store.cluster.Leader.DBState) 399 configMap.Annotations["dbstate"] = string(str) 400 } 401 configMap.SetAnnotations(annotations) 402 403 _, err := store.clientset.CoreV1().ConfigMaps(store.namespace).Update(context.TODO(), configMap, metav1.UpdateOptions{}) 404 return err 405 } 406 407 func (store *KubernetesStore) ReleaseLease() error { 408 store.logger.Info("release lease") 409 configMap := store.cluster.Leader.Resource.(*corev1.ConfigMap) 410 configMap.Annotations["leader"] = "" 411 412 if store.cluster.Leader.DBState != nil { 413 str, _ := json.Marshal(store.cluster.Leader.DBState) 414 configMap.Annotations["dbstate"] = string(str) 415 } 416 _, err := store.clientset.CoreV1().ConfigMaps(store.namespace).Update(context.TODO(), configMap, metav1.UpdateOptions{}) 417 if err != nil { 418 store.logger.Error(err, "release lease failed") 419 } 420 // TODO: if response status code is 409, it means operation conflict. 421 return err 422 } 423 424 func (store *KubernetesStore) CreateHaConfig(cluster *Cluster) error { 425 haName := store.getHAConfigName() 426 haConfig, _ := store.GetHaConfig() 427 if haConfig.resource != nil { 428 return nil 429 } 430 431 store.logger.Info(fmt.Sprintf("Create Ha ConfigMap: %s", haName)) 432 ttl := viper.GetString(constant.KBEnvTTL) 433 maxLag := viper.GetString(constant.KBEnvMaxLag) 434 enableHA := viper.GetString(constant.KBEnvEnableHA) 435 if enableHA == "" { 436 // enable HA by default 437 enableHA = "true" 438 } 439 haConfigMap := &corev1.ConfigMap{ 440 ObjectMeta: metav1.ObjectMeta{ 441 Name: haName, 442 Annotations: map[string]string{ 443 "ttl": ttl, 444 "enable": enableHA, 445 "MaxLagOnSwitchover": maxLag, 446 }, 447 }, 448 } 449 450 err := store.createConfigMap(haConfigMap) 451 if err != nil { 452 store.logger.Error(err, "Create Ha ConfigMap failed") 453 } 454 return err 455 } 456 457 func (store *KubernetesStore) GetHaConfig() (*HaConfig, error) { 458 configmapName := store.getHAConfigName() 459 deleteMembers := make(map[string]MemberToDelete) 460 configmap, err := store.clientset.CoreV1().ConfigMaps(store.namespace).Get(context.TODO(), configmapName, metav1.GetOptions{}) 461 if err != nil { 462 if !apierrors.IsNotFound(err) { 463 store.logger.Error(err, fmt.Sprintf("Get ha configmap [%s] error", configmapName)) 464 } else { 465 err = nil 466 } 467 return &HaConfig{ 468 index: "", 469 ttl: viper.GetInt("KB_TTL"), 470 maxLagOnSwitchover: 1048576, 471 DeleteMembers: deleteMembers, 472 }, err 473 } 474 475 annotations := configmap.Annotations 476 ttl, err := strconv.Atoi(annotations["ttl"]) 477 if err != nil { 478 ttl = viper.GetInt("KB_TTL") 479 } 480 maxLagOnSwitchover, err := strconv.Atoi(annotations["MaxLagOnSwitchover"]) 481 if err != nil { 482 maxLagOnSwitchover = 1048576 483 } 484 485 enable := false 486 enableStr := annotations["enable"] 487 if enableStr != "" { 488 enable, err = strconv.ParseBool(enableStr) 489 } 490 491 str := annotations["delete-members"] 492 if str != "" { 493 err := json.Unmarshal([]byte(str), &deleteMembers) 494 if err != nil { 495 store.logger.Error(err, fmt.Sprintf("Get delete members [%s] error", str)) 496 } 497 } 498 499 return &HaConfig{ 500 index: configmap.ResourceVersion, 501 ttl: ttl, 502 enable: enable, 503 maxLagOnSwitchover: int64(maxLagOnSwitchover), 504 DeleteMembers: deleteMembers, 505 resource: configmap, 506 }, err 507 } 508 509 func (store *KubernetesStore) UpdateHaConfig() error { 510 haConfig := store.cluster.HaConfig 511 if haConfig.resource == nil { 512 return errors.New("No HA configmap") 513 } 514 515 configMap := haConfig.resource.(*corev1.ConfigMap) 516 annotations := configMap.Annotations 517 annotations["ttl"] = strconv.Itoa(haConfig.ttl) 518 deleteMembers, err := json.Marshal(haConfig.DeleteMembers) 519 if err != nil { 520 store.logger.Error(err, fmt.Sprintf("marsha delete members [%v]", haConfig)) 521 } 522 annotations["delete-members"] = string(deleteMembers) 523 annotations["MaxLagOnSwitchover"] = strconv.Itoa(int(haConfig.maxLagOnSwitchover)) 524 525 _, err = store.clientset.CoreV1().ConfigMaps(store.namespace).Update(context.TODO(), configMap, metav1.UpdateOptions{}) 526 return err 527 } 528 529 func (store *KubernetesStore) GetSwitchOverConfigMap() (*corev1.ConfigMap, error) { 530 switchoverName := store.getSwitchoverName() 531 switchOverConfigMap, err := store.clientset.CoreV1().ConfigMaps(store.namespace).Get(store.ctx, switchoverName, metav1.GetOptions{}) 532 if err != nil { 533 if apierrors.IsNotFound(err) { 534 store.logger.Info(fmt.Sprintf("no switchOver [%s] setting", switchoverName)) 535 return nil, nil 536 } 537 store.logger.Error(err, "Get switchOver configmap failed") 538 } 539 return switchOverConfigMap, err 540 } 541 542 func (store *KubernetesStore) GetSwitchover() (*Switchover, error) { 543 switchOverConfigMap, _ := store.GetSwitchOverConfigMap() 544 if switchOverConfigMap == nil { 545 return nil, nil 546 } 547 annotations := switchOverConfigMap.Annotations 548 scheduledAt, _ := strconv.Atoi(annotations["scheduled-at"]) 549 switchOver := newSwitchover(switchOverConfigMap.ResourceVersion, annotations["leader"], annotations["candidate"], int64(scheduledAt)) 550 return switchOver, nil 551 } 552 553 func (store *KubernetesStore) CreateSwitchover(leader, candidate string) error { 554 switchoverName := store.getSwitchoverName() 555 switchover, _ := store.GetSwitchover() 556 if switchover != nil { 557 return fmt.Errorf("there is another switchover %s unfinished", switchoverName) 558 } 559 560 store.logger.Info(fmt.Sprintf("Create switchover configmap %s", switchoverName)) 561 swConfigMap := &corev1.ConfigMap{ 562 ObjectMeta: metav1.ObjectMeta{ 563 Name: switchoverName, 564 Annotations: map[string]string{ 565 "leader": leader, 566 "candidate": candidate, 567 }, 568 }, 569 } 570 571 err := store.createConfigMap(swConfigMap) 572 if err != nil { 573 store.logger.Error(err, "Create switchover configmap failed") 574 return err 575 } 576 return nil 577 } 578 579 func (store *KubernetesStore) DeleteSwitchover() error { 580 switchoverName := store.getSwitchoverName() 581 err := store.clientset.CoreV1().ConfigMaps(store.namespace).Delete(store.ctx, switchoverName, metav1.DeleteOptions{}) 582 if err != nil { 583 store.logger.Error(err, "Delete switchOver configmap failed") 584 } 585 return err 586 } 587 588 func (store *KubernetesStore) getLeaderName() string { 589 return store.clusterCompName + "-leader" 590 } 591 592 func (store *KubernetesStore) getHAConfigName() string { 593 return store.clusterCompName + "-haconfig" 594 } 595 596 func (store *KubernetesStore) getSwitchoverName() string { 597 return store.clusterCompName + "-switchover" 598 } 599 600 func (store *KubernetesStore) createConfigMap(configMap *corev1.ConfigMap) error { 601 labelsMap := map[string]string{ 602 "app.kubernetes.io/instance": store.clusterName, 603 "app.kubernetes.io/managed-by": "kubeblocks", 604 "apps.kubeblocks.io/component-name": store.componentName, 605 } 606 607 configMap.Labels = labelsMap 608 configMap.Namespace = store.namespace 609 configMap.OwnerReferences = []metav1.OwnerReference{getOwnerRef(store.cluster)} 610 _, err := store.clientset.CoreV1().ConfigMaps(store.namespace).Create(store.ctx, configMap, metav1.CreateOptions{}) 611 if err != nil { 612 return err 613 } 614 return nil 615 } 616 617 func (store *KubernetesStore) AddCurrentMember() error { 618 return nil 619 } 620 621 // TODO: Use the database instance's character type to determine its port number more precisely 622 func getDBPort(pod *corev1.Pod) string { 623 mainContainer := pod.Spec.Containers[0] 624 port := mainContainer.Ports[0] 625 dbPort := port.ContainerPort 626 return strconv.Itoa(int(dbPort)) 627 } 628 629 func getLorryPort(pod *corev1.Pod) string { 630 for _, container := range pod.Spec.Containers { 631 for _, port := range container.Ports { 632 if port.Name == "probe-http-port" { 633 return strconv.Itoa(int(port.ContainerPort)) 634 } 635 } 636 } 637 return "" 638 } 639 640 func getOwnerRef(cluster *Cluster) metav1.OwnerReference { 641 clusterObj := cluster.resource.(*appsv1alpha1.Cluster) 642 gvk, _ := apiutil.GVKForObject(clusterObj, scheme.Scheme) 643 ownerRef := metav1.OwnerReference{ 644 APIVersion: gvk.GroupVersion().String(), 645 Kind: gvk.Kind, 646 UID: clusterObj.UID, 647 Name: clusterObj.Name, 648 } 649 return ownerRef 650 }