github.com/kubewharf/katalyst-core@v0.5.3/pkg/util/general/healthz.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package general 18 19 import ( 20 "fmt" 21 "sync" 22 "time" 23 ) 24 25 var ( 26 healthzCheckMap = make(map[HealthzCheckName]*healthzCheckStatus) 27 healthzCheckLock sync.RWMutex 28 ) 29 30 // HealthzCheckName describes which rule name for this check 31 type HealthzCheckName string 32 33 // HealthzCheckState describes the checking results 34 type HealthzCheckState string 35 36 type HealthzCheckMode string 37 38 type HealthzCheckResult struct { 39 Ready bool `json:"ready"` 40 Message string `json:"message"` 41 } 42 43 type healthzCheckStatus struct { 44 State HealthzCheckState `json:"state"` 45 Message string `json:"message"` 46 LastUpdateTime time.Time `json:"lastUpdateTime"` 47 48 Mode HealthzCheckMode `json:"mode"` 49 50 // in HealthzCheckModeHeartBeat mode, when LastUpdateTime is not updated for more than TimeoutPeriod, we consider this rule is failed. 51 // 0 or negative value means no need to check the LastUpdateTime. 52 TimeoutPeriod time.Duration `json:"timeoutPeriod"` 53 UnhealthyStartTime time.Time `json:"unhealthyStartTime"` 54 // in HealthzCheckModeHeartBeat mode, when current State is not HealthzCheckStateReady, and it lasts more than 55 // TolerationPeriod, we consider this rule is failed. 0 or negative value means no need to check the UnhealthyStartTime. 56 TolerationPeriod time.Duration `json:"gracePeriod"` 57 58 LatestUnhealthyTime time.Time `json:"latestUnhealthyTime"` 59 // in HealthzCheckModeReport mode, when LatestUnhealthyTime is not earlier than AutoRecoverPeriod ago, we consider this rule 60 // is failed. 61 AutoRecoverPeriod time.Duration `json:"autoRecoverPeriod"` 62 mutex sync.RWMutex 63 } 64 65 func (h *healthzCheckStatus) update(state HealthzCheckState, message string) { 66 h.mutex.Lock() 67 defer h.mutex.Unlock() 68 69 now := time.Now() 70 h.Message = message 71 h.LastUpdateTime = now 72 if h.State == HealthzCheckStateReady && state != HealthzCheckStateReady { 73 h.UnhealthyStartTime = now 74 } 75 if state != HealthzCheckStateReady { 76 h.LatestUnhealthyTime = now 77 } 78 h.State = state 79 } 80 81 const ( 82 HealthzCheckStateReady HealthzCheckState = "Ready" 83 HealthzCheckStateNotReady HealthzCheckState = "NotReady" 84 HealthzCheckStateUnknown HealthzCheckState = "Unknown" 85 HealthzCheckStateFailed HealthzCheckState = "Failed" 86 87 InitMessage = "Init" 88 89 // HealthzCheckModeHeartBeat in this mode, caller should update the check status regularly like a heartbeat, once 90 // the heartbeat stops for more than TimeoutPeriod or the state is not HealthzCheckStateReady for more than GracePeriod, 91 // this rule will be considered as unhealthy. 92 HealthzCheckModeHeartBeat HealthzCheckMode = "heartbeat" 93 // HealthzCheckModeReport in this mode, caller only reports the failed state when the function does not work well. 94 // when the LatestUnhealthyTime is not earlier than the GracePeriod ago, we consider this rule as unhealthy. 95 // if caller doesn't report new failed state for more than GracePeriod, we consider the exception recovered. 96 HealthzCheckModeReport HealthzCheckMode = "report" 97 ) 98 99 // HealthzCheckFunc defined as a common function to define whether the corresponding component is healthy. 100 type HealthzCheckFunc func() (healthzCheckStatus, error) 101 102 func RegisterHeartbeatCheck(name string, timeout time.Duration, initState HealthzCheckState, tolerationPeriod time.Duration) { 103 healthzCheckLock.Lock() 104 defer healthzCheckLock.Unlock() 105 106 healthzCheckMap[HealthzCheckName(name)] = &healthzCheckStatus{ 107 State: initState, 108 Message: InitMessage, 109 LastUpdateTime: time.Now(), 110 TimeoutPeriod: timeout, 111 TolerationPeriod: tolerationPeriod, 112 Mode: HealthzCheckModeHeartBeat, 113 } 114 } 115 116 func RegisterReportCheck(name string, autoRecoverPeriod time.Duration) { 117 healthzCheckLock.Lock() 118 defer healthzCheckLock.Unlock() 119 120 healthzCheckMap[HealthzCheckName(name)] = &healthzCheckStatus{ 121 State: HealthzCheckStateReady, 122 Message: InitMessage, 123 AutoRecoverPeriod: autoRecoverPeriod, 124 Mode: HealthzCheckModeReport, 125 } 126 } 127 128 func UpdateHealthzStateByError(name string, err error) error { 129 if err != nil { 130 return UpdateHealthzState(name, HealthzCheckStateNotReady, err.Error()) 131 } else { 132 return UpdateHealthzState(name, HealthzCheckStateReady, "") 133 } 134 } 135 136 func UpdateHealthzState(name string, state HealthzCheckState, message string) error { 137 healthzCheckLock.RLock() 138 defer healthzCheckLock.RUnlock() 139 140 status, ok := healthzCheckMap[HealthzCheckName(name)] 141 if !ok { 142 Errorf("check rule %v not found", name) 143 return fmt.Errorf("check rule %v not found", name) 144 } 145 status.update(state, message) 146 return nil 147 } 148 149 func GetRegisterReadinessCheckResult() map[HealthzCheckName]HealthzCheckResult { 150 healthzCheckLock.RLock() 151 defer healthzCheckLock.RUnlock() 152 153 results := make(map[HealthzCheckName]HealthzCheckResult) 154 for name, checkStatus := range healthzCheckMap { 155 func() { 156 checkStatus.mutex.RLock() 157 defer checkStatus.mutex.RUnlock() 158 159 ready := true 160 message := checkStatus.Message 161 switch checkStatus.Mode { 162 case HealthzCheckModeHeartBeat: 163 if checkStatus.TimeoutPeriod > 0 && time.Now().Sub(checkStatus.LastUpdateTime) > checkStatus.TimeoutPeriod { 164 ready = false 165 message = fmt.Sprintf("the status has not been updated for more than %v, last update time is %v", checkStatus.TimeoutPeriod, checkStatus.LastUpdateTime) 166 } 167 168 if checkStatus.TolerationPeriod <= 0 && checkStatus.State != HealthzCheckStateReady { 169 ready = false 170 } 171 172 if checkStatus.TolerationPeriod > 0 && time.Now().Sub(checkStatus.UnhealthyStartTime) > checkStatus.TolerationPeriod && 173 checkStatus.State != HealthzCheckStateReady { 174 ready = false 175 } 176 case HealthzCheckModeReport: 177 if checkStatus.LatestUnhealthyTime.After(time.Now().Add(-checkStatus.TolerationPeriod)) { 178 ready = false 179 } 180 } 181 results[name] = HealthzCheckResult{ 182 Ready: ready, 183 Message: message, 184 } 185 }() 186 } 187 return results 188 }