github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/pkg/lorry/operations/replica/checkrunning.go (about) 1 /* 2 Copyright (C) 2022-2023 ApeCloud Co., Ltd 3 4 This file is part of KubeBlocks project 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU Affero General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU Affero General Public License for more details. 15 16 You should have received a copy of the GNU Affero General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 package replica 21 22 import ( 23 "context" 24 "fmt" 25 "net" 26 "strconv" 27 "time" 28 29 "github.com/go-logr/logr" 30 "github.com/pkg/errors" 31 "github.com/spf13/viper" 32 ctrl "sigs.k8s.io/controller-runtime" 33 34 "github.com/1aal/kubeblocks/pkg/constant" 35 "github.com/1aal/kubeblocks/pkg/lorry/engines/register" 36 "github.com/1aal/kubeblocks/pkg/lorry/operations" 37 "github.com/1aal/kubeblocks/pkg/lorry/util" 38 ) 39 40 // CheckRunning checks whether the binding service is in running status, 41 // If check fails continuously, report an event at FailedEventReportFrequency frequency 42 type CheckRunning struct { 43 operations.Base 44 logger logr.Logger 45 Timeout time.Duration 46 DBAddress string 47 CheckRunningFailedCount int 48 FailedEventReportFrequency int 49 } 50 51 var checkrunning operations.Operation = &CheckRunning{} 52 53 func init() { 54 err := operations.Register("checkrunning", checkrunning) 55 if err != nil { 56 panic(err.Error()) 57 } 58 } 59 60 func (s *CheckRunning) Init(ctx context.Context) error { 61 s.FailedEventReportFrequency = viper.GetInt("KB_FAILED_EVENT_REPORT_FREQUENCY") 62 if s.FailedEventReportFrequency < 300 { 63 s.FailedEventReportFrequency = 300 64 } else if s.FailedEventReportFrequency > 3600 { 65 s.FailedEventReportFrequency = 3600 66 } 67 68 timeoutSeconds := util.DefaultProbeTimeoutSeconds 69 if viper.IsSet(constant.KBEnvRoleProbeTimeout) { 70 timeoutSeconds = viper.GetInt(constant.KBEnvRoleProbeTimeout) 71 } 72 // lorry utilizes the pod readiness probe to trigger probe and 'timeoutSeconds' is directly copied from the 'probe.timeoutSeconds' field of pod. 73 // here we give 80% of the total time to probe job and leave the remaining 20% to kubelet to handle the readiness probe related tasks. 74 s.Timeout = time.Duration(timeoutSeconds) * (800 * time.Millisecond) 75 s.DBAddress = s.getAddress() 76 s.logger = ctrl.Log.WithName("checkrunning") 77 return nil 78 } 79 80 func (s *CheckRunning) Do(ctx context.Context, req *operations.OpsRequest) (*operations.OpsResponse, error) { 81 manager, err := register.GetDBManager() 82 if err != nil { 83 return nil, errors.Wrap(err, "get manager failed") 84 } 85 86 var message string 87 opsRsp := &operations.OpsResponse{} 88 opsRsp.Data["operation"] = util.CheckRunningOperation 89 90 dbPort, err := manager.GetPort() 91 if err != nil { 92 return nil, errors.Wrap(err, "get db port failed") 93 } 94 95 host := net.JoinHostPort(s.DBAddress, strconv.Itoa(dbPort)) 96 // sql exec timeout needs to be less than httpget's timeout which by default 1s. 97 conn, err := net.DialTimeout("tcp", host, 500*time.Millisecond) 98 if err != nil { 99 message = fmt.Sprintf("running check %s error", host) 100 s.logger.Error(err, message) 101 opsRsp.Data["event"] = util.OperationFailed 102 opsRsp.Data["message"] = message 103 if s.CheckRunningFailedCount%s.FailedEventReportFrequency == 0 { 104 s.logger.Info("running checks failed continuously", "times", s.CheckRunningFailedCount) 105 // resp.Metadata[StatusCode] = OperationFailedHTTPCode 106 err = util.SentEventForProbe(ctx, opsRsp.Data) 107 } 108 s.CheckRunningFailedCount++ 109 return opsRsp, err 110 } 111 defer conn.Close() 112 s.CheckRunningFailedCount = 0 113 message = "TCP Connection Established Successfully!" 114 if tcpCon, ok := conn.(*net.TCPConn); ok { 115 err := tcpCon.SetLinger(0) 116 s.logger.Error(err, "running check, set tcp linger failed") 117 } 118 opsRsp.Data["event"] = util.OperationSuccess 119 opsRsp.Data["message"] = message 120 return opsRsp, nil 121 } 122 123 // getAddress returns component service address, if component is not listening on 124 // 127.0.0.1, the Operation needs to overwrite this function and set ops.DBAddress 125 func (s *CheckRunning) getAddress() string { 126 return "127.0.0.1" 127 }