github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/pkg/lorry/operations/replica/checkrunning.go (about)

     1  /*
     2  Copyright (C) 2022-2023 ApeCloud Co., Ltd
     3  
     4  This file is part of KubeBlocks project
     5  
     6  This program is free software: you can redistribute it and/or modify
     7  it under the terms of the GNU Affero General Public License as published by
     8  the Free Software Foundation, either version 3 of the License, or
     9  (at your option) any later version.
    10  
    11  This program is distributed in the hope that it will be useful
    12  but WITHOUT ANY WARRANTY; without even the implied warranty of
    13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14  GNU Affero General Public License for more details.
    15  
    16  You should have received a copy of the GNU Affero General Public License
    17  along with this program.  If not, see <http://www.gnu.org/licenses/>.
    18  */
    19  
    20  package replica
    21  
    22  import (
    23  	"context"
    24  	"fmt"
    25  	"net"
    26  	"strconv"
    27  	"time"
    28  
    29  	"github.com/go-logr/logr"
    30  	"github.com/pkg/errors"
    31  	"github.com/spf13/viper"
    32  	ctrl "sigs.k8s.io/controller-runtime"
    33  
    34  	"github.com/1aal/kubeblocks/pkg/constant"
    35  	"github.com/1aal/kubeblocks/pkg/lorry/engines/register"
    36  	"github.com/1aal/kubeblocks/pkg/lorry/operations"
    37  	"github.com/1aal/kubeblocks/pkg/lorry/util"
    38  )
    39  
// CheckRunning is the operation that checks whether the database service is
// running by opening a TCP connection to it. While the check keeps failing, an
// event is reported once every FailedEventReportFrequency failed checks.
//
// Fields:
//   - logger: logger named "checkrunning", assigned in Init.
//   - Timeout: 80% of the configured probe timeout, computed in Init.
//     NOTE(review): not read by the methods visible in this file.
//   - DBAddress: host part of the address dialed by Do; Init sets it from getAddress.
//   - CheckRunningFailedCount: number of consecutive failed checks; Do resets it
//     to 0 after a successful connection.
//   - FailedEventReportFrequency: number of failed checks between two reported
//     events; Init clamps it to the range [300, 3600].
type CheckRunning struct {
	operations.Base
	logger                     logr.Logger
	Timeout                    time.Duration
	DBAddress                  string
	CheckRunningFailedCount    int
	FailedEventReportFrequency int
}
    50  
    51  var checkrunning operations.Operation = &CheckRunning{}
    52  
    53  func init() {
    54  	err := operations.Register("checkrunning", checkrunning)
    55  	if err != nil {
    56  		panic(err.Error())
    57  	}
    58  }
    59  
    60  func (s *CheckRunning) Init(ctx context.Context) error {
    61  	s.FailedEventReportFrequency = viper.GetInt("KB_FAILED_EVENT_REPORT_FREQUENCY")
    62  	if s.FailedEventReportFrequency < 300 {
    63  		s.FailedEventReportFrequency = 300
    64  	} else if s.FailedEventReportFrequency > 3600 {
    65  		s.FailedEventReportFrequency = 3600
    66  	}
    67  
    68  	timeoutSeconds := util.DefaultProbeTimeoutSeconds
    69  	if viper.IsSet(constant.KBEnvRoleProbeTimeout) {
    70  		timeoutSeconds = viper.GetInt(constant.KBEnvRoleProbeTimeout)
    71  	}
    72  	// lorry utilizes the pod readiness probe to trigger probe and 'timeoutSeconds' is directly copied from the 'probe.timeoutSeconds' field of pod.
    73  	// here we give 80% of the total time to probe job and leave the remaining 20% to kubelet to handle the readiness probe related tasks.
    74  	s.Timeout = time.Duration(timeoutSeconds) * (800 * time.Millisecond)
    75  	s.DBAddress = s.getAddress()
    76  	s.logger = ctrl.Log.WithName("checkrunning")
    77  	return nil
    78  }
    79  
    80  func (s *CheckRunning) Do(ctx context.Context, req *operations.OpsRequest) (*operations.OpsResponse, error) {
    81  	manager, err := register.GetDBManager()
    82  	if err != nil {
    83  		return nil, errors.Wrap(err, "get manager failed")
    84  	}
    85  
    86  	var message string
    87  	opsRsp := &operations.OpsResponse{}
    88  	opsRsp.Data["operation"] = util.CheckRunningOperation
    89  
    90  	dbPort, err := manager.GetPort()
    91  	if err != nil {
    92  		return nil, errors.Wrap(err, "get db port failed")
    93  	}
    94  
    95  	host := net.JoinHostPort(s.DBAddress, strconv.Itoa(dbPort))
    96  	// sql exec timeout needs to be less than httpget's timeout which by default 1s.
    97  	conn, err := net.DialTimeout("tcp", host, 500*time.Millisecond)
    98  	if err != nil {
    99  		message = fmt.Sprintf("running check %s error", host)
   100  		s.logger.Error(err, message)
   101  		opsRsp.Data["event"] = util.OperationFailed
   102  		opsRsp.Data["message"] = message
   103  		if s.CheckRunningFailedCount%s.FailedEventReportFrequency == 0 {
   104  			s.logger.Info("running checks failed continuously", "times", s.CheckRunningFailedCount)
   105  			// resp.Metadata[StatusCode] = OperationFailedHTTPCode
   106  			err = util.SentEventForProbe(ctx, opsRsp.Data)
   107  		}
   108  		s.CheckRunningFailedCount++
   109  		return opsRsp, err
   110  	}
   111  	defer conn.Close()
   112  	s.CheckRunningFailedCount = 0
   113  	message = "TCP Connection Established Successfully!"
   114  	if tcpCon, ok := conn.(*net.TCPConn); ok {
   115  		err := tcpCon.SetLinger(0)
   116  		s.logger.Error(err, "running check, set tcp linger failed")
   117  	}
   118  	opsRsp.Data["event"] = util.OperationSuccess
   119  	opsRsp.Data["message"] = message
   120  	return opsRsp, nil
   121  }
   122  
   123  // getAddress returns component service address, if component is not listening on
   124  // 127.0.0.1, the Operation needs to overwrite this function and set ops.DBAddress
   125  func (s *CheckRunning) getAddress() string {
   126  	return "127.0.0.1"
   127  }