github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/pkg/lorry/operations/volume/protect.go (about)

     1  /*
     2  Copyright (C) 2022-2023 ApeCloud Co., Ltd
     3  
     4  This file is part of KubeBlocks project
     5  
     6  This program is free software: you can redistribute it and/or modify
     7  it under the terms of the GNU Affero General Public License as published by
     8  the Free Software Foundation, either version 3 of the License, or
     9  (at your option) any later version.
    10  
    11  This program is distributed in the hope that it will be useful
    12  but WITHOUT ANY WARRANTY; without even the implied warranty of
    13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14  GNU Affero General Public License for more details.
    15  
    16  You should have received a copy of the GNU Affero General Public License
    17  along with this program.  If not, see <http://www.gnu.org/licenses/>.
    18  */
    19  
    20  package volume
    21  
    22  import (
    23  	"context"
    24  	"crypto/tls"
    25  	"crypto/x509"
    26  	"encoding/json"
    27  	"fmt"
    28  	"io"
    29  	"net/http"
    30  	"os"
    31  	"strconv"
    32  
    33  	"github.com/go-logr/logr"
    34  	"github.com/pkg/errors"
    35  	"github.com/spf13/viper"
    36  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    37  	"k8s.io/client-go/kubernetes"
    38  	"k8s.io/client-go/rest"
    39  	statsv1alpha1 "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
    40  	ctrl "sigs.k8s.io/controller-runtime"
    41  
    42  	appsv1alpha1 "github.com/1aal/kubeblocks/apis/apps/v1alpha1"
    43  	"github.com/1aal/kubeblocks/pkg/constant"
    44  	"github.com/1aal/kubeblocks/pkg/lorry/engines"
    45  	"github.com/1aal/kubeblocks/pkg/lorry/engines/register"
    46  	"github.com/1aal/kubeblocks/pkg/lorry/operations"
    47  	"github.com/1aal/kubeblocks/pkg/lorry/util"
    48  )
    49  
const (
	// kubeletStatsSummaryURL is the format string of the kubelet stats summary
	// endpoint; its two verbs are filled with the host and the port, in that order.
	kubeletStatsSummaryURL = "https://%s:%s/stats/summary"

	// certFile is the PEM bundle loaded as root CAs for the kubelet HTTPS client,
	// tokenFile holds the bearer token sent in the Authorization header.
	certFile  = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
	tokenFile = "/var/run/secrets/kubernetes.io/serviceaccount/token"

	// reasonLock / reasonUnlock are the event reasons used when the instance is
	// locked (read-only) and unlocked (read-write) respectively.
	reasonLock   = "HighVolumeWatermark"
	reasonUnlock = "LowVolumeWatermark" // TODO
)
    59  
// volumeStatsRequester abstracts the source of the raw stats summary payload,
// so that Protection can be wired with something other than the HTTPS
// implementation (Init only installs the default when Requester is nil).
type volumeStatsRequester interface {
	// init prepares the requester; Protection.Init aborts when it fails.
	init(ctx context.Context) error
	// request returns the raw summary payload, which Protection decodes as a
	// JSON-encoded statsv1alpha1.Summary.
	request(ctx context.Context) ([]byte, error)
}
    64  
// volumeExt couples a monitored volume with its effective watermark and the
// most recent stats copied from the kubelet summary.
type volumeExt struct {
	Name          string                    // volume name; also the key in Protection.Volumes
	HighWatermark int                       // usage threshold in percent; 0 disables the check for this volume
	Stats         statsv1alpha1.VolumeStats // latest stats for the volume; only Name is set until the first update
}
    70  
// Protection is the volume protection operation: it reads volume usage from
// the stats summary, locks the instance once a monitored volume reaches its
// high watermark, and unlocks it again when every monitored volume is back
// under its threshold.
type Protection struct {
	operations.Base
	dbManager     engines.DBManager    // used to lock and unlock the instance; set in Init
	Requester     volumeStatsRequester // source of the stats summary; Init installs the HTTPS one when nil
	Pod           string               // pod name, read in Init from constant.KBEnvPodName
	HighWatermark int                  // default watermark (percent) taken from the protection spec
	Volumes       map[string]volumeExt // monitored volumes keyed by volume name
	Readonly      bool                 // true after a successful lock, false after a successful unlock
	SendEvent     bool                 // to disable event for testing
	Logger        logr.Logger
}
    82  
    83  func (p *Protection) Init(ctx context.Context) error {
    84  	p.Logger = ctrl.Log.WithName("Volume-Protection")
    85  	if p.Requester == nil {
    86  		p.Requester = &httpsVolumeStatsRequester{
    87  			logger: p.Logger,
    88  		}
    89  	}
    90  	p.SendEvent = true
    91  
    92  	dbManager, err := register.GetDBManager()
    93  	if err != nil {
    94  		return errors.Wrap(err, "get manager failed")
    95  	}
    96  	p.dbManager = dbManager
    97  
    98  	if err := p.Requester.init(ctx); err != nil {
    99  		return err
   100  	}
   101  
   102  	p.Pod = viper.GetString(constant.KBEnvPodName)
   103  	if err := p.initVolumes(); err != nil {
   104  		p.Logger.Error(err, "init volumes to monitor error")
   105  	}
   106  	p.Logger.Info("succeed to init volume protection", "pod", p.Pod, "spec", p.buildVolumesMsg())
   107  	return nil
   108  }
   109  
// PreCheck performs no validation for this operation and always returns nil;
// both arguments are ignored.
func (p *Protection) PreCheck(ctx context.Context, req *operations.OpsRequest) error {
	return nil
}
   113  
   114  func (p *Protection) Do(ctx context.Context, req *operations.OpsRequest) (*operations.OpsResponse, error) {
   115  	if p.disabled() {
   116  		p.Logger.Info("the volume protection operation is disabled")
   117  		return nil, nil
   118  	}
   119  
   120  	summary, err := p.Requester.request(ctx)
   121  	if err != nil {
   122  		p.Logger.Error(err, "request stats summary from kubelet error")
   123  		return nil, err
   124  	}
   125  
   126  	if err = p.updateVolumeStats(summary); err != nil {
   127  		return nil, err
   128  	}
   129  
   130  	volumeUsages, err := p.checkUsage(ctx)
   131  	resp := &operations.OpsResponse{
   132  		Data: map[string]any{},
   133  	}
   134  	if err == nil {
   135  		resp.Data["protect"] = volumeUsages
   136  	}
   137  	return resp, err
   138  }
   139  
   140  func (p *Protection) initVolumes() error {
   141  	spec := &appsv1alpha1.VolumeProtectionSpec{}
   142  	raw := viper.GetString(constant.KBEnvVolumeProtectionSpec)
   143  	if err := json.Unmarshal([]byte(raw), spec); err != nil {
   144  		p.Logger.Error(err, "unmarshal volume protection spec error", "raw spec", raw)
   145  		return err
   146  	}
   147  
   148  	p.HighWatermark = normalizeVolumeWatermark(&spec.HighWatermark, 0)
   149  
   150  	if p.Volumes == nil {
   151  		p.Volumes = make(map[string]volumeExt)
   152  	}
   153  	for _, v := range spec.Volumes {
   154  		p.Volumes[v.Name] = volumeExt{
   155  			Name:          v.Name,
   156  			HighWatermark: normalizeVolumeWatermark(v.HighWatermark, p.HighWatermark),
   157  			Stats: statsv1alpha1.VolumeStats{
   158  				Name: v.Name,
   159  			},
   160  		}
   161  	}
   162  	return nil
   163  }
   164  
   165  func (p *Protection) disabled() bool {
   166  	// TODO: check the role and skip secondary instances.
   167  	if len(p.Pod) == 0 || len(p.Volumes) == 0 {
   168  		return true
   169  	}
   170  	for _, v := range p.Volumes {
   171  		// take (0, 100] as enabled
   172  		if v.HighWatermark > 0 && v.HighWatermark <= 100 {
   173  			return false
   174  		}
   175  	}
   176  	return true
   177  }
   178  
   179  func (p *Protection) updateVolumeStats(payload []byte) error {
   180  	summary := &statsv1alpha1.Summary{}
   181  	if err := json.Unmarshal(payload, summary); err != nil {
   182  		p.Logger.Error(err, "stats summary obtained from kubelet error")
   183  		return err
   184  	}
   185  	for _, pod := range summary.Pods {
   186  		if pod.PodRef.Name == p.Pod {
   187  			for _, stats := range pod.VolumeStats {
   188  				if _, ok := p.Volumes[stats.Name]; !ok {
   189  					continue
   190  				}
   191  				v := p.Volumes[stats.Name]
   192  				v.Stats = stats
   193  				p.Volumes[stats.Name] = v
   194  			}
   195  			break
   196  		}
   197  	}
   198  	return nil
   199  }
   200  
   201  func (p *Protection) checkUsage(ctx context.Context) (map[string]any, error) {
   202  	lower := make([]string, 0)
   203  	higher := make([]string, 0)
   204  	for name, v := range p.Volumes {
   205  		ret := p.checkVolumeWatermark(v)
   206  		if ret == 0 {
   207  			lower = append(lower, name)
   208  		} else {
   209  			higher = append(higher, name)
   210  		}
   211  	}
   212  
   213  	volumeUsages := p.buildVolumesMsg()
   214  	readonly := p.Readonly
   215  	// the instance is running normally and there have volume(s) over the space usage threshold.
   216  	if !readonly && len(higher) > 0 {
   217  		if err := p.highWatermark(ctx, volumeUsages); err != nil {
   218  			return volumeUsages, err
   219  		}
   220  	}
   221  	// the instance is protected in RO mode, and all volumes' space usage are under the threshold.
   222  	if readonly && len(lower) == len(p.Volumes) {
   223  		if err := p.lowWatermark(ctx, volumeUsages); err != nil {
   224  			return volumeUsages, err
   225  		}
   226  	}
   227  	return volumeUsages, nil
   228  }
   229  
   230  // checkVolumeWatermark checks whether the volume's space usage is over the threshold.
   231  //
   232  //	returns 0 if the volume will not be taken in account or its space usage is under the threshold
   233  //	returns non-zero if the volume space usage is over the threshold
   234  func (p *Protection) checkVolumeWatermark(v volumeExt) int {
   235  	if v.HighWatermark == 0 { // disabled
   236  		return 0
   237  	}
   238  	if v.Stats.CapacityBytes == nil || v.Stats.UsedBytes == nil {
   239  		return 0
   240  	}
   241  	thresholdBytes := *v.Stats.CapacityBytes / 100 * uint64(v.HighWatermark)
   242  	if *v.Stats.UsedBytes < thresholdBytes {
   243  		return 0
   244  	}
   245  	return 1
   246  }
   247  
   248  func (p *Protection) highWatermark(ctx context.Context, volumeUsages map[string]any) error {
   249  	if p.Readonly { // double check
   250  		return nil
   251  	}
   252  	if err := p.lockInstance(ctx); err != nil {
   253  		p.Logger.Error(err, "set instance to read-only error", "volumes", volumeUsages)
   254  		return err
   255  	}
   256  
   257  	p.Logger.Info("set instance to read-only OK", "msg", volumeUsages)
   258  	p.Readonly = true
   259  
   260  	if err := p.sendEvent(ctx, reasonLock, volumeUsages); err != nil {
   261  		p.Logger.Error(err, "send volume protection (lock) event error", "volumes", volumeUsages)
   262  		return err
   263  	}
   264  	return nil
   265  }
   266  
   267  func (p *Protection) lowWatermark(ctx context.Context, volumeUsages map[string]any) error {
   268  	if !p.Readonly { // double check
   269  		return nil
   270  	}
   271  	if err := p.unlockInstance(ctx); err != nil {
   272  		p.Logger.Error(err, "reset instance to read-write error", "volumes", volumeUsages)
   273  		return err
   274  	}
   275  
   276  	p.Logger.Info("reset instance to read-write OK", "msg", volumeUsages)
   277  	p.Readonly = false
   278  
   279  	if err := p.sendEvent(ctx, reasonUnlock, volumeUsages); err != nil {
   280  		p.Logger.Error(err, "send volume protection (unlock) event error", "volumes", volumeUsages)
   281  		return err
   282  	}
   283  	return nil
   284  }
   285  
// lockInstance asks the DB manager to lock the instance, passing "disk full"
// as the reason, and returns the manager's error unchanged.
func (p *Protection) lockInstance(ctx context.Context) error {
	return p.dbManager.Lock(ctx, "disk full")
}
   289  
// unlockInstance asks the DB manager to unlock the instance and returns the
// manager's error unchanged.
func (p *Protection) unlockInstance(ctx context.Context) error {
	return p.dbManager.Unlock(ctx)
}
   293  
   294  func (p *Protection) buildVolumesMsg() map[string]any {
   295  	volumes := make([]map[string]string, 0)
   296  	for _, v := range p.Volumes {
   297  		usage := make(map[string]string)
   298  		if v.HighWatermark != p.HighWatermark {
   299  			usage["highWatermark"] = fmt.Sprintf("%d", v.HighWatermark)
   300  		}
   301  		stats := v.Stats
   302  		if stats.UsedBytes == nil || stats.CapacityBytes == nil {
   303  			usage[v.Name] = "<nil>"
   304  		} else {
   305  			usage[v.Name] = fmt.Sprintf("%d%%", int(*stats.UsedBytes*100 / *stats.CapacityBytes))
   306  		}
   307  		volumes = append(volumes, usage)
   308  	}
   309  	usages := map[string]any{
   310  		"highWatermark": fmt.Sprintf("%d", p.HighWatermark),
   311  		"volumes":       volumes,
   312  	}
   313  	return usages
   314  }
   315  
   316  func (p *Protection) sendEvent(ctx context.Context, reason string, volumeUsages map[string]any) error {
   317  	if p.SendEvent {
   318  		event, err := util.CreateEvent(reason, volumeUsages)
   319  		if err != nil {
   320  			return errors.Wrap(err, "create volume protection event failed")
   321  		}
   322  		return util.SendEvent(ctx, event)
   323  	}
   324  	return nil
   325  }
   326  
// httpsVolumeStatsRequester fetches the stats summary from the kubelet over HTTPS.
type httpsVolumeStatsRequester struct {
	logger logr.Logger
	cli    *http.Client  // built in init
	req    *http.Request // built lazily on the first request and then reused
}

// compile-time check that the requester satisfies volumeStatsRequester
var _ volumeStatsRequester = &httpsVolumeStatsRequester{}
   334  
   335  func (r *httpsVolumeStatsRequester) init(ctx context.Context) error {
   336  	var err error
   337  	if r.cli, err = httpClient(); err != nil {
   338  		r.logger.Error(err, "build HTTP client error at setup")
   339  		return err
   340  	}
   341  	// if r.req, err = httpRequest(ctx); err != nil {
   342  	// 	r.logger.Error(err, "build HTTP request error at setup, will try it later")
   343  	// }
   344  	return nil
   345  }
   346  
   347  func (r *httpsVolumeStatsRequester) request(ctx context.Context) ([]byte, error) {
   348  	if r.cli == nil {
   349  		return nil, fmt.Errorf("HTTP client for kubelet is unavailable")
   350  	}
   351  	if r.req == nil {
   352  		// try to build http request again
   353  		var err error
   354  		r.req, err = httpRequest(ctx)
   355  		if err != nil {
   356  			r.logger.Error(err, "build HTTP request to query kubelet error")
   357  			return nil, err
   358  		}
   359  	}
   360  
   361  	req := r.req.WithContext(ctx)
   362  	rsp, err := r.cli.Do(req)
   363  	if err != nil {
   364  		r.logger.Error(err, "issue request to kubelet error")
   365  		return nil, err
   366  	}
   367  	if rsp.StatusCode != 200 {
   368  		r.logger.Error(nil, fmt.Sprintf("HTTP response from kubelet error: %s", rsp.Status))
   369  		return nil, fmt.Errorf(rsp.Status)
   370  	}
   371  
   372  	defer rsp.Body.Close()
   373  	return io.ReadAll(rsp.Body)
   374  }
   375  
   376  func httpClient() (*http.Client, error) {
   377  	cert, err := os.ReadFile(certFile)
   378  	if err != nil {
   379  		return nil, err
   380  	}
   381  	certPool := x509.NewCertPool()
   382  	certPool.AppendCertsFromPEM(cert)
   383  	return &http.Client{
   384  		Transport: &http.Transport{
   385  			TLSClientConfig: &tls.Config{
   386  				RootCAs: certPool,
   387  			},
   388  		},
   389  	}, nil
   390  }
   391  
   392  func httpRequest(ctx context.Context) (*http.Request, error) {
   393  	host, err := kubeletEndpointHost(ctx)
   394  	if err != nil {
   395  		return nil, err
   396  	}
   397  	port, err := kubeletEndpointPort(ctx)
   398  	if err != nil {
   399  		return nil, err
   400  	}
   401  	url := fmt.Sprintf(kubeletStatsSummaryURL, host, port)
   402  
   403  	accessToken, err := os.ReadFile(tokenFile)
   404  	if err != nil {
   405  		return nil, err
   406  	}
   407  
   408  	req, err := http.NewRequest("GET", url, nil)
   409  	if err != nil {
   410  		return nil, err
   411  	}
   412  	if len(accessToken) > 0 {
   413  		req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", accessToken))
   414  	}
   415  	return req, nil
   416  }
   417  
// kubeletEndpointHost returns the host to reach the kubelet on, read from the
// configuration key constant.KBEnvHostIP. The value is returned as is (it may
// be empty); ctx is unused and the error is always nil.
func kubeletEndpointHost(ctx context.Context) (string, error) {
	return viper.GetString(constant.KBEnvHostIP), nil
}
   421  
   422  func kubeletEndpointPort(ctx context.Context) (string, error) {
   423  	config, err := rest.InClusterConfig()
   424  	if err != nil {
   425  		return "", err
   426  	}
   427  	cliset, err := kubernetes.NewForConfig(config)
   428  	if err != nil {
   429  		return "", err
   430  	}
   431  	node, err := cliset.CoreV1().Nodes().Get(ctx, viper.GetString(constant.KBEnvNodeName), metav1.GetOptions{})
   432  	if err != nil {
   433  		return "", err
   434  	}
   435  	return strconv.Itoa(int(node.Status.DaemonEndpoints.KubeletEndpoint.Port)), nil
   436  }
   437  
   438  func normalizeVolumeWatermark(watermark *int, defaultVal int) int {
   439  	if watermark == nil || *watermark < 0 || *watermark > 100 {
   440  		return defaultVal
   441  	}
   442  	return *watermark
   443  }