github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/pkg/lorry/operations/volume/protect.go (about) 1 /* 2 Copyright (C) 2022-2023 ApeCloud Co., Ltd 3 4 This file is part of KubeBlocks project 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU Affero General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU Affero General Public License for more details. 15 16 You should have received a copy of the GNU Affero General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 package volume 21 22 import ( 23 "context" 24 "crypto/tls" 25 "crypto/x509" 26 "encoding/json" 27 "fmt" 28 "io" 29 "net/http" 30 "os" 31 "strconv" 32 33 "github.com/go-logr/logr" 34 "github.com/pkg/errors" 35 "github.com/spf13/viper" 36 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 37 "k8s.io/client-go/kubernetes" 38 "k8s.io/client-go/rest" 39 statsv1alpha1 "k8s.io/kubelet/pkg/apis/stats/v1alpha1" 40 ctrl "sigs.k8s.io/controller-runtime" 41 42 appsv1alpha1 "github.com/1aal/kubeblocks/apis/apps/v1alpha1" 43 "github.com/1aal/kubeblocks/pkg/constant" 44 "github.com/1aal/kubeblocks/pkg/lorry/engines" 45 "github.com/1aal/kubeblocks/pkg/lorry/engines/register" 46 "github.com/1aal/kubeblocks/pkg/lorry/operations" 47 "github.com/1aal/kubeblocks/pkg/lorry/util" 48 ) 49 50 const ( 51 kubeletStatsSummaryURL = "https://%s:%s/stats/summary" 52 53 certFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" 54 tokenFile = "/var/run/secrets/kubernetes.io/serviceaccount/token" 55 56 reasonLock = "HighVolumeWatermark" 57 reasonUnlock = "LowVolumeWatermark" // TODO 58 ) 59 60 type volumeStatsRequester interface { 61 init(ctx context.Context) error 62 request(ctx context.Context) ([]byte, error) 63 } 64 65 type volumeExt struct { 66 Name string 67 HighWatermark int 68 Stats statsv1alpha1.VolumeStats 69 } 70 71 type Protection struct { 72 operations.Base 73 dbManager engines.DBManager 74 Requester volumeStatsRequester 75 Pod string 76 HighWatermark int 77 Volumes map[string]volumeExt 78 Readonly bool 79 SendEvent bool // to disable event for testing 80 Logger logr.Logger 81 } 82 83 func (p *Protection) Init(ctx context.Context) error { 84 p.Logger = ctrl.Log.WithName("Volume-Protection") 85 if p.Requester == nil { 86 p.Requester = &httpsVolumeStatsRequester{ 87 logger: p.Logger, 88 } 89 } 90 p.SendEvent = true 91 92 dbManager, err := register.GetDBManager() 93 if err != nil { 94 return errors.Wrap(err, "get manager failed") 95 } 96 p.dbManager = dbManager 97 98 if err := p.Requester.init(ctx); err != nil { 99 return err 100 } 101 102 p.Pod = viper.GetString(constant.KBEnvPodName) 103 if err := p.initVolumes(); err != nil { 104 p.Logger.Error(err, "init volumes to monitor error") 105 } 106 p.Logger.Info("succeed to init volume protection", "pod", p.Pod, "spec", p.buildVolumesMsg()) 107 return nil 108 } 109 110 func (p *Protection) PreCheck(ctx context.Context, req *operations.OpsRequest) error { 111 return nil 112 } 113 114 func (p *Protection) Do(ctx context.Context, req *operations.OpsRequest) (*operations.OpsResponse, error) { 115 if p.disabled() { 116 p.Logger.Info("the volume protection operation is disabled") 117 return nil, nil 118 } 119 120 summary, err := p.Requester.request(ctx) 121 if err != nil { 122 p.Logger.Error(err, "request stats summary from kubelet error") 123 return nil, err 124 } 125 126 if err = p.updateVolumeStats(summary); err != nil { 127 return nil, err 128 } 129 130 volumeUsages, err := p.checkUsage(ctx) 131 resp := &operations.OpsResponse{ 132 Data: map[string]any{}, 133 } 134 if err == nil { 135 resp.Data["protect"] = volumeUsages 136 } 137 return resp, err 138 } 139 140 func (p *Protection) initVolumes() error { 141 spec := &appsv1alpha1.VolumeProtectionSpec{} 142 raw := viper.GetString(constant.KBEnvVolumeProtectionSpec) 143 if err := json.Unmarshal([]byte(raw), spec); err != nil { 144 p.Logger.Error(err, "unmarshal volume protection spec error", "raw spec", raw) 145 return err 146 } 147 148 p.HighWatermark = normalizeVolumeWatermark(&spec.HighWatermark, 0) 149 150 if p.Volumes == nil { 151 p.Volumes = make(map[string]volumeExt) 152 } 153 for _, v := range spec.Volumes { 154 p.Volumes[v.Name] = volumeExt{ 155 Name: v.Name, 156 HighWatermark: normalizeVolumeWatermark(v.HighWatermark, p.HighWatermark), 157 Stats: statsv1alpha1.VolumeStats{ 158 Name: v.Name, 159 }, 160 } 161 } 162 return nil 163 } 164 165 func (p *Protection) disabled() bool { 166 // TODO: check the role and skip secondary instances. 167 if len(p.Pod) == 0 || len(p.Volumes) == 0 { 168 return true 169 } 170 for _, v := range p.Volumes { 171 // take (0, 100] as enabled 172 if v.HighWatermark > 0 && v.HighWatermark <= 100 { 173 return false 174 } 175 } 176 return true 177 } 178 179 func (p *Protection) updateVolumeStats(payload []byte) error { 180 summary := &statsv1alpha1.Summary{} 181 if err := json.Unmarshal(payload, summary); err != nil { 182 p.Logger.Error(err, "stats summary obtained from kubelet error") 183 return err 184 } 185 for _, pod := range summary.Pods { 186 if pod.PodRef.Name == p.Pod { 187 for _, stats := range pod.VolumeStats { 188 if _, ok := p.Volumes[stats.Name]; !ok { 189 continue 190 } 191 v := p.Volumes[stats.Name] 192 v.Stats = stats 193 p.Volumes[stats.Name] = v 194 } 195 break 196 } 197 } 198 return nil 199 } 200 201 func (p *Protection) checkUsage(ctx context.Context) (map[string]any, error) { 202 lower := make([]string, 0) 203 higher := make([]string, 0) 204 for name, v := range p.Volumes { 205 ret := p.checkVolumeWatermark(v) 206 if ret == 0 { 207 lower = append(lower, name) 208 } else { 209 higher = append(higher, name) 210 } 211 } 212 213 volumeUsages := p.buildVolumesMsg() 214 readonly := p.Readonly 215 // the instance is running normally and there have volume(s) over the space usage threshold. 216 if !readonly && len(higher) > 0 { 217 if err := p.highWatermark(ctx, volumeUsages); err != nil { 218 return volumeUsages, err 219 } 220 } 221 // the instance is protected in RO mode, and all volumes' space usage are under the threshold. 222 if readonly && len(lower) == len(p.Volumes) { 223 if err := p.lowWatermark(ctx, volumeUsages); err != nil { 224 return volumeUsages, err 225 } 226 } 227 return volumeUsages, nil 228 } 229 230 // checkVolumeWatermark checks whether the volume's space usage is over the threshold. 231 // 232 // returns 0 if the volume will not be taken in account or its space usage is under the threshold 233 // returns non-zero if the volume space usage is over the threshold 234 func (p *Protection) checkVolumeWatermark(v volumeExt) int { 235 if v.HighWatermark == 0 { // disabled 236 return 0 237 } 238 if v.Stats.CapacityBytes == nil || v.Stats.UsedBytes == nil { 239 return 0 240 } 241 thresholdBytes := *v.Stats.CapacityBytes / 100 * uint64(v.HighWatermark) 242 if *v.Stats.UsedBytes < thresholdBytes { 243 return 0 244 } 245 return 1 246 } 247 248 func (p *Protection) highWatermark(ctx context.Context, volumeUsages map[string]any) error { 249 if p.Readonly { // double check 250 return nil 251 } 252 if err := p.lockInstance(ctx); err != nil { 253 p.Logger.Error(err, "set instance to read-only error", "volumes", volumeUsages) 254 return err 255 } 256 257 p.Logger.Info("set instance to read-only OK", "msg", volumeUsages) 258 p.Readonly = true 259 260 if err := p.sendEvent(ctx, reasonLock, volumeUsages); err != nil { 261 p.Logger.Error(err, "send volume protection (lock) event error", "volumes", volumeUsages) 262 return err 263 } 264 return nil 265 } 266 267 func (p *Protection) lowWatermark(ctx context.Context, volumeUsages map[string]any) error { 268 if !p.Readonly { // double check 269 return nil 270 } 271 if err := p.unlockInstance(ctx); err != nil { 272 p.Logger.Error(err, "reset instance to read-write error", "volumes", volumeUsages) 273 return err 274 } 275 276 p.Logger.Info("reset instance to read-write OK", "msg", volumeUsages) 277 p.Readonly = false 278 279 if err := p.sendEvent(ctx, reasonUnlock, volumeUsages); err != nil { 280 p.Logger.Error(err, "send volume protection (unlock) event error", "volumes", volumeUsages) 281 return err 282 } 283 return nil 284 } 285 286 func (p *Protection) lockInstance(ctx context.Context) error { 287 return p.dbManager.Lock(ctx, "disk full") 288 } 289 290 func (p *Protection) unlockInstance(ctx context.Context) error { 291 return p.dbManager.Unlock(ctx) 292 } 293 294 func (p *Protection) buildVolumesMsg() map[string]any { 295 volumes := make([]map[string]string, 0) 296 for _, v := range p.Volumes { 297 usage := make(map[string]string) 298 if v.HighWatermark != p.HighWatermark { 299 usage["highWatermark"] = fmt.Sprintf("%d", v.HighWatermark) 300 } 301 stats := v.Stats 302 if stats.UsedBytes == nil || stats.CapacityBytes == nil { 303 usage[v.Name] = "<nil>" 304 } else { 305 usage[v.Name] = fmt.Sprintf("%d%%", int(*stats.UsedBytes*100 / *stats.CapacityBytes)) 306 } 307 volumes = append(volumes, usage) 308 } 309 usages := map[string]any{ 310 "highWatermark": fmt.Sprintf("%d", p.HighWatermark), 311 "volumes": volumes, 312 } 313 return usages 314 } 315 316 func (p *Protection) sendEvent(ctx context.Context, reason string, volumeUsages map[string]any) error { 317 if p.SendEvent { 318 event, err := util.CreateEvent(reason, volumeUsages) 319 if err != nil { 320 return errors.Wrap(err, "create volume protection event failed") 321 } 322 return util.SendEvent(ctx, event) 323 } 324 return nil 325 } 326 327 type httpsVolumeStatsRequester struct { 328 logger logr.Logger 329 cli *http.Client 330 req *http.Request 331 } 332 333 var _ volumeStatsRequester = &httpsVolumeStatsRequester{} 334 335 func (r *httpsVolumeStatsRequester) init(ctx context.Context) error { 336 var err error 337 if r.cli, err = httpClient(); err != nil { 338 r.logger.Error(err, "build HTTP client error at setup") 339 return err 340 } 341 // if r.req, err = httpRequest(ctx); err != nil { 342 // r.logger.Error(err, "build HTTP request error at setup, will try it later") 343 // } 344 return nil 345 } 346 347 func (r *httpsVolumeStatsRequester) request(ctx context.Context) ([]byte, error) { 348 if r.cli == nil { 349 return nil, fmt.Errorf("HTTP client for kubelet is unavailable") 350 } 351 if r.req == nil { 352 // try to build http request again 353 var err error 354 r.req, err = httpRequest(ctx) 355 if err != nil { 356 r.logger.Error(err, "build HTTP request to query kubelet error") 357 return nil, err 358 } 359 } 360 361 req := r.req.WithContext(ctx) 362 rsp, err := r.cli.Do(req) 363 if err != nil { 364 r.logger.Error(err, "issue request to kubelet error") 365 return nil, err 366 } 367 if rsp.StatusCode != 200 { 368 r.logger.Error(nil, fmt.Sprintf("HTTP response from kubelet error: %s", rsp.Status)) 369 return nil, fmt.Errorf(rsp.Status) 370 } 371 372 defer rsp.Body.Close() 373 return io.ReadAll(rsp.Body) 374 } 375 376 func httpClient() (*http.Client, error) { 377 cert, err := os.ReadFile(certFile) 378 if err != nil { 379 return nil, err 380 } 381 certPool := x509.NewCertPool() 382 certPool.AppendCertsFromPEM(cert) 383 return &http.Client{ 384 Transport: &http.Transport{ 385 TLSClientConfig: &tls.Config{ 386 RootCAs: certPool, 387 }, 388 }, 389 }, nil 390 } 391 392 func httpRequest(ctx context.Context) (*http.Request, error) { 393 host, err := kubeletEndpointHost(ctx) 394 if err != nil { 395 return nil, err 396 } 397 port, err := kubeletEndpointPort(ctx) 398 if err != nil { 399 return nil, err 400 } 401 url := fmt.Sprintf(kubeletStatsSummaryURL, host, port) 402 403 accessToken, err := os.ReadFile(tokenFile) 404 if err != nil { 405 return nil, err 406 } 407 408 req, err := http.NewRequest("GET", url, nil) 409 if err != nil { 410 return nil, err 411 } 412 if len(accessToken) > 0 { 413 req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", accessToken)) 414 } 415 return req, nil 416 } 417 418 func kubeletEndpointHost(ctx context.Context) (string, error) { 419 return viper.GetString(constant.KBEnvHostIP), nil 420 } 421 422 func kubeletEndpointPort(ctx context.Context) (string, error) { 423 config, err := rest.InClusterConfig() 424 if err != nil { 425 return "", err 426 } 427 cliset, err := kubernetes.NewForConfig(config) 428 if err != nil { 429 return "", err 430 } 431 node, err := cliset.CoreV1().Nodes().Get(ctx, viper.GetString(constant.KBEnvNodeName), metav1.GetOptions{}) 432 if err != nil { 433 return "", err 434 } 435 return strconv.Itoa(int(node.Status.DaemonEndpoints.KubeletEndpoint.Port)), nil 436 } 437 438 func normalizeVolumeWatermark(watermark *int, defaultVal int) int { 439 if watermark == nil || *watermark < 0 || *watermark > 100 { 440 return defaultVal 441 } 442 return *watermark 443 }