github.com/intel/goresctrl@v0.5.0/pkg/blockio/blockio.go (about) 1 /* 2 Copyright 2019-2021 Intel Corporation 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 // Package blockio implements class-based cgroup blockio controller 18 // management for containers. 19 // 20 // Input: configuration of classes with blockio controller parameters 21 // (weights, throttling) for sets of block devices. 22 // 23 // Outputs: 24 // Option 1: Write blockio parameters of a class to a cgroup directory. 25 // Option 2: Return blockio parameters of a class in a OCI LinuxBlockIO 26 // 27 // structure, that can be passed to OCI-compliant container 28 // runtime. 29 // 30 // Notes: 31 // - Using Weight requires bfq or cfq I/O scheduler to be 32 // effective for the block devices where Weight is used. 33 // 34 // Configuration example: 35 // 36 // Classes: 37 // 38 // # Define a blockio class "LowPrioThrottled". 39 // # Containers in this class will be throttled and handled as 40 // # low priority in the I/O scheduler. 41 // 42 // LowPrioThrottled: 43 // 44 // # Weight without a Devices list specifies the default 45 // # I/O scheduler weight for all devices 46 // # that are not explicitly mentioned in following items. 47 // # This will be written to cgroups(.bfq).weight. 48 // # Weights range from 10 to 1000, the default is 100. 49 // 50 // - Weight: 80 51 // 52 // # Set all parameters for all /dev/sd* and /dev/vd* block 53 // # devices. 
54 // 55 // - Devices: 56 // - /dev/sd[a-z] 57 // - /dev/vd[a-z] 58 // ThrottleReadBps: 50M # max read bytes per second 59 // ThrottleWriteBps: 10M # max write bytes per second 60 // ThrottleReadIOPS: 10k # max read io operations per second 61 // ThrottleWriteIOPS: 5k # max write io operations per second 62 // Weight: 50 # I/O scheduler (cfq/bfq) weight for 63 // # these devices will be written to 64 // # cgroups(.bfq).weight_device 65 // 66 // # Set parameters particularly for SSD devices. 67 // # This configuration overrides above configurations for those 68 // # /dev/sd* and /dev/vd* devices whose disk id contains "SSD". 69 // 70 // - Devices: 71 // - /dev/disk/by-id/*SSD* 72 // ThrottleReadBps: 100M 73 // ThrottleWriteBps: 40M 74 // # Not mentioning Throttle*IOPS means no I/O operations 75 // # throttling on matching devices. 76 // Weight: 50 77 // 78 // # Define a blockio class "HighPrioFullSpeed". 79 // # There is no throttling on these containers, and 80 // # they will be prioritized by the I/O scheduler. 
//
//	HighPrioFullSpeed:
//	  - Weight: 400
//
// Usage example:
//
//	blockio.SetLogger(logrus.New())
//	if err := blockio.SetConfigFromFile("/etc/containers/blockio.yaml", false); err != nil {
//	    return err
//	}
//	// Output option 1: write directly to cgroup "/mytestgroup"
//	if err := blockio.SetCgroupClass("/mytestgroup", "LowPrioThrottled"); err != nil {
//	    return err
//	}
//	// Output option 2: OCI LinuxBlockIO of a blockio class
//	if lbio, err := blockio.OciLinuxBlockIO("LowPrioThrottled"); err != nil {
//	    return err
//	} else {
//	    fmt.Printf("OCI LinuxBlockIO for LowPrioThrottled:\n%+v\n", lbio)
//	}
package blockio

import (
	"errors"
	"fmt"
	stdlog "log"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"syscall"

	"golang.org/x/sys/unix"

	"k8s.io/apimachinery/pkg/api/resource"
	"sigs.k8s.io/yaml"

	"github.com/intel/goresctrl/pkg/cgroups"
	grclog "github.com/intel/goresctrl/pkg/log"
	goresctrlpath "github.com/intel/goresctrl/pkg/path"
)

const (
	// sysfsBlockDeviceIOSchedulerPaths expands (with glob) to block device scheduler files.
	// If modified, check how to parse device node from expanded paths:
	// getCurrentIOSchedulers derives the device name from the position
	// of the wildcard component in each expanded path.
	sysfsBlockDeviceIOSchedulerPaths = "/sys/block/*/queue/scheduler"
)

// tBlockDeviceInfo holds information on a block device to be configured.
// As users can specify block devices using wildcards ("/dev/disk/by-id/*SSD*")
// tBlockDeviceInfo.Origin is maintained for traceability: why this
// block device is included in configuration.
// tBlockDeviceInfo.DevNode contains resolved device node, like "/dev/sda".
type tBlockDeviceInfo struct {
	Major   int64  // major device number of the device node
	Minor   int64  // minor device number of the device node
	DevNode string // resolved (symlink-free) device node path
	Origin  string // wildcards/symlinks through which the device was found; may be empty
}

// Our logger instance.
142 var log grclog.Logger = grclog.NewLoggerWrapper(stdlog.New(os.Stderr, "[ blockio ] ", 0)) 143 144 // classBlockIO connects user-defined block I/O classes to 145 // corresponding cgroups blockio controller parameters. 146 var classBlockIO = map[string]cgroups.BlockIOParameters{} 147 148 // SetLogger sets the logger instance to be used by the package. 149 // Examples: 150 // 151 // // Log to standard logger: 152 // stdlog := log.New(os.Stderr, "blockio:", 0) 153 // blockio.SetLogger(goresctrllog.NewLoggerWrapper(stdlog)) 154 // // Log to logrus: 155 // blockio.SetLogger(logrus.New()) 156 func SetLogger(l grclog.Logger) { 157 log = l 158 } 159 160 // SetConfigFromFile reads and applies blockio configuration from the 161 // filesystem. 162 func SetConfigFromFile(filename string, force bool) error { 163 if data, err := os.ReadFile(filename); err == nil { 164 if err = SetConfigFromData(data, force); err != nil { 165 return fmt.Errorf("failed to set configuration from file %q: %s", filename, err) 166 } 167 return nil 168 } else { 169 return fmt.Errorf("failed to read config file %q: %v", filename, err) 170 } 171 } 172 173 // SetConfigFromData parses and applies configuration from data. 174 func SetConfigFromData(data []byte, force bool) error { 175 config := &Config{} 176 if err := yaml.Unmarshal(data, &config); err != nil { 177 return err 178 } 179 return SetConfig(config, force) 180 } 181 182 // SetConfig scans available block devices and applies new configuration. 183 func SetConfig(opt *Config, force bool) error { 184 if opt == nil { 185 // Setting nil configuration clears current configuration. 186 // SetConfigFromData([]byte(""), dontcare) arrives here. 
187 classBlockIO = map[string]cgroups.BlockIOParameters{} 188 return nil 189 } 190 191 currentIOSchedulers, ioSchedulerDetectionError := getCurrentIOSchedulers() 192 if ioSchedulerDetectionError != nil { 193 log.Warnf("configuration validation partly disabled due to I/O scheduler detection error %#v", ioSchedulerDetectionError.Error()) 194 } 195 196 classBlockIO = map[string]cgroups.BlockIOParameters{} 197 // Create cgroup blockio parameters for each blockio class 198 for class := range opt.Classes { 199 cgBlockIO, err := devicesParametersToCgBlockIO(opt.Classes[class], currentIOSchedulers) 200 if err != nil { 201 if force { 202 log.Warnf("ignoring: %v", err) 203 } else { 204 return err 205 } 206 } 207 classBlockIO[class] = cgBlockIO 208 } 209 return nil 210 } 211 212 // GetClasses returns block I/O class names 213 func GetClasses() []string { 214 classNames := make([]string, 0, len(classBlockIO)) 215 for name := range classBlockIO { 216 classNames = append(classNames, name) 217 } 218 sort.Strings(classNames) 219 return classNames 220 } 221 222 // SetCgroupClass sets cgroup blkio controller parameters to match 223 // blockio class. "group" is the cgroup directory of the container 224 // without mountpoint and controller (blkio) directories: 225 // "/kubepods/burstable/POD_ID/CONTAINER_ID". 226 func SetCgroupClass(group string, class string) error { 227 cgBlockIO, ok := classBlockIO[class] 228 if !ok { 229 return fmt.Errorf("no BlockIO parameters for class %#v", class) 230 } 231 err := cgroups.ResetBlkioParameters(group, cgBlockIO) 232 if err != nil { 233 return fmt.Errorf("assigning container in cgroup %q to class %#v failed: %w", group, class, err) 234 } 235 return nil 236 } 237 238 // getCurrentIOSchedulers returns currently active I/O scheduler used for each block device in the system. 
239 // Returns schedulers in a map: {"/dev/sda": "bfq"} 240 func getCurrentIOSchedulers() (map[string]string, error) { 241 var ios = map[string]string{} 242 glob := goresctrlpath.Path(sysfsBlockDeviceIOSchedulerPaths) 243 schedulerFiles, err := filepath.Glob(glob) 244 if err != nil { 245 return ios, fmt.Errorf("error in I/O scheduler wildcards %#v: %w", glob, err) 246 } 247 for _, schedulerFile := range schedulerFiles { 248 devName := strings.SplitN(schedulerFile, "/", 5)[3] 249 schedulerDataB, err := os.ReadFile(schedulerFile) 250 if err != nil { 251 // A block device may be disconnected. 252 log.Errorf("failed to read current I/O scheduler %#v: %v\n", schedulerFile, err) 253 continue 254 } 255 schedulerData := strings.Trim(string(schedulerDataB), "\n") 256 currentScheduler := "" 257 if strings.IndexByte(schedulerData, ' ') == -1 { 258 currentScheduler = schedulerData 259 } else { 260 openB := strings.Index(schedulerData, "[") 261 closeB := strings.Index(schedulerData, "]") 262 if -1 < openB && openB < closeB { 263 currentScheduler = schedulerData[openB+1 : closeB] 264 } 265 } 266 if currentScheduler == "" { 267 log.Errorf("could not parse current scheduler in %#v\n", schedulerFile) 268 continue 269 } 270 271 ios["/dev/"+devName] = currentScheduler 272 } 273 return ios, nil 274 } 275 276 // deviceParametersToCgBlockIO converts single blockio class parameters into cgroups blkio format. 
277 func devicesParametersToCgBlockIO(dps []DevicesParameters, currentIOSchedulers map[string]string) (cgroups.BlockIOParameters, error) { 278 errs := []error{} 279 blkio := cgroups.NewBlockIOParameters() 280 for _, dp := range dps { 281 var err error 282 var weight, throttleReadBps, throttleWriteBps, throttleReadIOPS, throttleWriteIOPS int64 283 weight, err = parseAndValidateQuantity("Weight", dp.Weight, -1, 10, 1000) 284 errs = append(errs, err) 285 throttleReadBps, err = parseAndValidateQuantity("ThrottleReadBps", dp.ThrottleReadBps, -1, 0, -1) 286 errs = append(errs, err) 287 throttleWriteBps, err = parseAndValidateQuantity("ThrottleWriteBps", dp.ThrottleWriteBps, -1, 0, -1) 288 errs = append(errs, err) 289 throttleReadIOPS, err = parseAndValidateQuantity("ThrottleReadIOPS", dp.ThrottleReadIOPS, -1, 0, -1) 290 errs = append(errs, err) 291 throttleWriteIOPS, err = parseAndValidateQuantity("ThrottleWriteIOPS", dp.ThrottleWriteIOPS, -1, 0, -1) 292 errs = append(errs, err) 293 if dp.Devices == nil { 294 if weight > -1 { 295 blkio.Weight = weight 296 } 297 if throttleReadBps > -1 || throttleWriteBps > -1 || throttleReadIOPS > -1 || throttleWriteIOPS > -1 { 298 errs = append(errs, fmt.Errorf("ignoring throttling (rbps=%#v wbps=%#v riops=%#v wiops=%#v): Devices not listed", 299 dp.ThrottleReadBps, dp.ThrottleWriteBps, dp.ThrottleReadIOPS, dp.ThrottleWriteIOPS)) 300 } 301 } else { 302 blockDevices, err := currentPlatform.configurableBlockDevices(dp.Devices) 303 if err != nil { 304 // Problems in matching block device wildcards and resolving symlinks 305 // are worth reporting, but must not block configuring blkio where possible. 
306 log.Warnf("%v", err) 307 } 308 if len(blockDevices) == 0 { 309 log.Warnf("no matches on any of Devices: %v, parameters ignored", dp.Devices) 310 } 311 for _, blockDeviceInfo := range blockDevices { 312 if weight != -1 { 313 if ios, found := currentIOSchedulers[blockDeviceInfo.DevNode]; found { 314 if ios != "bfq" && ios != "cfq" { 315 log.Warnf("weight has no effect on device %#v due to "+ 316 "incompatible I/O scheduler %#v (bfq or cfq required)", blockDeviceInfo.DevNode, ios) 317 } 318 } 319 blkio.WeightDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, weight) 320 } 321 if throttleReadBps != -1 { 322 blkio.ThrottleReadBpsDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, throttleReadBps) 323 } 324 if throttleWriteBps != -1 { 325 blkio.ThrottleWriteBpsDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, throttleWriteBps) 326 } 327 if throttleReadIOPS != -1 { 328 blkio.ThrottleReadIOPSDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, throttleReadIOPS) 329 } 330 if throttleWriteIOPS != -1 { 331 blkio.ThrottleWriteIOPSDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, throttleWriteIOPS) 332 } 333 } 334 } 335 } 336 return blkio, errors.Join(errs...) 337 } 338 339 // parseAndValidateQuantity parses quantities, like "64 M", and validates that they are in given range. 
340 func parseAndValidateQuantity(fieldName string, fieldContent string, 341 defaultValue int64, min int64, max int64) (int64, error) { 342 // Returns field content 343 if fieldContent == "" { 344 return defaultValue, nil 345 } 346 qty, err := resource.ParseQuantity(fieldContent) 347 if err != nil { 348 return defaultValue, fmt.Errorf("syntax error in %#v (%#v)", fieldName, fieldContent) 349 } 350 value := qty.Value() 351 if min != -1 && min > value { 352 return defaultValue, fmt.Errorf("value of %#v (%#v) smaller than minimum (%#v)", fieldName, value, min) 353 } 354 if max != -1 && value > max { 355 return defaultValue, fmt.Errorf("value of %#v (%#v) bigger than maximum (%#v)", fieldName, value, max) 356 } 357 return value, nil 358 } 359 360 // platformInterface includes functions that access the system. Enables mocking the system. 361 type platformInterface interface { 362 configurableBlockDevices(devWildcards []string) ([]tBlockDeviceInfo, error) 363 } 364 365 // defaultPlatform versions of platformInterface functions access the underlying system. 366 type defaultPlatform struct{} 367 368 // currentPlatform defines which platformInterface is used: defaultPlatform or a mock, for instance. 369 var currentPlatform platformInterface = defaultPlatform{} 370 371 // configurableBlockDevices finds major:minor numbers for device filenames. Wildcards are allowed in filenames. 372 func (dpm defaultPlatform) configurableBlockDevices(devWildcards []string) ([]tBlockDeviceInfo, error) { 373 // Return map {devNode: tBlockDeviceInfo} 374 // Example: {"/dev/sda": {Major:8, Minor:0, Origin:"from symlink /dev/disk/by-id/ata-VendorXSSD from wildcard /dev/disk/by-id/*SSD*"}} 375 errs := []error{} 376 blockDevices := []tBlockDeviceInfo{} 377 var origin string 378 379 // 1. 
Expand wildcards to device filenames (may be symlinks) 380 // Example: devMatches["/dev/disk/by-id/ata-VendorSSD"] == "from wildcard \"dev/disk/by-id/*SSD*\"" 381 devMatches := map[string]string{} // {devNodeOrSymlink: origin} 382 for _, devWildcard := range devWildcards { 383 devWildcardMatches, err := filepath.Glob(devWildcard) 384 if err != nil { 385 errs = append(errs, fmt.Errorf("bad device wildcard %#v: %w", devWildcard, err)) 386 continue 387 } 388 if len(devWildcardMatches) == 0 { 389 errs = append(errs, fmt.Errorf("device wildcard %#v does not match any device nodes", devWildcard)) 390 continue 391 } 392 for _, devMatch := range devWildcardMatches { 393 if devMatch != devWildcard { 394 origin = fmt.Sprintf("from wildcard %#v", devWildcard) 395 } else { 396 origin = "" 397 } 398 devMatches[devMatch] = strings.TrimSpace(fmt.Sprintf("%v %v", devMatches[devMatch], origin)) 399 } 400 } 401 402 // 2. Find out real device nodes behind symlinks 403 // Example: devRealPaths["/dev/sda"] == "from symlink \"/dev/disk/by-id/ata-VendorSSD\"" 404 devRealpaths := map[string]string{} // {devNode: origin} 405 for devMatch, devOrigin := range devMatches { 406 realDevNode, err := filepath.EvalSymlinks(devMatch) 407 if err != nil { 408 errs = append(errs, fmt.Errorf("cannot filepath.EvalSymlinks(%#v): %w", devMatch, err)) 409 continue 410 } 411 if realDevNode != devMatch { 412 origin = fmt.Sprintf("from symlink %#v %v", devMatch, devOrigin) 413 } else { 414 origin = devOrigin 415 } 416 devRealpaths[realDevNode] = strings.TrimSpace(fmt.Sprintf("%v %v", devRealpaths[realDevNode], origin)) 417 } 418 419 // 3. 
Filter out everything but block devices that are not partitions 420 // Example: blockDevices[0] == {Major: 8, Minor: 0, DevNode: "/dev/sda", Origin: "..."} 421 for devRealpath, devOrigin := range devRealpaths { 422 origin := "" 423 if devOrigin != "" { 424 origin = fmt.Sprintf(" (origin: %s)", devOrigin) 425 } 426 fileInfo, err := os.Stat(devRealpath) 427 if err != nil { 428 errs = append(errs, fmt.Errorf("cannot os.Stat(%#v): %w%s", devRealpath, err, origin)) 429 continue 430 } 431 fileMode := fileInfo.Mode() 432 if fileMode&os.ModeDevice == 0 { 433 errs = append(errs, fmt.Errorf("file %#v is not a device%s", devRealpath, origin)) 434 continue 435 } 436 if fileMode&os.ModeCharDevice != 0 { 437 errs = append(errs, fmt.Errorf("file %#v is a character device%s", devRealpath, origin)) 438 continue 439 } 440 sys, ok := fileInfo.Sys().(*syscall.Stat_t) 441 major := unix.Major(uint64(sys.Rdev)) 442 minor := unix.Minor(uint64(sys.Rdev)) 443 if !ok { 444 errs = append(errs, fmt.Errorf("cannot get syscall stat_t from %#v: %w%s", devRealpath, err, origin)) 445 continue 446 } 447 blockDevices = append(blockDevices, tBlockDeviceInfo{ 448 Major: int64(major), 449 Minor: int64(minor), 450 DevNode: devRealpath, 451 Origin: devOrigin, 452 }) 453 } 454 return blockDevices, errors.Join(errs...) 455 }