github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ios/iostat.go (about) 1 // Package ios is a collection of interfaces to the local storage subsystem; 2 // the package includes OS-dependent implementations for those interfaces. 3 /* 4 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 5 */ 6 package ios 7 8 import ( 9 "fmt" 10 "path/filepath" 11 "strings" 12 "sync" 13 ratomic "sync/atomic" 14 "time" 15 16 "github.com/NVIDIA/aistore/cmn" 17 "github.com/NVIDIA/aistore/cmn/atomic" 18 "github.com/NVIDIA/aistore/cmn/cos" 19 "github.com/NVIDIA/aistore/cmn/debug" 20 "github.com/NVIDIA/aistore/cmn/mono" 21 "github.com/NVIDIA/aistore/cmn/nlog" 22 ) 23 24 const statsdir = "/sys/class/block" 25 26 // public 27 type ( 28 IOS interface { 29 GetAllMpathUtils() *MpathUtil 30 GetMpathUtil(mpath string) int64 31 AddMpath(mpath, fs string, label Label, config *cmn.Config) (FsDisks, error) 32 RemoveMpath(mpath string, testingEnv bool) 33 FillDiskStats(m AllDiskStats) 34 } 35 FsDisks map[string]int64 // disk name => sector size 36 MpathUtil sync.Map 37 ) 38 39 // internal 40 type ( 41 cache struct { 42 ioms map[string]int64 // IO millis 43 util map[string]int64 // utilization 44 rms map[string]int64 // read millis 45 rbytes map[string]int64 // read bytes 46 reads map[string]int64 // completed read requests 47 rbps map[string]int64 // read B/s 48 ravg map[string]int64 // average read size 49 wms map[string]int64 // write millis 50 wbytes map[string]int64 // written bytes 51 writes map[string]int64 // completed write requests 52 wbps map[string]int64 // write B/s 53 wavg map[string]int64 // average write size 54 55 mpathUtil map[string]int64 // Average utilization of the disks, range [0, 100]. 56 mpathUtilRO MpathUtil // Read-only copy of `mpathUtil`. 57 58 expireTime int64 59 timestamp int64 60 } 61 ios struct { 62 mpath2disks map[string]FsDisks 63 disk2mpath cos.StrKVs 64 disk2sysfn cos.StrKVs 65 blockStats allBlockStats 66 lsblk ratomic.Pointer[LsBlk] 67 cache ratomic.Pointer[cache] 68 cacheHst [16]*cache 69 cacheIdx int 70 mu sync.Mutex 71 busy atomic.Bool 72 } 73 ) 74 75 // interface guard 76 var _ IOS = (*ios)(nil) 77 78 /////////////// 79 // MpathUtil // 80 /////////////// 81 82 func (x *MpathUtil) Get(mpath string) int64 { 83 if v, ok := (*sync.Map)(x).Load(mpath); ok { 84 util := v.(int64) 85 return util 86 } 87 return 100 // assume the worst 88 } 89 90 func (x *MpathUtil) Set(mpath string, util int64) { 91 (*sync.Map)(x).Store(mpath, util) 92 } 93 94 ///////// 95 // ios // 96 ///////// 97 98 func New(num int) IOS { 99 ios := &ios{ 100 mpath2disks: make(map[string]FsDisks, num), 101 disk2mpath: make(cos.StrKVs, num), 102 disk2sysfn: make(cos.StrKVs, num), 103 blockStats: make(allBlockStats, num), 104 } 105 for i := range len(ios.cacheHst) { 106 ios.cacheHst[i] = newCache(num) 107 } 108 ios._put(ios.cacheHst[0]) 109 ios.cacheIdx = 0 110 ios.busy.Store(false) // redundant on purpose 111 112 // once (cleared via Clblk) 113 if res := lsblk("new-ios", true); res != nil { 114 ios.lsblk.Store(res) 115 } 116 117 return ios 118 } 119 120 func Clblk(i IOS) { 121 ios := i.(*ios) 122 ios.lsblk.Store(nil) 123 } 124 125 func newCache(num int) *cache { 126 return &cache{ 127 ioms: make(map[string]int64, num), 128 util: make(map[string]int64, num), 129 rms: make(map[string]int64, num), 130 rbytes: make(map[string]int64, num), 131 reads: make(map[string]int64, num), 132 rbps: make(map[string]int64, num), 133 ravg: make(map[string]int64, num), 134 wms: make(map[string]int64, num), 135 wbytes: make(map[string]int64, num), 136 writes: make(map[string]int64, num), 137 wbps: make(map[string]int64, num), 138 wavg: make(map[string]int64, num), 139 mpathUtil: make(map[string]int64, num), 140 } 141 } 142 143 func (ios *ios) _get() *cache { return ios.cache.Load() } 144 func (ios *ios) _put(cache *cache) { ios.cache.Store(cache) } 145 146 // 147 // add mountpath 148 // 149 150 func (ios *ios) AddMpath(mpath, fs string, label Label, config *cmn.Config) (fsdisks FsDisks, err error) { 151 var ( 152 warn string 153 testingEnv = config.TestingEnv() 154 fspaths = config.LocalConfig.FSP.Paths 155 ) 156 if pres := ios.lsblk.Load(); pres != nil { 157 res := *pres 158 fsdisks, err = fs2disks(&res, fs, label, len(fspaths), testingEnv) 159 } else { 160 res := lsblk(fs, testingEnv) 161 if res != nil { 162 fsdisks, err = fs2disks(res, fs, label, len(fspaths), testingEnv) 163 } 164 } 165 if len(fsdisks) == 0 || err != nil { 166 return 167 } 168 ios.mu.Lock() 169 warn, err = ios._add(mpath, label, fsdisks, fspaths, testingEnv) 170 ios.mu.Unlock() 171 172 if err != nil { 173 nlog.Errorln(err) 174 } 175 if warn != "" { 176 nlog.Infoln(warn) 177 } 178 return 179 } 180 181 func (ios *ios) _add(mpath string, label Label, fsdisks FsDisks, fspaths cos.StrKVs, testingEnv bool) (warn string, _ error) { 182 if dd, ok := ios.mpath2disks[mpath]; ok { 183 return "", fmt.Errorf("duplicate mountpath %s (disks %s, %s)", mpath, dd._str(), fsdisks._str()) 184 } 185 186 ios.mpath2disks[mpath] = fsdisks 187 for disk := range fsdisks { 188 if mp, ok := ios.disk2mpath[disk]; ok && !testingEnv && !cmn.AllowSharedDisksAndNoDisks { 189 if label.IsNil() { 190 return "", fmt.Errorf("disk %s is shared between mountpaths %s and %s", disk, mpath, mp) 191 } 192 var otherLabel Label 193 if o, ok := fspaths[mp]; ok { 194 otherLabel = Label(o) 195 } 196 warn = fmt.Sprintf("Warning: disk %s is shared between %s%s and %s%s", 197 disk, mpath, label.ToLog(), mp, otherLabel.ToLog()) 198 } 199 ios.disk2mpath[disk] = mpath 200 ios.blockStats[disk] = &blockStats{} 201 } 202 203 for disk, mountpath := range ios.disk2mpath { 204 if _, ok := ios.disk2sysfn[disk]; ok { 205 continue 206 } 207 path := filepath.Join(statsdir, disk, "stat") 208 ios.disk2sysfn[disk] = path 209 210 // multipath NVMe: alternative block-stats location 211 cdisk, err := icn(disk, statsdir) 212 if err != nil { 213 if label.IsNil() { 214 return "", err 215 } 216 if warn != "" { 217 warn += "\n" 218 } 219 warn += fmt.Sprint("Warning:", err) 220 } 221 if cdisk != "" { 222 cpath := filepath.Join(statsdir, cdisk, "stat") 223 if icnPath(ios.disk2sysfn[disk], cpath, mountpath) { 224 if warn != "" { 225 warn += "\n" 226 } 227 warn += fmt.Sprint("Info: alternative block-stats path:", disk, path, "=>", cdisk, cpath) 228 ios.disk2sysfn[disk] = cpath 229 } 230 } 231 } 232 if len(ios.disk2sysfn) != len(ios.disk2mpath) { 233 for disk := range ios.disk2sysfn { 234 if _, ok := ios.disk2mpath[disk]; !ok { 235 delete(ios.disk2sysfn, disk) 236 } 237 } 238 } 239 return warn, nil 240 } 241 242 // 243 // remove mountpath 244 // 245 246 func (ios *ios) RemoveMpath(mpath string, testingEnv bool) { 247 ios.mu.Lock() 248 ios._del(mpath, testingEnv) 249 ios.mu.Unlock() 250 } 251 252 func (ios *ios) _del(mpath string, testingEnv bool) { 253 oldDisks, ok := ios.mpath2disks[mpath] 254 if !ok { 255 nlog.Warningf("mountpath %s already removed", mpath) 256 return 257 } 258 for disk := range oldDisks { 259 if testingEnv { 260 ios._delDiskTesting(mpath, disk) 261 } else { 262 ios._delDisk(mpath, disk) 263 } 264 } 265 delete(ios.mpath2disks, mpath) 266 } 267 268 // TestingEnv ("disk sharing"): 269 // If another mountpath containing the same disk is found, the disk2mpath map 270 // gets updated. Otherwise, go ahead and remove the "disk". 271 func (ios *ios) _delDiskTesting(mpath, disk string) { 272 if _, ok := ios.disk2mpath[disk]; !ok { 273 return 274 } 275 for path, disks := range ios.mpath2disks { 276 if path == mpath { 277 continue 278 } 279 for dsk := range disks { 280 if dsk == disk { 281 ios.disk2mpath[disk] = path // found - keeping 282 return 283 } 284 } 285 } 286 delete(ios.mpath2disks, disk) 287 } 288 289 func (ios *ios) _delDisk(mpath, disk string) { 290 mp, ok := ios.disk2mpath[disk] 291 if !ok { 292 return 293 } 294 debug.Assertf(mp == mpath, "(mpath %s => disk %s => mpath %s) violation", mp, disk, mpath) 295 delete(ios.disk2mpath, disk) 296 delete(ios.blockStats, disk) 297 } 298 299 // 300 // get utilization and stats; refresh stats periodically 301 // 302 303 func (ios *ios) GetAllMpathUtils() *MpathUtil { 304 cache := ios.refresh() 305 return &cache.mpathUtilRO 306 } 307 308 func (ios *ios) GetMpathUtil(mpath string) int64 { 309 return ios.GetAllMpathUtils().Get(mpath) 310 } 311 312 func (ios *ios) FillDiskStats(m AllDiskStats) { 313 cache := ios.refresh() 314 for disk := range cache.ioms { 315 m[disk] = DiskStats{ 316 RBps: cache.rbps[disk], 317 Ravg: cache.ravg[disk], 318 WBps: cache.wbps[disk], 319 Wavg: cache.wavg[disk], 320 Util: cache.util[disk], 321 } 322 } 323 for disk := range m { 324 if _, ok := cache.ioms[disk]; !ok { 325 delete(m, disk) 326 } 327 } 328 } 329 330 // update iostat cache 331 func (ios *ios) refresh() *cache { 332 var ( 333 nowTs = mono.NanoTime() 334 statsCache = ios._get() 335 ) 336 if statsCache.expireTime > nowTs { 337 return statsCache 338 } 339 if !ios.busy.CAS(false, true) { 340 return statsCache // never want callers to wait 341 } 342 343 ncache := ios.doRefresh(nowTs) 344 ios.busy.Store(false) 345 return ncache 346 } 347 348 func (ios *ios) doRefresh(nowTs int64) *cache { 349 config := cmn.GCO.Get() 350 ios.mu.Lock() 351 ncache, maxUtil, missingInfo := ios._ref(config) 352 ios.mu.Unlock() 353 354 var expireTime int64 355 if missingInfo { 356 expireTime = int64(config.Disk.IostatTimeShort) 357 } else { // use the maximum utilization to determine expiration time 358 var ( 359 lowm = max(config.Disk.DiskUtilLowWM, 1) 360 hiwm = min(config.Disk.DiskUtilHighWM, 100) 361 delta = int64(config.Disk.IostatTimeLong - config.Disk.IostatTimeShort) 362 utilRatio = cos.RatioPct(hiwm, lowm, maxUtil) 363 ) 364 utilRatio = (utilRatio + 5) / 10 * 10 // round to nearest tenth 365 expireTime = int64(config.Disk.IostatTimeShort) + delta*(100-utilRatio)/100 366 } 367 ncache.expireTime = nowTs + expireTime 368 ios._put(ncache) 369 370 return ncache 371 } 372 373 func (ios *ios) _ref(config *cmn.Config) (ncache *cache, maxUtil int64, missingInfo bool) { 374 ios.cacheIdx++ 375 ios.cacheIdx %= len(ios.cacheHst) 376 ncache = ios.cacheHst[ios.cacheIdx] // from a pool 377 378 var ( 379 statsCache = ios._get() 380 nowTs = mono.NanoTime() 381 elapsed = nowTs - statsCache.timestamp 382 elapsedSeconds = cos.DivRound(elapsed, int64(time.Second)) 383 elapsedMillis = cos.DivRound(elapsed, int64(time.Millisecond)) 384 ) 385 386 ncache.timestamp = nowTs 387 for mpath := range ios.mpath2disks { 388 ncache.mpathUtil[mpath] = 0 389 } 390 for disk := range ncache.ioms { 391 if _, ok := ios.disk2mpath[disk]; !ok { 392 ncache = newCache(len(statsCache.ioms)) 393 ios.cacheHst[ios.cacheIdx] = ncache 394 } 395 } 396 397 readStats(ios.disk2mpath, ios.disk2sysfn, ios.blockStats) 398 for disk, mpath := range ios.disk2mpath { 399 ncache.rbps[disk] = 0 400 ncache.wbps[disk] = 0 401 ncache.util[disk] = 0 402 ncache.ravg[disk] = 0 403 ncache.wavg[disk] = 0 404 ds := ios.blockStats[disk] 405 ncache.ioms[disk] = ds.IOMs() 406 ncache.rms[disk] = ds.ReadMs() 407 ncache.rbytes[disk] = ds.ReadBytes() 408 ncache.reads[disk] = ds.Reads() 409 ncache.wms[disk] = ds.WriteMs() 410 ncache.wbytes[disk] = ds.WriteBytes() 411 ncache.writes[disk] = ds.Writes() 412 413 if _, ok := statsCache.ioms[disk]; !ok { 414 missingInfo = true 415 continue 416 } 417 // deltas 418 var ( 419 ioMs = ncache.ioms[disk] - statsCache.ioms[disk] 420 reads = ncache.reads[disk] - statsCache.reads[disk] 421 writes = ncache.writes[disk] - statsCache.writes[disk] 422 readBytes = ncache.rbytes[disk] - statsCache.rbytes[disk] 423 writeBytes = ncache.wbytes[disk] - statsCache.wbytes[disk] 424 ) 425 if elapsedMillis > 0 { 426 // On macOS computation of `diskUtil` may sometimes exceed 100% 427 // which may cause some further inaccuracies. 428 if ioMs >= elapsedMillis { 429 ncache.util[disk] = 100 430 } else { 431 ncache.util[disk] = cos.DivRound(ioMs*100, elapsedMillis) 432 } 433 } else { 434 ncache.util[disk] = statsCache.util[disk] 435 } 436 if !config.TestingEnv() { 437 ncache.mpathUtil[mpath] += ncache.util[disk] 438 } 439 if elapsedSeconds > 0 { 440 ncache.rbps[disk] = cos.DivRound(readBytes, elapsedSeconds) 441 ncache.wbps[disk] = cos.DivRound(writeBytes, elapsedSeconds) 442 } else { 443 ncache.rbps[disk] = statsCache.rbps[disk] 444 ncache.wbps[disk] = statsCache.wbps[disk] 445 } 446 if reads > 0 { 447 ncache.ravg[disk] = cos.DivRound(readBytes, reads) 448 } else if elapsedSeconds == 0 { 449 ncache.ravg[disk] = statsCache.ravg[disk] 450 } else { 451 ncache.ravg[disk] = 0 452 } 453 if writes > 0 { 454 ncache.wavg[disk] = cos.DivRound(writeBytes, writes) 455 } else if elapsedSeconds == 0 { 456 ncache.wavg[disk] = statsCache.wavg[disk] 457 } else { 458 ncache.wavg[disk] = 0 459 } 460 } 461 462 // average and max 463 if config.TestingEnv() { 464 for mpath, disks := range ios.mpath2disks { 465 debug.Assert(len(disks) <= 1) // testing env: one (shared) disk per mpath 466 var u int64 467 for d := range disks { 468 u = ncache.util[d] 469 ncache.mpathUtil[mpath] = u 470 break 471 } 472 ncache.mpathUtilRO.Set(mpath, u) 473 maxUtil = max(maxUtil, u) 474 } 475 return 476 } 477 478 for mpath, disks := range ios.mpath2disks { 479 num := int64(len(disks)) 480 if num == 0 { 481 debug.Assert(ncache.mpathUtil[mpath] == 0) 482 continue 483 } 484 u := cos.DivRound(ncache.mpathUtil[mpath], num) 485 ncache.mpathUtil[mpath] = u 486 ncache.mpathUtilRO.Set(mpath, u) 487 maxUtil = max(maxUtil, u) 488 } 489 return 490 } 491 492 func (disks FsDisks) _str() string { 493 s := fmt.Sprintf("%v", disks) // with sector sizes 494 return strings.TrimPrefix(s, "map") 495 }