// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"os"
	"runtime"
	"sort"
	"sync"
	"time"

	"github.com/minio/madmin-go/v3"
	"github.com/minio/minio/internal/dsync"
	xioutil "github.com/minio/minio/internal/ioutil"
	"github.com/minio/minio/internal/logger"
	"github.com/minio/pkg/v2/sync/errgroup"
)

// list all errors that can be ignored in a bucket operation.
var bucketOpIgnoredErrs = append(baseIgnoredErrs, errDiskAccessDenied, errUnformattedDisk)

// list all errors that can be ignored in a bucket metadata operation.
var bucketMetadataOpIgnoredErrs = append(bucketOpIgnoredErrs, errVolumeNotFound)

// OfflineDisk represents an unavailable disk.
var OfflineDisk StorageAPI // zero value is nil

// erasureObjects - Implements ER object layer.
type erasureObjects struct {
	setDriveCount      int
	defaultParityCount int

	setIndex  int
	poolIndex int

	// getDisks returns list of storageAPIs.
	getDisks func() []StorageAPI

	// getLockers returns list of remote and local lockers.
	getLockers func() ([]dsync.NetLocker, string)

	// getEndpoints returns list of endpoint belonging this set.
	// some may be local and some remote.
	getEndpoints func() []Endpoint

	// getEndpointStrings returns list of endpoint strings belonging this set.
	// some may be local and some remote.
	getEndpointStrings func() []string

	// Locker mutex map.
	nsMutex *nsLockMap
}

// NewNSLock - initialize a new namespace RWLocker instance.
func (er erasureObjects) NewNSLock(bucket string, objects ...string) RWLocker {
	return er.nsMutex.NewNSLock(er.getLockers, bucket, objects...)
}

// Shutdown function for object storage interface.
func (er erasureObjects) Shutdown(ctx context.Context) error {
	// Add any object layer shutdown activities here.
	closeStorageDisks(er.getDisks()...)
	return nil
}

// defaultWQuorum write quorum based on setDriveCount and defaultParityCount
func (er erasureObjects) defaultWQuorum() int {
	dataCount := er.setDriveCount - er.defaultParityCount
	if dataCount == er.defaultParityCount {
		// Equal data and parity halves: require one extra write so a
		// bare majority is needed to commit.
		return dataCount + 1
	}
	return dataCount
}

// diskErrToDriveState maps a storage-layer error to the corresponding
// madmin drive-state string used in disk info reporting.
func diskErrToDriveState(err error) (state string) {
	switch {
	case errors.Is(err, errDiskNotFound) || errors.Is(err, context.DeadlineExceeded):
		state = madmin.DriveStateOffline
	case errors.Is(err, errCorruptedFormat) || errors.Is(err, errCorruptedBackend):
		state = madmin.DriveStateCorrupt
	case errors.Is(err, errUnformattedDisk):
		state = madmin.DriveStateUnformatted
	case errors.Is(err, errDiskAccessDenied):
		state = madmin.DriveStatePermission
	case errors.Is(err, errFaultyDisk):
		state = madmin.DriveStateFaulty
	case err == nil:
		state = madmin.DriveStateOk
	default:
		// Unrecognized error: surface it verbatim next to the unknown state.
		state = fmt.Sprintf("%s (cause: %s)", madmin.DriveStateUnknown, err)
	}

	return
}

// getOnlineOfflineDisksStats aggregates per-endpoint online and offline
// drive counts from the supplied per-disk info.
func getOnlineOfflineDisksStats(disksInfo []madmin.Disk) (onlineDisks, offlineDisks madmin.BackendDisks) {
	onlineDisks = make(madmin.BackendDisks)
	offlineDisks = make(madmin.BackendDisks)

	// Seed an entry for every endpoint so zero counts are still reported.
	for _, disk := range disksInfo {
		ep := disk.Endpoint
		if _, ok := offlineDisks[ep]; !ok {
			offlineDisks[ep] = 0
		}
		if _, ok := onlineDisks[ep]; !ok {
			onlineDisks[ep] = 0
		}
	}

	// Tally each drive; unformatted drives count as online here.
	for _, disk := range disksInfo {
		ep := disk.Endpoint
		state := disk.State
		if state != madmin.DriveStateOk && state != madmin.DriveStateUnformatted {
			offlineDisks[ep]++
			continue
		}
		onlineDisks[ep]++
	}

	rootDiskCount := 0
	for _, di := range disksInfo {
		if di.RootDisk {
			rootDiskCount++
		}
	}

	// Count offline disks as well to ensure consistent
	// reportability of offline drives on local setups.
	if len(disksInfo) == (rootDiskCount + offlineDisks.Sum()) {
		// Success.
		return onlineDisks, offlineDisks
	}

	// Root disk should be considered offline
	// NOTE(review): this decrements onlineDisks even for a root disk that
	// was already tallied offline above, which could leave a negative
	// online count for that endpoint — confirm callers tolerate this.
	for i := range disksInfo {
		ep := disksInfo[i].Endpoint
		if disksInfo[i].RootDisk {
			offlineDisks[ep]++
			onlineDisks[ep]--
		}
	}

	return onlineDisks, offlineDisks
}

// getDisksInfo - fetch disks info across all other storage API.
167 func getDisksInfo(disks []StorageAPI, endpoints []Endpoint, metrics bool) (disksInfo []madmin.Disk) { 168 disksInfo = make([]madmin.Disk, len(disks)) 169 170 g := errgroup.WithNErrs(len(disks)) 171 for index := range disks { 172 index := index 173 g.Go(func() error { 174 di := madmin.Disk{ 175 Endpoint: endpoints[index].String(), 176 PoolIndex: endpoints[index].PoolIdx, 177 SetIndex: endpoints[index].SetIdx, 178 DiskIndex: endpoints[index].DiskIdx, 179 Local: endpoints[index].IsLocal, 180 } 181 if disks[index] == OfflineDisk { 182 di.State = diskErrToDriveState(errDiskNotFound) 183 disksInfo[index] = di 184 return nil 185 } 186 info, err := disks[index].DiskInfo(context.TODO(), DiskInfoOptions{Metrics: metrics}) 187 di.DrivePath = info.MountPath 188 di.TotalSpace = info.Total 189 di.UsedSpace = info.Used 190 di.AvailableSpace = info.Free 191 di.UUID = info.ID 192 di.Major = info.Major 193 di.Minor = info.Minor 194 di.RootDisk = info.RootDisk 195 di.Healing = info.Healing 196 di.Scanning = info.Scanning 197 di.State = diskErrToDriveState(err) 198 di.FreeInodes = info.FreeInodes 199 di.UsedInodes = info.UsedInodes 200 if info.Healing { 201 if hi := disks[index].Healing(); hi != nil { 202 hd := hi.toHealingDisk() 203 di.HealInfo = &hd 204 } 205 } 206 di.Metrics = &madmin.DiskMetrics{ 207 LastMinute: make(map[string]madmin.TimedAction, len(info.Metrics.LastMinute)), 208 APICalls: make(map[string]uint64, len(info.Metrics.APICalls)), 209 TotalErrorsAvailability: info.Metrics.TotalErrorsAvailability, 210 TotalErrorsTimeout: info.Metrics.TotalErrorsTimeout, 211 TotalWaiting: info.Metrics.TotalWaiting, 212 } 213 for k, v := range info.Metrics.LastMinute { 214 if v.N > 0 { 215 di.Metrics.LastMinute[k] = v.asTimedAction() 216 } 217 } 218 for k, v := range info.Metrics.APICalls { 219 di.Metrics.APICalls[k] = v 220 } 221 if info.Total > 0 { 222 di.Utilization = float64(info.Used / info.Total * 100) 223 } 224 disksInfo[index] = di 225 return nil 226 }, index) 227 } 228 229 
g.Wait() 230 return disksInfo 231 } 232 233 // Get an aggregated storage info across all disks. 234 func getStorageInfo(disks []StorageAPI, endpoints []Endpoint, metrics bool) StorageInfo { 235 disksInfo := getDisksInfo(disks, endpoints, metrics) 236 237 // Sort so that the first element is the smallest. 238 sort.Slice(disksInfo, func(i, j int) bool { 239 return disksInfo[i].TotalSpace < disksInfo[j].TotalSpace 240 }) 241 242 storageInfo := StorageInfo{ 243 Disks: disksInfo, 244 } 245 246 storageInfo.Backend.Type = madmin.Erasure 247 return storageInfo 248 } 249 250 // StorageInfo - returns underlying storage statistics. 251 func (er erasureObjects) StorageInfo(ctx context.Context) StorageInfo { 252 disks := er.getDisks() 253 endpoints := er.getEndpoints() 254 return getStorageInfo(disks, endpoints, true) 255 } 256 257 // LocalStorageInfo - returns underlying local storage statistics. 258 func (er erasureObjects) LocalStorageInfo(ctx context.Context, metrics bool) StorageInfo { 259 disks := er.getDisks() 260 endpoints := er.getEndpoints() 261 262 var localDisks []StorageAPI 263 var localEndpoints []Endpoint 264 265 for i, endpoint := range endpoints { 266 if endpoint.IsLocal { 267 localDisks = append(localDisks, disks[i]) 268 localEndpoints = append(localEndpoints, endpoint) 269 } 270 } 271 272 return getStorageInfo(localDisks, localEndpoints, metrics) 273 } 274 275 // getOnlineDisksWithHealingAndInfo - returns online disks and overall healing status. 
// Disks are randomly ordered, but in the following groups:
// - Non-scanning disks
// - Non-healing disks
// - Healing disks (if inclHealing is true)
func (er erasureObjects) getOnlineDisksWithHealingAndInfo(inclHealing bool) (newDisks []StorageAPI, newInfos []DiskInfo, healing bool) {
	var wg sync.WaitGroup
	disks := er.getDisks()
	infos := make([]DiskInfo, len(disks))
	// Query all disks in parallel, in random order; each goroutine writes
	// only its own index of infos.
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	for _, i := range r.Perm(len(disks)) {
		i := i
		wg.Add(1)
		go func() {
			defer wg.Done()

			disk := disks[i]
			if disk == nil {
				infos[i].Error = errDiskNotFound.Error()
				return
			}

			di, err := disk.DiskInfo(context.Background(), DiskInfoOptions{})
			infos[i] = di
			if err != nil {
				// - Do not consume disks which are not reachable
				//   unformatted or simply not accessible for some reason.
				infos[i].Error = err.Error()
			}
		}()
	}
	wg.Wait()

	var scanningDisks, healingDisks []StorageAPI
	var scanningInfos, healingInfos []DiskInfo

	for i, info := range infos {
		// Check if one of the drives in the set is being healed.
		// this information is used by scanner to skip healing
		// this erasure set while it calculates the usage.
		if info.Error != "" || disks[i] == nil {
			continue
		}
		if info.Healing {
			healing = true
			if inclHealing {
				healingDisks = append(healingDisks, disks[i])
				healingInfos = append(healingInfos, infos[i])
			}
			continue
		}

		if !info.Scanning {
			newDisks = append(newDisks, disks[i])
			newInfos = append(newInfos, infos[i])
		} else {
			scanningDisks = append(scanningDisks, disks[i])
			scanningInfos = append(scanningInfos, infos[i])
		}
	}

	// Prefer non-scanning disks over disks which are currently being scanned.
	newDisks = append(newDisks, scanningDisks...)
	newInfos = append(newInfos, scanningInfos...)

	// Then add healing disks.
	newDisks = append(newDisks, healingDisks...)
	newInfos = append(newInfos, healingInfos...)

	return newDisks, newInfos, healing
}

// getOnlineDisksWithHealing is a convenience wrapper around
// getOnlineDisksWithHealingAndInfo that drops the per-disk info.
func (er erasureObjects) getOnlineDisksWithHealing(inclHealing bool) (newDisks []StorageAPI, healing bool) {
	newDisks, _, healing = er.getOnlineDisksWithHealingAndInfo(inclHealing)
	return
}

// Clean-up previously deleted objects. from .minio.sys/tmp/.trash/
func (er erasureObjects) cleanupDeletedObjects(ctx context.Context) {
	// run multiple cleanup's local to this server.
	var wg sync.WaitGroup
	for _, disk := range er.getLocalDisks() {
		if disk != nil {
			wg.Add(1)
			go func(disk StorageAPI) {
				defer wg.Done()
				diskPath := disk.Endpoint().Path
				readDirFn(pathJoin(diskPath, minioMetaTmpDeletedBucket), func(ddir string, typ os.FileMode) error {
					// Bound each removal by the configured drive timeout.
					w := xioutil.NewDeadlineWorker(globalDriveConfig.GetMaxTimeout())
					return w.Run(func() error {
						// Pace deletions using the shared cleanup sleeper.
						wait := deletedCleanupSleeper.Timer(ctx)
						removeAll(pathJoin(diskPath, minioMetaTmpDeletedBucket, ddir))
						wait()
						return nil
					})
				})
			}(disk)
		}
	}
	wg.Wait()
}

// nsScanner will start scanning buckets and send updated totals as they are traversed.
// Updates are sent on a regular basis and the caller *must* consume them.
func (er erasureObjects) nsScanner(ctx context.Context, buckets []BucketInfo, wantCycle uint32, updates chan<- dataUsageCache, healScanMode madmin.HealScanMode) error {
	if len(buckets) == 0 {
		return nil
	}

	// Collect disks we can use.
	disks, healing := er.getOnlineDisksWithHealing(false)
	if len(disks) == 0 {
		logger.LogIf(ctx, errors.New("data-scanner: all drives are offline or being healed, skipping scanner cycle"))
		return nil
	}

	// Load bucket totals
	oldCache := dataUsageCache{}
	if err := oldCache.load(ctx, er, dataUsageCacheName); err != nil {
		return err
	}

	// New cache..
	cache := dataUsageCache{
		Info: dataUsageCacheInfo{
			Name:      dataUsageRoot,
			NextCycle: oldCache.Info.NextCycle,
		},
		Cache: make(map[string]dataUsageEntry, len(oldCache.Cache)),
	}

	// Put all buckets into channel.
	bucketCh := make(chan BucketInfo, len(buckets))

	// Shuffle buckets to ensure total randomness of buckets, being scanned.
	// Otherwise same set of buckets get scanned across erasure sets always.
	// at any given point in time. This allows different buckets to be scanned
	// in different order per erasure set, this wider spread is needed when
	// there are lots of buckets with different order of objects in them.
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	permutes := r.Perm(len(buckets))
	// Add new buckets first
	for _, idx := range permutes {
		b := buckets[idx]
		if e := oldCache.find(b.Name); e == nil {
			bucketCh <- b
		}
	}
	// Then queue known buckets, carrying their previous totals into the new cache.
	for _, idx := range permutes {
		b := buckets[idx]
		if e := oldCache.find(b.Name); e != nil {
			cache.replace(b.Name, dataUsageRoot, *e)
			bucketCh <- b
		}
	}
	xioutil.SafeClose(bucketCh)

	bucketResults := make(chan dataUsageEntryInfo, len(disks))

	// Start async collector/saver.
	// This goroutine owns the cache.
	var saverWg sync.WaitGroup
	saverWg.Add(1)
	go func() {
		// Add jitter to the update time so multiple sets don't sync up.
		updateTime := 30*time.Second + time.Duration(float64(10*time.Second)*rand.Float64())
		t := time.NewTicker(updateTime)
		defer t.Stop()
		defer saverWg.Done()
		var lastSave time.Time

		for {
			select {
			case <-t.C:
				if cache.Info.LastUpdate.Equal(lastSave) {
					// Nothing changed since the last save; skip this tick.
					continue
				}
				logger.LogOnceIf(ctx, cache.save(ctx, er, dataUsageCacheName), "nsscanner-cache-update")
				updates <- cache.clone()

				lastSave = cache.Info.LastUpdate
			case v, ok := <-bucketResults:
				if !ok {
					// Save final state...
					cache.Info.NextCycle = wantCycle
					cache.Info.LastUpdate = time.Now()
					logger.LogOnceIf(ctx, cache.save(ctx, er, dataUsageCacheName), "nsscanner-channel-closed")
					updates <- cache.clone()
					return
				}
				cache.replace(v.Name, v.Parent, v.Entry)
				cache.Info.LastUpdate = time.Now()
			}
		}
	}()

	// Restrict parallelism for disk usage scanner
	// upto GOMAXPROCS if GOMAXPROCS is < len(disks)
	maxProcs := runtime.GOMAXPROCS(0)
	if maxProcs < len(disks) {
		disks = disks[:maxProcs]
	}

	// Start one scanner per disk
	var wg sync.WaitGroup
	wg.Add(len(disks))

	for i := range disks {
		go func(i int) {
			defer wg.Done()
			disk := disks[i]

			for bucket := range bucketCh {
				select {
				case <-ctx.Done():
					return
				default:
				}

				// Load cache for bucket
				cacheName := pathJoin(bucket.Name, dataUsageCacheName)
				cache := dataUsageCache{}
				logger.LogIf(ctx, cache.load(ctx, er, cacheName))
				if cache.Info.Name == "" {
					cache.Info.Name = bucket.Name
				}
				cache.Info.SkipHealing = healing
				cache.Info.NextCycle = wantCycle
				if cache.Info.Name != bucket.Name {
					// Loaded cache belongs to a different bucket; start fresh.
					cache.Info = dataUsageCacheInfo{
						Name:       bucket.Name,
						LastUpdate: time.Time{},
						NextCycle:  wantCycle,
					}
				}
				// Collect updates.
				updates := make(chan dataUsageEntry, 1)
				var wg sync.WaitGroup
				wg.Add(1)
				go func(name string) {
					defer wg.Done()
					for update := range updates {
						select {
						case <-ctx.Done():
						case bucketResults <- dataUsageEntryInfo{
							Name:   name,
							Parent: dataUsageRoot,
							Entry:  update,
						}:
						}
					}
				}(cache.Info.Name)
				// Calc usage
				before := cache.Info.LastUpdate
				var err error
				cache, err = disk.NSScanner(ctx, cache, updates, healScanMode, nil)
				if err != nil {
					// Persist partial progress if the scanner advanced the
					// cache before failing; otherwise just log the error.
					if !cache.Info.LastUpdate.IsZero() && cache.Info.LastUpdate.After(before) {
						logger.LogIf(ctx, cache.save(ctx, er, cacheName))
					} else {
						logger.LogIf(ctx, err)
					}
					// This ensures that we don't close
					// bucketResults channel while the
					// updates-collector goroutine still
					// holds a reference to this.
					wg.Wait()
					continue
				}

				wg.Wait()
				var root dataUsageEntry
				if r := cache.root(); r != nil {
					root = cache.flatten(*r)
				}
				select {
				case <-ctx.Done():
					return
				case bucketResults <- dataUsageEntryInfo{
					Name:   cache.Info.Name,
					Parent: dataUsageRoot,
					Entry:  root,
				}:
				}

				// Save cache
				logger.LogIf(ctx, cache.save(ctx, er, cacheName))
			}
		}(i)
	}
	wg.Wait()
	xioutil.SafeClose(bucketResults)
	saverWg.Wait()

	return nil
}