github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure.go

     1  // Copyright (c) 2015-2021 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package cmd
    19  
    20  import (
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"math/rand"
    25  	"os"
    26  	"runtime"
    27  	"sort"
    28  	"sync"
    29  	"time"
    30  
    31  	"github.com/minio/madmin-go/v3"
    32  	"github.com/minio/minio/internal/dsync"
    33  	xioutil "github.com/minio/minio/internal/ioutil"
    34  	"github.com/minio/minio/internal/logger"
    35  	"github.com/minio/pkg/v2/sync/errgroup"
    36  )
    37  
    38  // list all errors that can be ignored in a bucket operation.
    39  var bucketOpIgnoredErrs = append(baseIgnoredErrs, errDiskAccessDenied, errUnformattedDisk)
    40  
    41  // list all errors that can be ignored in a bucket metadata operation.
    42  var bucketMetadataOpIgnoredErrs = append(bucketOpIgnoredErrs, errVolumeNotFound)
    43  
    44  // OfflineDisk represents an unavailable disk.
    45  var OfflineDisk StorageAPI // zero value is nil
    46  
    47  // erasureObjects - Implements the erasure-coded object layer.
    48  type erasureObjects struct {
    49  	setDriveCount      int
    50  	defaultParityCount int
    51  
    52  	setIndex  int
    53  	poolIndex int
    54  
    55  	// getDisks returns list of storageAPIs.
    56  	getDisks func() []StorageAPI
    57  
    58  	// getLockers returns list of remote and local lockers.
    59  	getLockers func() ([]dsync.NetLocker, string)
    60  
    61  	// getEndpoints returns the list of endpoints belonging to this set.
    62  	// Some may be local and some remote.
    63  	getEndpoints func() []Endpoint
    64  
    65  	// getEndpointStrings returns the list of endpoint strings belonging to this set.
    66  	// Some may be local and some remote.
    67  	getEndpointStrings func() []string
    68  
    69  	// Locker mutex map.
    70  	nsMutex *nsLockMap
    71  }
    72  
    73  // NewNSLock - initialize a new namespace RWLocker instance.
    74  func (er erasureObjects) NewNSLock(bucket string, objects ...string) RWLocker {
    75  	return er.nsMutex.NewNSLock(er.getLockers, bucket, objects...)
    76  }
    77  
    78  // Shutdown function for object storage interface.
    79  func (er erasureObjects) Shutdown(ctx context.Context) error {
    80  	// Add any object layer shutdown activities here.
    81  	closeStorageDisks(er.getDisks()...)
    82  	return nil
    83  }
    84  
    85  // defaultWQuorum returns the write quorum based on setDriveCount and defaultParityCount.
    86  func (er erasureObjects) defaultWQuorum() int {
    87  	dataCount := er.setDriveCount - er.defaultParityCount
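        	// When data and parity counts are equal (N/2 each), a quorum of just
        	// dataCount could be satisfied by two disjoint halves of the set, so
        	// writes need one extra drive to guarantee a strict majority.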
    88  	if dataCount == er.defaultParityCount {
    89  		return dataCount + 1
    90  	}
    91  	return dataCount
    92  }
    93  
    94  func diskErrToDriveState(err error) (state string) {
    95  	switch {
    96  	case errors.Is(err, errDiskNotFound) || errors.Is(err, context.DeadlineExceeded):
    97  		state = madmin.DriveStateOffline
    98  	case errors.Is(err, errCorruptedFormat) || errors.Is(err, errCorruptedBackend):
    99  		state = madmin.DriveStateCorrupt
   100  	case errors.Is(err, errUnformattedDisk):
   101  		state = madmin.DriveStateUnformatted
   102  	case errors.Is(err, errDiskAccessDenied):
   103  		state = madmin.DriveStatePermission
   104  	case errors.Is(err, errFaultyDisk):
   105  		state = madmin.DriveStateFaulty
   106  	case err == nil:
   107  		state = madmin.DriveStateOk
   108  	default:
   109  		state = fmt.Sprintf("%s (cause: %s)", madmin.DriveStateUnknown, err)
   110  	}
   111  
   112  	return
   113  }
   114  
   115  func getOnlineOfflineDisksStats(disksInfo []madmin.Disk) (onlineDisks, offlineDisks madmin.BackendDisks) {
   116  	onlineDisks = make(madmin.BackendDisks)
   117  	offlineDisks = make(madmin.BackendDisks)
   118  
   119  	for _, disk := range disksInfo {
   120  		ep := disk.Endpoint
   121  		if _, ok := offlineDisks[ep]; !ok {
   122  			offlineDisks[ep] = 0
   123  		}
   124  		if _, ok := onlineDisks[ep]; !ok {
   125  			onlineDisks[ep] = 0
   126  		}
   127  	}
   128  
   129  	// Tally each endpoint's drives as online or offline based on drive state.
   130  	for _, disk := range disksInfo {
   131  		ep := disk.Endpoint
   132  		state := disk.State
   133  		if state != madmin.DriveStateOk && state != madmin.DriveStateUnformatted {
   134  			offlineDisks[ep]++
   135  			continue
   136  		}
   137  		onlineDisks[ep]++
   138  	}
   139  
   140  	rootDiskCount := 0
   141  	for _, di := range disksInfo {
   142  		if di.RootDisk {
   143  			rootDiskCount++
   144  		}
   145  	}
   146  
   147  	// Count offline disks as well so that offline drives are reported
   148  	// consistently on local setups where all drives are root disks.
   149  	if len(disksInfo) == (rootDiskCount + offlineDisks.Sum()) {
   150  		// Success.
   151  		return onlineDisks, offlineDisks
   152  	}
   153  
   154  	// Otherwise, count root disks as offline: in a mixed setup a root disk
        	// typically means the intended drive mount is missing.
   155  	for i := range disksInfo {
   156  		ep := disksInfo[i].Endpoint
   157  		if disksInfo[i].RootDisk {
   158  			offlineDisks[ep]++
   159  			onlineDisks[ep]--
   160  		}
   161  	}
   162  
   163  	return onlineDisks, offlineDisks
   164  }
   165  
   166  // getDisksInfo - fetch disk info across all storage APIs.
   167  func getDisksInfo(disks []StorageAPI, endpoints []Endpoint, metrics bool) (disksInfo []madmin.Disk) {
   168  	disksInfo = make([]madmin.Disk, len(disks))
   169  
   170  	g := errgroup.WithNErrs(len(disks))
   171  	for index := range disks {
   172  		index := index
   173  		g.Go(func() error {
   174  			di := madmin.Disk{
   175  				Endpoint:  endpoints[index].String(),
   176  				PoolIndex: endpoints[index].PoolIdx,
   177  				SetIndex:  endpoints[index].SetIdx,
   178  				DiskIndex: endpoints[index].DiskIdx,
   179  				Local:     endpoints[index].IsLocal,
   180  			}
   181  			if disks[index] == OfflineDisk {
   182  				di.State = diskErrToDriveState(errDiskNotFound)
   183  				disksInfo[index] = di
   184  				return nil
   185  			}
   186  			info, err := disks[index].DiskInfo(context.TODO(), DiskInfoOptions{Metrics: metrics})
   187  			di.DrivePath = info.MountPath
   188  			di.TotalSpace = info.Total
   189  			di.UsedSpace = info.Used
   190  			di.AvailableSpace = info.Free
   191  			di.UUID = info.ID
   192  			di.Major = info.Major
   193  			di.Minor = info.Minor
   194  			di.RootDisk = info.RootDisk
   195  			di.Healing = info.Healing
   196  			di.Scanning = info.Scanning
   197  			di.State = diskErrToDriveState(err)
   198  			di.FreeInodes = info.FreeInodes
   199  			di.UsedInodes = info.UsedInodes
   200  			if info.Healing {
   201  				if hi := disks[index].Healing(); hi != nil {
   202  					hd := hi.toHealingDisk()
   203  					di.HealInfo = &hd
   204  				}
   205  			}
   206  			di.Metrics = &madmin.DiskMetrics{
   207  				LastMinute:              make(map[string]madmin.TimedAction, len(info.Metrics.LastMinute)),
   208  				APICalls:                make(map[string]uint64, len(info.Metrics.APICalls)),
   209  				TotalErrorsAvailability: info.Metrics.TotalErrorsAvailability,
   210  				TotalErrorsTimeout:      info.Metrics.TotalErrorsTimeout,
   211  				TotalWaiting:            info.Metrics.TotalWaiting,
   212  			}
   213  			for k, v := range info.Metrics.LastMinute {
   214  				if v.N > 0 {
   215  					di.Metrics.LastMinute[k] = v.asTimedAction()
   216  				}
   217  			}
   218  			for k, v := range info.Metrics.APICalls {
   219  				di.Metrics.APICalls[k] = v
   220  			}
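        			// Convert to float64 before dividing; integer division here would
        			// truncate utilization to zero for any partially used drive.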
   221  			if info.Total > 0 {
   222  				di.Utilization = float64(info.Used) / float64(info.Total) * 100
   223  			}
   224  			disksInfo[index] = di
   225  			return nil
   226  		}, index)
   227  	}
   228  
   229  	g.Wait()
   230  	return disksInfo
   231  }
   232  
   233  // Get aggregated storage info across all disks.
   234  func getStorageInfo(disks []StorageAPI, endpoints []Endpoint, metrics bool) StorageInfo {
   235  	disksInfo := getDisksInfo(disks, endpoints, metrics)
   236  
   237  	// Sort so that the first element is the smallest.
   238  	sort.Slice(disksInfo, func(i, j int) bool {
   239  		return disksInfo[i].TotalSpace < disksInfo[j].TotalSpace
   240  	})
   241  
   242  	storageInfo := StorageInfo{
   243  		Disks: disksInfo,
   244  	}
   245  
   246  	storageInfo.Backend.Type = madmin.Erasure
   247  	return storageInfo
   248  }
   249  
   250  // StorageInfo - returns underlying storage statistics.
   251  func (er erasureObjects) StorageInfo(ctx context.Context) StorageInfo {
   252  	disks := er.getDisks()
   253  	endpoints := er.getEndpoints()
   254  	return getStorageInfo(disks, endpoints, true)
   255  }
   256  
   257  // LocalStorageInfo - returns underlying local storage statistics.
   258  func (er erasureObjects) LocalStorageInfo(ctx context.Context, metrics bool) StorageInfo {
   259  	disks := er.getDisks()
   260  	endpoints := er.getEndpoints()
   261  
   262  	var localDisks []StorageAPI
   263  	var localEndpoints []Endpoint
   264  
   265  	for i, endpoint := range endpoints {
   266  		if endpoint.IsLocal {
   267  			localDisks = append(localDisks, disks[i])
   268  			localEndpoints = append(localEndpoints, endpoint)
   269  		}
   270  	}
   271  
   272  	return getStorageInfo(localDisks, localEndpoints, metrics)
   273  }
   274  
   275  // getOnlineDisksWithHealingAndInfo - returns online disks and overall healing status.
   276  // Disks are randomly ordered within each group, returned in this order:
   277  // - Disks that are neither scanning nor healing
   278  // - Scanning (but not healing) disks
   279  // - Healing disks (only if inclHealing is true)
   280  func (er erasureObjects) getOnlineDisksWithHealingAndInfo(inclHealing bool) (newDisks []StorageAPI, newInfos []DiskInfo, healing bool) {
   281  	var wg sync.WaitGroup
   282  	disks := er.getDisks()
   283  	infos := make([]DiskInfo, len(disks))
   284  	r := rand.New(rand.NewSource(time.Now().UnixNano()))
   285  	for _, i := range r.Perm(len(disks)) {
   286  		i := i
   287  		wg.Add(1)
   288  		go func() {
   289  			defer wg.Done()
   290  
   291  			disk := disks[i]
   292  			if disk == nil {
   293  				infos[i].Error = errDiskNotFound.Error()
   294  				return
   295  			}
   296  
   297  			di, err := disk.DiskInfo(context.Background(), DiskInfoOptions{})
   298  			infos[i] = di
   299  			if err != nil {
   300  			// Do not consume disks which are not reachable,
   301  			// unformatted, or simply not accessible for some reason.
   302  				infos[i].Error = err.Error()
   303  			}
   304  		}()
   305  	}
   306  	wg.Wait()
   307  
   308  	var scanningDisks, healingDisks []StorageAPI
   309  	var scanningInfos, healingInfos []DiskInfo
   310  
   311  	for i, info := range infos {
   312  		// Check if one of the drives in the set is being healed.
   313  		// This information is used by the scanner to skip healing
   314  		// this erasure set while it calculates usage.
   315  		if info.Error != "" || disks[i] == nil {
   316  			continue
   317  		}
   318  		if info.Healing {
   319  			healing = true
   320  			if inclHealing {
   321  				healingDisks = append(healingDisks, disks[i])
   322  				healingInfos = append(healingInfos, infos[i])
   323  			}
   324  			continue
   325  		}
   326  
   327  		if !info.Scanning {
   328  			newDisks = append(newDisks, disks[i])
   329  			newInfos = append(newInfos, infos[i])
   330  		} else {
   331  			scanningDisks = append(scanningDisks, disks[i])
   332  			scanningInfos = append(scanningInfos, infos[i])
   333  		}
   334  	}
   335  
   336  	// Prefer non-scanning disks over disks which are currently being scanned.
   337  	newDisks = append(newDisks, scanningDisks...)
   338  	newInfos = append(newInfos, scanningInfos...)
   339  
   340  	// Then add healing disks.
   341  	newDisks = append(newDisks, healingDisks...)
   342  	newInfos = append(newInfos, healingInfos...)
   343  
   344  	return newDisks, newInfos, healing
   345  }
   346  
   347  func (er erasureObjects) getOnlineDisksWithHealing(inclHealing bool) (newDisks []StorageAPI, healing bool) {
   348  	newDisks, _, healing = er.getOnlineDisksWithHealingAndInfo(inclHealing)
   349  	return
   350  }
   351  
   352  // Clean up previously deleted objects from .minio.sys/tmp/.trash/.
   353  func (er erasureObjects) cleanupDeletedObjects(ctx context.Context) {
   354  	// Run multiple cleanups local to this server.
   355  	var wg sync.WaitGroup
   356  	for _, disk := range er.getLocalDisks() {
   357  		if disk != nil {
   358  			wg.Add(1)
   359  			go func(disk StorageAPI) {
   360  				defer wg.Done()
   361  				diskPath := disk.Endpoint().Path
   362  				readDirFn(pathJoin(diskPath, minioMetaTmpDeletedBucket), func(ddir string, typ os.FileMode) error {
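        					// Bound each removal by the configured drive timeout and pace
        					// deletions with the cleanup sleeper to limit I/O impact.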
   363  					w := xioutil.NewDeadlineWorker(globalDriveConfig.GetMaxTimeout())
   364  					return w.Run(func() error {
   365  						wait := deletedCleanupSleeper.Timer(ctx)
   366  						removeAll(pathJoin(diskPath, minioMetaTmpDeletedBucket, ddir))
   367  						wait()
   368  						return nil
   369  					})
   370  				})
   371  			}(disk)
   372  		}
   373  	}
   374  	wg.Wait()
   375  }
   376  
   377  // nsScanner will start scanning buckets and send updated totals as they are traversed.
   378  // Updates are sent on a regular basis and the caller *must* consume them.
   379  func (er erasureObjects) nsScanner(ctx context.Context, buckets []BucketInfo, wantCycle uint32, updates chan<- dataUsageCache, healScanMode madmin.HealScanMode) error {
   380  	if len(buckets) == 0 {
   381  		return nil
   382  	}
   383  
   384  	// Collect disks we can use.
   385  	disks, healing := er.getOnlineDisksWithHealing(false)
   386  	if len(disks) == 0 {
   387  		logger.LogIf(ctx, errors.New("data-scanner: all drives are offline or being healed, skipping scanner cycle"))
   388  		return nil
   389  	}
   390  
   391  	// Load bucket totals
   392  	oldCache := dataUsageCache{}
   393  	if err := oldCache.load(ctx, er, dataUsageCacheName); err != nil {
   394  		return err
   395  	}
   396  
   397  	// New cache.
   398  	cache := dataUsageCache{
   399  		Info: dataUsageCacheInfo{
   400  			Name:      dataUsageRoot,
   401  			NextCycle: oldCache.Info.NextCycle,
   402  		},
   403  		Cache: make(map[string]dataUsageEntry, len(oldCache.Cache)),
   404  	}
   405  
   406  	// Put all buckets into channel.
   407  	bucketCh := make(chan BucketInfo, len(buckets))
   408  
   409  	// Shuffle the buckets so each erasure set scans them in a random order.
   410  	// Otherwise the same set of buckets would always be scanned across
   411  	// erasure sets at any given point in time. Scanning buckets in a
   412  	// different order per erasure set gives a wider spread, which is needed
   413  	// when there are many buckets with differing object layouts.
   414  	r := rand.New(rand.NewSource(time.Now().UnixNano()))
   415  	permutes := r.Perm(len(buckets))
   416  	// Add new buckets first
   417  	for _, idx := range permutes {
   418  		b := buckets[idx]
   419  		if e := oldCache.find(b.Name); e == nil {
   420  			bucketCh <- b
   421  		}
   422  	}
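        	// Then queue buckets already present in the old cache, carrying their
        	// previous totals forward into the new cache.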
   423  	for _, idx := range permutes {
   424  		b := buckets[idx]
   425  		if e := oldCache.find(b.Name); e != nil {
   426  			cache.replace(b.Name, dataUsageRoot, *e)
   427  			bucketCh <- b
   428  		}
   429  	}
   430  	xioutil.SafeClose(bucketCh)
   431  
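        	// Buffered to len(disks) so each per-disk scanner can hand off a result
        	// without blocking on the collector goroutine.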
   432  	bucketResults := make(chan dataUsageEntryInfo, len(disks))
   433  
   434  	// Start async collector/saver.
   435  	// This goroutine owns the cache.
   436  	var saverWg sync.WaitGroup
   437  	saverWg.Add(1)
   438  	go func() {
   439  		// Add jitter to the update time so multiple sets don't sync up.
   440  		updateTime := 30*time.Second + time.Duration(float64(10*time.Second)*rand.Float64())
   441  		t := time.NewTicker(updateTime)
   442  		defer t.Stop()
   443  		defer saverWg.Done()
   444  		var lastSave time.Time
   445  
   446  		for {
   447  			select {
   448  			case <-t.C:
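        				// Skip this periodic save if nothing has changed since the last one.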
   449  				if cache.Info.LastUpdate.Equal(lastSave) {
   450  					continue
   451  				}
   452  				logger.LogOnceIf(ctx, cache.save(ctx, er, dataUsageCacheName), "nsscanner-cache-update")
   453  				updates <- cache.clone()
   454  
   455  				lastSave = cache.Info.LastUpdate
   456  			case v, ok := <-bucketResults:
   457  				if !ok {
   458  					// Save final state...
   459  					cache.Info.NextCycle = wantCycle
   460  					cache.Info.LastUpdate = time.Now()
   461  					logger.LogOnceIf(ctx, cache.save(ctx, er, dataUsageCacheName), "nsscanner-channel-closed")
   462  					updates <- cache.clone()
   463  					return
   464  				}
   465  				cache.replace(v.Name, v.Parent, v.Entry)
   466  				cache.Info.LastUpdate = time.Now()
   467  			}
   468  		}
   469  	}()
   470  
   471  	// Restrict parallelism for the disk usage scanner
   472  	// to GOMAXPROCS if GOMAXPROCS is less than len(disks).
   473  	maxProcs := runtime.GOMAXPROCS(0)
   474  	if maxProcs < len(disks) {
   475  		disks = disks[:maxProcs]
   476  	}
   477  
   478  	// Start one scanner per disk
   479  	var wg sync.WaitGroup
   480  	wg.Add(len(disks))
   481  
   482  	for i := range disks {
   483  		go func(i int) {
   484  			defer wg.Done()
   485  			disk := disks[i]
   486  
   487  			for bucket := range bucketCh {
   488  				select {
   489  				case <-ctx.Done():
   490  					return
   491  				default:
   492  				}
   493  
   494  				// Load cache for bucket
   495  				cacheName := pathJoin(bucket.Name, dataUsageCacheName)
   496  				cache := dataUsageCache{}
   497  				logger.LogIf(ctx, cache.load(ctx, er, cacheName))
   498  				if cache.Info.Name == "" {
   499  					cache.Info.Name = bucket.Name
   500  				}
   501  				cache.Info.SkipHealing = healing
   502  				cache.Info.NextCycle = wantCycle
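        				// If the loaded cache does not belong to this bucket, discard it
        				// and start from a fresh cache.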
   503  				if cache.Info.Name != bucket.Name {
   504  					cache.Info = dataUsageCacheInfo{
   505  						Name:       bucket.Name,
   506  						LastUpdate: time.Time{},
   507  						NextCycle:  wantCycle,
   508  					}
   509  				}
   510  				// Collect updates.
   511  				updates := make(chan dataUsageEntry, 1)
   512  				var wg sync.WaitGroup
   513  				wg.Add(1)
   514  				go func(name string) {
   515  					defer wg.Done()
   516  					for update := range updates {
   517  						select {
   518  						case <-ctx.Done():
   519  						case bucketResults <- dataUsageEntryInfo{
   520  							Name:   name,
   521  							Parent: dataUsageRoot,
   522  							Entry:  update,
   523  						}:
   524  						}
   525  					}
   526  				}(cache.Info.Name)
   527  				// Calc usage
   528  				before := cache.Info.LastUpdate
   529  				var err error
   530  				cache, err = disk.NSScanner(ctx, cache, updates, healScanMode, nil)
   531  				if err != nil {
   532  					if !cache.Info.LastUpdate.IsZero() && cache.Info.LastUpdate.After(before) {
   533  						logger.LogIf(ctx, cache.save(ctx, er, cacheName))
   534  					} else {
   535  						logger.LogIf(ctx, err)
   536  					}
   537  					// This ensures that we don't close the
   538  					// bucketResults channel while the
   539  					// updates-collector goroutine still
   540  					// holds a reference to it.
   541  					wg.Wait()
   542  					continue
   543  				}
   544  
   545  				wg.Wait()
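        				// Aggregate the scanned cache tree into a single root entry
        				// representing this bucket's totals.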
   546  				var root dataUsageEntry
   547  				if r := cache.root(); r != nil {
   548  					root = cache.flatten(*r)
   549  				}
   550  				select {
   551  				case <-ctx.Done():
   552  					return
   553  				case bucketResults <- dataUsageEntryInfo{
   554  					Name:   cache.Info.Name,
   555  					Parent: dataUsageRoot,
   556  					Entry:  root,
   557  				}:
   558  				}
   559  
   560  				// Save cache
   561  				logger.LogIf(ctx, cache.save(ctx, er, cacheName))
   562  			}
   563  		}(i)
   564  	}
   565  	wg.Wait()
   566  	xioutil.SafeClose(bucketResults)
   567  	saverWg.Wait()
   568  
   569  	return nil
   570  }