storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/cmd/erasure.go

/*
 * MinIO Cloud Storage, (C) 2016-2020 MinIO, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cmd

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"os"
	"sort"
	"sync"
	"time"

	"storj.io/minio/cmd/logger"
	"storj.io/minio/pkg/bpool"
	"storj.io/minio/pkg/color"
	"storj.io/minio/pkg/dsync"
	"storj.io/minio/pkg/madmin"
	"storj.io/minio/pkg/sync/errgroup"
)

// OfflineDisk represents an unavailable disk.
var OfflineDisk StorageAPI // zero value is nil

// partialOperation records a successful upload or delete of an object that
// reached quorum but was not written to all disks in the set.
type partialOperation struct {
	bucket    string
	object    string
	versionID string
	failedSet int
}

// erasureObjects - Implements the erasure-coded object layer.
type erasureObjects struct {
	GatewayUnsupported

	setDriveCount      int
	defaultParityCount int

	setIndex  int
	poolIndex int

	// getDisks returns list of storageAPIs.
	getDisks func() []StorageAPI

	// getLockers returns list of remote and local lockers.
	getLockers func() ([]dsync.NetLocker, string)

	// getEndpoints returns list of endpoint strings belonging to this set.
	// Some may be local and some remote.
	getEndpoints func() []string

	// Locker mutex map.
	nsMutex *nsLockMap

	// Byte pools used for temporary i/o buffers.
	bp *bpool.BytePoolCap

	// mrfOpCh receives operations that reached quorum but missed some disks.
	mrfOpCh chan partialOperation

	// deletedCleanupSleeper throttles cleanup of previously deleted objects.
	deletedCleanupSleeper *dynamicSleeper
}

// NewNSLock - initialize a new namespace RWLocker instance.
func (er erasureObjects) NewNSLock(bucket string, objects ...string) RWLocker {
	return er.nsMutex.NewNSLock(er.getLockers, bucket, objects...)
}

// Shutdown function for object storage interface.
func (er erasureObjects) Shutdown(ctx context.Context) error {
	// Add any object layer shutdown activities here.
	closeStorageDisks(er.getDisks())
	return nil
}

// byDiskTotal is a collection satisfying sort.Interface.
type byDiskTotal []madmin.Disk

func (d byDiskTotal) Len() int      { return len(d) }
func (d byDiskTotal) Swap(i, j int) { d[i], d[j] = d[j], d[i] }
func (d byDiskTotal) Less(i, j int) bool {
	return d[i].TotalSpace < d[j].TotalSpace
}

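// diskErrToDriveState maps a storage layer error to the corresponding
// madmin drive state string reported to admin API callers.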
func diskErrToDriveState(err error) (state string) {
	state = madmin.DriveStateUnknown
	switch {
	case errors.Is(err, errDiskNotFound):
		state = madmin.DriveStateOffline
	case errors.Is(err, errCorruptedFormat):
		state = madmin.DriveStateCorrupt
	case errors.Is(err, errUnformattedDisk):
		state = madmin.DriveStateUnformatted
	case errors.Is(err, errDiskAccessDenied):
		state = madmin.DriveStatePermission
	case errors.Is(err, errFaultyDisk):
		state = madmin.DriveStateFaulty
	case err == nil:
		state = madmin.DriveStateOk
	}
	return
}

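// getOnlineOfflineDisksStats tallies per-endpoint online and offline disk
// counts from the collected disk info. A drive counts as online only when its
// state is Ok or Unformatted. If every drive is accounted for as either a root
// disk or an offline drive, the counts are returned as-is; otherwise root
// disks are reclassified as offline so that only dedicated drives count as
// online.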
func getOnlineOfflineDisksStats(disksInfo []madmin.Disk) (onlineDisks, offlineDisks madmin.BackendDisks) {
	onlineDisks = make(madmin.BackendDisks)
	offlineDisks = make(madmin.BackendDisks)

	for _, disk := range disksInfo {
		ep := disk.Endpoint
		if _, ok := offlineDisks[ep]; !ok {
			offlineDisks[ep] = 0
		}
		if _, ok := onlineDisks[ep]; !ok {
			onlineDisks[ep] = 0
		}
	}

	// Tally online vs. offline disks per endpoint.
	for _, disk := range disksInfo {
		ep := disk.Endpoint
		state := disk.State
		if state != madmin.DriveStateOk && state != madmin.DriveStateUnformatted {
			offlineDisks[ep]++
			continue
		}
		onlineDisks[ep]++
	}

	rootDiskCount := 0
	for _, di := range disksInfo {
		if di.RootDisk {
			rootDiskCount++
		}
	}

	// Count offline disks as well to ensure consistent
	// reportability of offline drives on local setups.
	if len(disksInfo) == (rootDiskCount + offlineDisks.Sum()) {
		// Success.
		return onlineDisks, offlineDisks
	}

	// Root disks should be considered offline.
	for i := range disksInfo {
		ep := disksInfo[i].Endpoint
		if disksInfo[i].RootDisk {
			offlineDisks[ep]++
			onlineDisks[ep]--
		}
	}

	return onlineDisks, offlineDisks
}

// getDisksInfo - fetch disks info across all storage APIs.
func getDisksInfo(disks []StorageAPI, endpoints []string) (disksInfo []madmin.Disk, errs []error) {
	disksInfo = make([]madmin.Disk, len(disks))

	g := errgroup.WithNErrs(len(disks))
	for index := range disks {
		index := index
		g.Go(func() error {
			if disks[index] == OfflineDisk {
				logger.LogIf(GlobalContext, fmt.Errorf("%s: %s", errDiskNotFound, endpoints[index]))
				disksInfo[index] = madmin.Disk{
					State:    diskErrToDriveState(errDiskNotFound),
					Endpoint: endpoints[index],
				}
				// Storage disk is empty, perhaps an ignored disk or simply not available.
				return errDiskNotFound
			}
			info, err := disks[index].DiskInfo(context.TODO())
			di := madmin.Disk{
				Endpoint:       info.Endpoint,
				DrivePath:      info.MountPath,
				TotalSpace:     info.Total,
				UsedSpace:      info.Used,
				AvailableSpace: info.Free,
				UUID:           info.ID,
				RootDisk:       info.RootDisk,
				Healing:        info.Healing,
				State:          diskErrToDriveState(err),
			}
			di.PoolIndex, di.SetIndex, di.DiskIndex = disks[index].GetDiskLoc()
			if info.Healing {
				if hi := disks[index].Healing(); hi != nil {
					hd := hi.toHealingDisk()
					di.HealInfo = &hd
				}
			}
			di.Metrics = &madmin.DiskMetrics{
				APILatencies: make(map[string]string),
				APICalls:     make(map[string]uint64),
			}
			for k, v := range info.Metrics.APILatencies {
				di.Metrics.APILatencies[k] = v
			}
			for k, v := range info.Metrics.APICalls {
				di.Metrics.APICalls[k] = v
			}
			if info.Total > 0 {
				// Convert to float64 before dividing to avoid integer
				// truncation (info.Used/info.Total would round down to 0).
				di.Utilization = float64(info.Used) / float64(info.Total) * 100
			}
			disksInfo[index] = di
			return err
		}, index)
	}

	return disksInfo, g.Wait()
}

// Get an aggregated storage info across all disks.
func getStorageInfo(disks []StorageAPI, endpoints []string) (StorageInfo, []error) {
	disksInfo, errs := getDisksInfo(disks, endpoints)

	// Sort so that the first element is the smallest.
	sort.Sort(byDiskTotal(disksInfo))

	storageInfo := StorageInfo{
		Disks: disksInfo,
	}

	storageInfo.Backend.Type = madmin.Erasure
	return storageInfo, errs
}

// StorageInfo - returns underlying storage statistics.
func (er erasureObjects) StorageInfo(ctx context.Context) (StorageInfo, []error) {
	disks := er.getDisks()
	endpoints := er.getEndpoints()
	return getStorageInfo(disks, endpoints)
}

// LocalStorageInfo - returns underlying local storage statistics.
func (er erasureObjects) LocalStorageInfo(ctx context.Context) (StorageInfo, []error) {
	disks := er.getLocalDisks()
	endpoints := make([]string, len(disks))
	for i, disk := range disks {
		if disk != nil {
			endpoints[i] = disk.String()
		}
	}
	return getStorageInfo(disks, endpoints)
}

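// getOnlineDisksWithHealing probes every disk in the set concurrently, in a
// pseudo-random order, and returns the disks that responded without error,
// along with a flag that is true when any disk in the set is unreachable,
// errored, or currently healing.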
func (er erasureObjects) getOnlineDisksWithHealing() (newDisks []StorageAPI, healing bool) {
	var wg sync.WaitGroup
	disks := er.getDisks()
	infos := make([]DiskInfo, len(disks))
	for _, i := range hashOrder(UTCNow().String(), len(disks)) {
		i := i
		wg.Add(1)
		go func() {
			defer wg.Done()

			disk := disks[i-1]

			if disk == nil {
				infos[i-1].Error = "nil disk"
				return
			}

			di, err := disk.DiskInfo(context.Background())
			if err != nil {
				// - Do not consume disks which are not reachable,
				//   unformatted or simply not accessible for some reason.
				//
				// - Future: skip busy disks.
				infos[i-1].Error = err.Error()
				return
			}

			infos[i-1] = di
		}()
	}
	wg.Wait()

	for i, info := range infos {
		// Check if one of the drives in the set is being healed.
		// This information is used by the scanner to skip healing
		// this erasure set while it calculates the usage.
		if info.Healing || info.Error != "" {
			healing = true
			continue
		}
		newDisks = append(newDisks, disks[i])
	}

	return newDisks, healing
}

// Clean up previously deleted objects from .minio.sys/tmp/.trash/.
func (er erasureObjects) cleanupDeletedObjects(ctx context.Context) {
	// Run multiple cleanups local to this server.
	var wg sync.WaitGroup
	for _, disk := range er.getLoadBalancedLocalDisks() {
		if disk != nil {
			wg.Add(1)
			go func(disk StorageAPI) {
				defer wg.Done()
				diskPath := disk.Endpoint().Path
				readDirFn(pathJoin(diskPath, minioMetaTmpDeletedBucket), func(ddir string, typ os.FileMode) error {
					wait := er.deletedCleanupSleeper.Timer(ctx)
					removeAll(pathJoin(diskPath, minioMetaTmpDeletedBucket, ddir))
					wait()
					return nil
				})
			}(disk)
		}
	}
	wg.Wait()
}

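// The scanning pipeline: buckets are queued on a channel, one goroutine per
// online disk drains the channel and scans its buckets via NSScanner, and a
// single collector goroutine owns the aggregate cache, periodically saving it
// and pushing clones onto the updates channel.
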
// nsScanner will start scanning buckets and send updated totals as they are traversed.
// Updates are sent on a regular basis and the caller *must* consume them.
func (er erasureObjects) nsScanner(ctx context.Context, buckets []BucketInfo, bf *bloomFilter, updates chan<- dataUsageCache) error {
	if len(buckets) == 0 {
		return nil
	}

	// Collect disks we can use.
	disks, healing := er.getOnlineDisksWithHealing()
	if len(disks) == 0 {
		logger.Info(color.Green("data-scanner:") + " all disks are offline or being healed, skipping scanner")
		return nil
	}

	// Collect disks for healing.
	allDisks := er.getDisks()
	allDiskIDs := make([]string, 0, len(allDisks))
	for _, disk := range allDisks {
		if disk == OfflineDisk {
			// It's possible that the disk is OfflineDisk.
			continue
		}
		id, _ := disk.GetDiskID()
		if id == "" {
			// It's possible that the disk is unformatted
			// or just went offline.
			continue
		}
		allDiskIDs = append(allDiskIDs, id)
	}

	// Load bucket totals.
	oldCache := dataUsageCache{}
	if err := oldCache.load(ctx, er, dataUsageCacheName); err != nil {
		return err
	}

	// New cache.
	cache := dataUsageCache{
		Info: dataUsageCacheInfo{
			Name:      dataUsageRoot,
			NextCycle: oldCache.Info.NextCycle,
		},
		Cache: make(map[string]dataUsageEntry, len(oldCache.Cache)),
	}
	bloom := bf.bytes()

	// Put all buckets into channel.
	bucketCh := make(chan BucketInfo, len(buckets))
	// Add new buckets first.
	for _, b := range buckets {
		if oldCache.find(b.Name) == nil {
			bucketCh <- b
		}
	}

	// Add existing buckets.
	for _, b := range buckets {
		e := oldCache.find(b.Name)
		if e != nil {
			cache.replace(b.Name, dataUsageRoot, *e)
			bucketCh <- b
		}
	}

	close(bucketCh)
	bucketResults := make(chan dataUsageEntryInfo, len(disks))

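	// The collector goroutine below applies partial results arriving on
	// bucketResults to the cache, persists the cache on a jittered ticker
	// only when it has changed, and flushes the final state once
	// bucketResults is closed.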
	// Start async collector/saver.
	// This goroutine owns the cache.
	var saverWg sync.WaitGroup
	saverWg.Add(1)
	go func() {
		// Add jitter to the update time so multiple sets don't sync up.
		var updateTime = 30*time.Second + time.Duration(float64(10*time.Second)*rand.Float64())
		t := time.NewTicker(updateTime)
		defer t.Stop()
		defer saverWg.Done()
		var lastSave time.Time

		for {
			select {
			case <-ctx.Done():
				// Return without saving.
				return
			case <-t.C:
				if cache.Info.LastUpdate.Equal(lastSave) {
					continue
				}
				logger.LogIf(ctx, cache.save(ctx, er, dataUsageCacheName))
				updates <- cache.clone()
				lastSave = cache.Info.LastUpdate
			case v, ok := <-bucketResults:
				if !ok {
					// Save final state...
					cache.Info.NextCycle++
					cache.Info.LastUpdate = time.Now()
					logger.LogIf(ctx, cache.save(ctx, er, dataUsageCacheName))
					updates <- cache
					return
				}
				cache.replace(v.Name, v.Parent, v.Entry)
				cache.Info.LastUpdate = time.Now()
			}
		}
	}()

	// Shuffle disks so that the bucket/disk association is random across runs,
	// ensuring that objects not present on all disks are still accounted for
	// and have ILM applied.
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	r.Shuffle(len(disks), func(i, j int) { disks[i], disks[j] = disks[j], disks[i] })

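	// Each scanner goroutine below drains bucketCh: it loads (or resets) the
	// bucket's usage cache, hands it to disk.NSScanner, publishes the flattened
	// root entry on bucketResults, and saves the refreshed cache.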
	// Start one scanner per disk.
	var wg sync.WaitGroup
	wg.Add(len(disks))
	for i := range disks {
		go func(i int) {
			defer wg.Done()
			disk := disks[i]

			for bucket := range bucketCh {
				select {
				case <-ctx.Done():
					return
				default:
				}

				// Load cache for bucket.
				cacheName := pathJoin(bucket.Name, dataUsageCacheName)
				cache := dataUsageCache{}
				logger.LogIf(ctx, cache.load(ctx, er, cacheName))
				if cache.Info.Name == "" {
					cache.Info.Name = bucket.Name
				}
				cache.Info.BloomFilter = bloom
				cache.Info.SkipHealing = healing
				cache.Disks = allDiskIDs
				if cache.Info.Name != bucket.Name {
					logger.LogIf(ctx, fmt.Errorf("cache name mismatch: %s != %s", cache.Info.Name, bucket.Name))
					cache.Info = dataUsageCacheInfo{
						Name:       bucket.Name,
						LastUpdate: time.Time{},
						NextCycle:  0,
					}
				}

				// Calculate usage.
				before := cache.Info.LastUpdate
				var err error
				cache, err = disk.NSScanner(ctx, cache)
				cache.Info.BloomFilter = nil
				if err != nil {
					if !cache.Info.LastUpdate.IsZero() && cache.Info.LastUpdate.After(before) {
						logger.LogIf(ctx, cache.save(ctx, er, cacheName))
					} else {
						logger.LogIf(ctx, err)
					}
					continue
				}

				var root dataUsageEntry
				if r := cache.root(); r != nil {
					root = cache.flatten(*r)
				}
				t := time.Now()
				bucketResults <- dataUsageEntryInfo{
					Name:   cache.Info.Name,
					Parent: dataUsageRoot,
					Entry:  root,
				}
				// We want to avoid synchronizing all writes in case
				// the results pile up.
				time.Sleep(time.Duration(float64(time.Since(t)) * rand.Float64()))
				// Save the updated cache.
				logger.LogIf(ctx, cache.save(ctx, er, cacheName))
			}
		}(i)
	}
	wg.Wait()
	close(bucketResults)
	saverWg.Wait()

	return nil
}