storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/cmd/erasure-sets.go

     1  /*
     2   * MinIO Cloud Storage, (C) 2018-2019 MinIO, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package cmd
    18  
    19  import (
    20  	"context"
    21  	"encoding/binary"
    22  	"errors"
    23  	"fmt"
    24  	"hash/crc32"
    25  	"math/rand"
    26  	"net/http"
    27  	"sort"
    28  	"sync"
    29  	"time"
    30  
    31  	"github.com/dchest/siphash"
    32  	"github.com/dustin/go-humanize"
    33  	"github.com/google/uuid"
    34  
    35  	"github.com/minio/minio-go/v7/pkg/set"
    36  	"github.com/minio/minio-go/v7/pkg/tags"
    37  
    38  	"storj.io/minio/cmd/logger"
    39  	"storj.io/minio/pkg/bpool"
    40  	"storj.io/minio/pkg/console"
    41  	"storj.io/minio/pkg/dsync"
    42  	"storj.io/minio/pkg/env"
    43  	"storj.io/minio/pkg/madmin"
    44  	"storj.io/minio/pkg/sync/errgroup"
    45  )
    46  
     47  // setsDsyncLockers is an encapsulated type for Close()
    48  type setsDsyncLockers [][]dsync.NetLocker
    49  
    50  const envMinioDeleteCleanupInterval = "MINIO_DELETE_CLEANUP_INTERVAL"
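
// envMinioDeleteCleanupInterval is read in newErasureSets below via
// env.Get(envMinioDeleteCleanupInterval, "5m") and parsed with
// time.ParseDuration, so any Go duration string is accepted. A hypothetical
// override of the default 5 minute interval (illustrative only):
//
//	MINIO_DELETE_CLEANUP_INTERVAL=30m minio server /data{1...4}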
    51  
    52  // erasureSets implements ObjectLayer combining a static list of erasure coded
    53  // object sets. NOTE: There is no dynamic scaling allowed or intended in
    54  // current design.
    55  type erasureSets struct {
    56  	GatewayUnsupported
    57  
    58  	sets []*erasureObjects
    59  
    60  	// Reference format.
    61  	format *formatErasureV3
    62  
    63  	// erasureDisks mutex to lock erasureDisks.
    64  	erasureDisksMu sync.RWMutex
    65  
    66  	// Re-ordered list of disks per set.
    67  	erasureDisks [][]StorageAPI
    68  
    69  	// Distributed locker clients.
    70  	erasureLockers setsDsyncLockers
    71  
    72  	// Distributed lock owner (constant per running instance).
    73  	erasureLockOwner string
    74  
    75  	// List of endpoints provided on the command line.
    76  	endpoints Endpoints
    77  
    78  	// String version of all the endpoints, an optimization
    79  	// to avoid url.String() conversion taking CPU on
    80  	// large disk setups.
    81  	endpointStrings []string
    82  
    83  	// Total number of sets and the number of disks per set.
    84  	setCount, setDriveCount int
    85  	defaultParityCount      int
    86  
    87  	poolIndex int
    88  
    89  	// A channel to send the set index to the MRF when
    90  	// any disk belonging to that set is connected
    91  	setReconnectEvent chan int
    92  
    93  	// Distribution algorithm of choice.
    94  	distributionAlgo string
    95  	deploymentID     [16]byte
    96  
    97  	disksStorageInfoCache timedValue
    98  
    99  	mrfMU         sync.Mutex
   100  	mrfOperations map[healSource]int
   101  }
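
// The per-pool disk and endpoint slices above are flat and indexed as
// setIndex*setDriveCount + diskIndex, a convention used throughout this file
// (see GetEndpoints and connectDisks). A minimal illustrative sketch of that
// mapping, not part of the original code:
//
//	// flatIndex returns the slot for a given (set, disk) pair.
//	func flatIndex(setIndex, diskIndex, setDriveCount int) int {
//		return setIndex*setDriveCount + diskIndex
//	}
//
// For example, with 4 sets of 16 drives each, set 2 / drive 5 lands in slot
// 2*16+5 = 37 of endpointStrings.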
   102  
   103  func isEndpointConnected(diskMap map[string]StorageAPI, endpoint string) bool {
   104  	disk := diskMap[endpoint]
   105  	if disk == nil {
   106  		return false
   107  	}
   108  	return disk.IsOnline()
   109  }
   110  
   111  func (s *erasureSets) getDiskMap() map[string]StorageAPI {
   112  	diskMap := make(map[string]StorageAPI)
   113  
   114  	s.erasureDisksMu.RLock()
   115  	defer s.erasureDisksMu.RUnlock()
   116  
   117  	for i := 0; i < s.setCount; i++ {
   118  		for j := 0; j < s.setDriveCount; j++ {
   119  			disk := s.erasureDisks[i][j]
   120  			if disk == OfflineDisk {
   121  				continue
   122  			}
   123  			if !disk.IsOnline() {
   124  				continue
   125  			}
   126  			diskMap[disk.String()] = disk
   127  		}
   128  	}
   129  	return diskMap
   130  }
   131  
    132  // Initializes a new StorageAPI from the endpoint argument, returns the
    133  // StorageAPI and the `format` that exists on the disk.
   134  func connectEndpoint(endpoint Endpoint) (StorageAPI, *formatErasureV3, error) {
   135  	disk, err := newStorageAPIWithoutHealthCheck(endpoint)
   136  	if err != nil {
   137  		return nil, nil, err
   138  	}
   139  
   140  	format, err := loadFormatErasure(disk)
   141  	if err != nil {
   142  		if errors.Is(err, errUnformattedDisk) {
   143  			info, derr := disk.DiskInfo(context.TODO())
   144  			if derr != nil && info.RootDisk {
    145  				return nil, nil, fmt.Errorf("Disk: %s returned %w", disk, derr) // make sure to use '%w' to wrap the error
   146  			}
   147  		}
    148  		return nil, nil, fmt.Errorf("Disk: %s returned %w", disk, err) // make sure to use '%w' to wrap the error
   149  	}
   150  
   151  	return disk, format, nil
   152  }
   153  
    154  // findDiskIndexByDiskID - returns the i,j'th position of the input `diskID` against the reference
   155  // format, after successful validation.
   156  //   - i'th position is the set index
   157  //   - j'th position is the disk index in the current set
   158  func findDiskIndexByDiskID(refFormat *formatErasureV3, diskID string) (int, int, error) {
   159  	if diskID == offlineDiskUUID {
   160  		return -1, -1, fmt.Errorf("diskID: %s is offline", diskID)
   161  	}
   162  	for i := 0; i < len(refFormat.Erasure.Sets); i++ {
   163  		for j := 0; j < len(refFormat.Erasure.Sets[0]); j++ {
   164  			if refFormat.Erasure.Sets[i][j] == diskID {
   165  				return i, j, nil
   166  			}
   167  		}
   168  	}
   169  
   170  	return -1, -1, fmt.Errorf("diskID: %s not found", diskID)
   171  }
   172  
   173  // findDiskIndex - returns the i,j'th position of the input `format` against the reference
   174  // format, after successful validation.
   175  //   - i'th position is the set index
   176  //   - j'th position is the disk index in the current set
   177  func findDiskIndex(refFormat, format *formatErasureV3) (int, int, error) {
   178  	if err := formatErasureV3Check(refFormat, format); err != nil {
   179  		return 0, 0, err
   180  	}
   181  
   182  	if format.Erasure.This == offlineDiskUUID {
   183  		return -1, -1, fmt.Errorf("diskID: %s is offline", format.Erasure.This)
   184  	}
   185  
   186  	for i := 0; i < len(refFormat.Erasure.Sets); i++ {
   187  		for j := 0; j < len(refFormat.Erasure.Sets[0]); j++ {
   188  			if refFormat.Erasure.Sets[i][j] == format.Erasure.This {
   189  				return i, j, nil
   190  			}
   191  		}
   192  	}
   193  
   194  	return -1, -1, fmt.Errorf("diskID: %s not found", format.Erasure.This)
   195  }
   196  
    197  // connectDisks - attempts to connect all the endpoints, loads the format
    198  // and re-arranges the disks into their proper positions.
   199  func (s *erasureSets) connectDisks() {
   200  	var wg sync.WaitGroup
   201  	var setsJustConnected = make([]bool, s.setCount)
   202  	diskMap := s.getDiskMap()
   203  	for _, endpoint := range s.endpoints {
   204  		diskPath := endpoint.String()
   205  		if endpoint.IsLocal {
   206  			diskPath = endpoint.Path
   207  		}
   208  		if isEndpointConnected(diskMap, diskPath) {
   209  			continue
   210  		}
   211  		wg.Add(1)
   212  		go func(endpoint Endpoint) {
   213  			defer wg.Done()
   214  			disk, format, err := connectEndpoint(endpoint)
   215  			if err != nil {
   216  				if endpoint.IsLocal && errors.Is(err, errUnformattedDisk) {
   217  					globalBackgroundHealState.pushHealLocalDisks(endpoint)
   218  					logger.Info(fmt.Sprintf("Found unformatted drive %s, attempting to heal...", endpoint))
   219  				} else {
   220  					printEndpointError(endpoint, err, true)
   221  				}
   222  				return
   223  			}
   224  			if disk.IsLocal() && disk.Healing() != nil {
   225  				globalBackgroundHealState.pushHealLocalDisks(disk.Endpoint())
   226  				logger.Info(fmt.Sprintf("Found the drive %s that needs healing, attempting to heal...", disk))
   227  			}
   228  			s.erasureDisksMu.RLock()
   229  			setIndex, diskIndex, err := findDiskIndex(s.format, format)
   230  			s.erasureDisksMu.RUnlock()
   231  			if err != nil {
   232  				printEndpointError(endpoint, err, false)
   233  				return
   234  			}
   235  
   236  			s.erasureDisksMu.Lock()
   237  			if s.erasureDisks[setIndex][diskIndex] != nil {
   238  				s.erasureDisks[setIndex][diskIndex].Close()
   239  			}
   240  			if disk.IsLocal() {
   241  				disk.SetDiskID(format.Erasure.This)
   242  				s.erasureDisks[setIndex][diskIndex] = disk
   243  			} else {
   244  				// Enable healthcheck disk for remote endpoint.
   245  				disk, err = newStorageAPI(endpoint)
   246  				if err != nil {
   247  					printEndpointError(endpoint, err, false)
   248  					return
   249  				}
   250  				disk.SetDiskID(format.Erasure.This)
   251  				s.erasureDisks[setIndex][diskIndex] = disk
   252  			}
   253  			disk.SetDiskLoc(s.poolIndex, setIndex, diskIndex)
   254  			s.endpointStrings[setIndex*s.setDriveCount+diskIndex] = disk.String()
   255  			setsJustConnected[setIndex] = true
   256  			s.erasureDisksMu.Unlock()
   257  		}(endpoint)
   258  	}
   259  
   260  	wg.Wait()
   261  
   262  	go func() {
   263  		idler := time.NewTimer(100 * time.Millisecond)
   264  		defer idler.Stop()
   265  
   266  		for setIndex, justConnected := range setsJustConnected {
   267  			if !justConnected {
   268  				continue
   269  			}
   270  
   271  			// Send a new set connect event with a timeout
   272  			idler.Reset(100 * time.Millisecond)
   273  			select {
   274  			case s.setReconnectEvent <- setIndex:
   275  			case <-idler.C:
   276  			}
   277  		}
   278  	}()
   279  }
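
// The goroutine above reuses a single timer to bound how long each reconnect
// notification may block when no reader is draining setReconnectEvent. A
// minimal sketch of the same send-with-timeout pattern, assuming a
// hypothetical generic events channel (illustrative only):
//
//	func notifyWithTimeout(events chan<- int, setIndex int, d time.Duration) {
//		t := time.NewTimer(d)
//		defer t.Stop()
//		select {
//		case events <- setIndex: // delivered to the listener
//		case <-t.C: // no reader within d; drop the event
//		}
//	}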
   280  
    281  // monitorAndConnectEndpoints is a monitoring loop that keeps track of disconnected
    282  // endpoints, reconnects them and places them into the right position in the set
    283  // topology. This monitoring happens at the given monitoring interval.
   284  func (s *erasureSets) monitorAndConnectEndpoints(ctx context.Context, monitorInterval time.Duration) {
   285  	r := rand.New(rand.NewSource(time.Now().UnixNano()))
   286  
   287  	time.Sleep(time.Duration(r.Float64() * float64(time.Second)))
   288  
   289  	// Pre-emptively connect the disks if possible.
   290  	s.connectDisks()
   291  
   292  	monitor := time.NewTimer(monitorInterval)
   293  	defer monitor.Stop()
   294  
   295  	for {
   296  		select {
   297  		case <-ctx.Done():
   298  			return
   299  		case <-monitor.C:
   300  			// Reset the timer once fired for required interval.
   301  			monitor.Reset(monitorInterval)
   302  
   303  			if serverDebugLog {
   304  				console.Debugln("running disk monitoring")
   305  			}
   306  
   307  			s.connectDisks()
   308  		}
   309  	}
   310  }
   311  
   312  func (s *erasureSets) GetLockers(setIndex int) func() ([]dsync.NetLocker, string) {
   313  	return func() ([]dsync.NetLocker, string) {
   314  		lockers := make([]dsync.NetLocker, len(s.erasureLockers[setIndex]))
   315  		copy(lockers, s.erasureLockers[setIndex])
   316  		return lockers, s.erasureLockOwner
   317  	}
   318  }
   319  
   320  func (s *erasureSets) GetEndpoints(setIndex int) func() []string {
   321  	return func() []string {
   322  		s.erasureDisksMu.RLock()
   323  		defer s.erasureDisksMu.RUnlock()
   324  
   325  		eps := make([]string, s.setDriveCount)
   326  		for i := 0; i < s.setDriveCount; i++ {
   327  			eps[i] = s.endpointStrings[setIndex*s.setDriveCount+i]
   328  		}
   329  		return eps
   330  	}
   331  }
   332  
   333  // GetDisks returns a closure for a given set, which provides list of disks per set.
   334  func (s *erasureSets) GetDisks(setIndex int) func() []StorageAPI {
   335  	return func() []StorageAPI {
   336  		s.erasureDisksMu.RLock()
   337  		defer s.erasureDisksMu.RUnlock()
   338  		disks := make([]StorageAPI, s.setDriveCount)
   339  		copy(disks, s.erasureDisks[setIndex])
   340  		return disks
   341  	}
   342  }
   343  
   344  // defaultMonitorConnectEndpointInterval is the interval to monitor endpoint connections.
   345  // Must be bigger than defaultMonitorNewDiskInterval.
   346  const defaultMonitorConnectEndpointInterval = defaultMonitorNewDiskInterval + time.Second*5
   347  
   348  // Initialize new set of erasure coded sets.
   349  func newErasureSets(ctx context.Context, endpoints Endpoints, storageDisks []StorageAPI, format *formatErasureV3, defaultParityCount, poolIdx int) (*erasureSets, error) {
   350  	setCount := len(format.Erasure.Sets)
   351  	setDriveCount := len(format.Erasure.Sets[0])
   352  
   353  	endpointStrings := make([]string, len(endpoints))
   354  
   355  	// Initialize the erasure sets instance.
   356  	s := &erasureSets{
   357  		sets:               make([]*erasureObjects, setCount),
   358  		erasureDisks:       make([][]StorageAPI, setCount),
   359  		erasureLockers:     make([][]dsync.NetLocker, setCount),
   360  		erasureLockOwner:   globalLocalNodeName,
   361  		endpoints:          endpoints,
   362  		endpointStrings:    endpointStrings,
   363  		setCount:           setCount,
   364  		setDriveCount:      setDriveCount,
   365  		defaultParityCount: defaultParityCount,
   366  		format:             format,
   367  		setReconnectEvent:  make(chan int),
   368  		distributionAlgo:   format.Erasure.DistributionAlgo,
   369  		deploymentID:       uuid.MustParse(format.ID),
   370  		mrfOperations:      make(map[healSource]int),
   371  		poolIndex:          poolIdx,
   372  	}
   373  
   374  	mutex := newNSLock(globalIsDistErasure)
   375  
    376  	// Number of buffers, capped at 2 GiB of total buffer memory.
   377  	n := (2 * humanize.GiByte) / (blockSizeV2 * 2)
   378  
    379  	// Initialize the byte pool once for all sets; the pool holds up to n
    380  	// buffers of width blockSizeV2, each growable up to blockSizeV2*2 bytes.
   381  	bp := bpool.NewBytePoolCap(n, blockSizeV2, blockSizeV2*2)
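	// Worked example of the sizing above, assuming blockSizeV2 is 1 MiB:
	// n = 2 GiB / (2 * 1 MiB) = 1024 buffers of width blockSizeV2, each
	// growable up to blockSizeV2*2 = 2 MiB, i.e. at most ~2 GiB of pooled
	// buffer memory in total.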
   382  
   383  	for i := 0; i < setCount; i++ {
   384  		s.erasureDisks[i] = make([]StorageAPI, setDriveCount)
   385  	}
   386  
   387  	var erasureLockers = map[string]dsync.NetLocker{}
   388  	for _, endpoint := range endpoints {
   389  		if _, ok := erasureLockers[endpoint.Host]; !ok {
   390  			erasureLockers[endpoint.Host] = newLockAPI(endpoint)
   391  		}
   392  	}
   393  
   394  	for i := 0; i < setCount; i++ {
   395  		var lockerEpSet = set.NewStringSet()
   396  		for j := 0; j < setDriveCount; j++ {
   397  			endpoint := endpoints[i*setDriveCount+j]
    398  			// Only add one locker per endpoint host and per erasure set.
   399  			if locker, ok := erasureLockers[endpoint.Host]; ok && !lockerEpSet.Contains(endpoint.Host) {
   400  				lockerEpSet.Add(endpoint.Host)
   401  				s.erasureLockers[i] = append(s.erasureLockers[i], locker)
   402  			}
   403  			disk := storageDisks[i*setDriveCount+j]
   404  			if disk == nil {
   405  				continue
   406  			}
   407  			diskID, derr := disk.GetDiskID()
   408  			if derr != nil {
   409  				continue
   410  			}
   411  			m, n, err := findDiskIndexByDiskID(format, diskID)
   412  			if err != nil {
   413  				continue
   414  			}
   415  			disk.SetDiskLoc(s.poolIndex, m, n)
   416  			s.endpointStrings[m*setDriveCount+n] = disk.String()
   417  			s.erasureDisks[m][n] = disk
   418  		}
   419  
   420  		// Initialize erasure objects for a given set.
   421  		s.sets[i] = &erasureObjects{
   422  			setIndex:              i,
   423  			poolIndex:             poolIdx,
   424  			setDriveCount:         setDriveCount,
   425  			defaultParityCount:    defaultParityCount,
   426  			getDisks:              s.GetDisks(i),
   427  			getLockers:            s.GetLockers(i),
   428  			getEndpoints:          s.GetEndpoints(i),
   429  			deletedCleanupSleeper: newDynamicSleeper(10, 2*time.Second),
   430  			nsMutex:               mutex,
   431  			bp:                    bp,
   432  			mrfOpCh:               make(chan partialOperation, 10000),
   433  		}
   434  	}
   435  
    436  	// Clean up the ".trash/" folder every 5 minutes with sufficient sleep cycles; between
    437  	// deletes a dynamic sleeper is used with a factor of 10 ratio and the max delay between
    438  	// deletes capped at 2 seconds.
   439  	deletedObjectsCleanupInterval, err := time.ParseDuration(env.Get(envMinioDeleteCleanupInterval, "5m"))
   440  	if err != nil {
   441  		return nil, err
   442  	}
   443  
    444  	// Start the stale uploads cleanup go-routine.
   445  	go s.cleanupStaleUploads(ctx, GlobalStaleUploadsCleanupInterval, GlobalStaleUploadsExpiry)
   446  
   447  	// start cleanup of deleted objects.
   448  	go s.cleanupDeletedObjects(ctx, deletedObjectsCleanupInterval)
   449  
   450  	// Start the disk monitoring and connect routine.
   451  	go s.monitorAndConnectEndpoints(ctx, defaultMonitorConnectEndpointInterval)
   452  	go s.maintainMRFList()
   453  	go s.healMRFRoutine()
   454  
   455  	return s, nil
   456  }
   457  
   458  func (s *erasureSets) cleanupDeletedObjects(ctx context.Context, cleanupInterval time.Duration) {
   459  	timer := time.NewTimer(cleanupInterval)
   460  	defer timer.Stop()
   461  
   462  	for {
   463  		select {
   464  		case <-ctx.Done():
   465  			return
   466  		case <-timer.C:
   467  			// Reset for the next interval
   468  			timer.Reset(cleanupInterval)
   469  
   470  			for _, set := range s.sets {
   471  				set.cleanupDeletedObjects(ctx)
   472  			}
   473  		}
   474  	}
   475  }
   476  
   477  func (s *erasureSets) cleanupStaleUploads(ctx context.Context, cleanupInterval, expiry time.Duration) {
   478  	timer := time.NewTimer(cleanupInterval)
   479  	defer timer.Stop()
   480  
   481  	for {
   482  		select {
   483  		case <-ctx.Done():
   484  			return
   485  		case <-timer.C:
   486  			// Reset for the next interval
   487  			timer.Reset(cleanupInterval)
   488  
   489  			for _, set := range s.sets {
   490  				set.cleanupStaleUploads(ctx, expiry)
   491  			}
   492  		}
   493  	}
   494  }
   495  
   496  const objectErasureMapKey = "objectErasureMap"
   497  
   498  type auditObjectOp struct {
   499  	Pool  int      `json:"poolId"`
   500  	Set   int      `json:"setId"`
   501  	Disks []string `json:"disks"`
   502  }
   503  
   504  func auditObjectErasureSet(ctx context.Context, object string, set *erasureObjects) {
   505  	if len(logger.AuditTargets) == 0 {
   506  		return
   507  	}
   508  
   509  	object = decodeDirObject(object)
   510  
   511  	op := auditObjectOp{
   512  		Pool:  set.poolIndex + 1,
   513  		Set:   set.setIndex + 1,
   514  		Disks: set.getEndpoints(),
   515  	}
   516  
   517  	var objectErasureSetTag map[string]auditObjectOp
   518  	reqInfo := logger.GetReqInfo(ctx)
   519  	for _, kv := range reqInfo.GetTags() {
   520  		if kv.Key == objectErasureMapKey {
   521  			objectErasureSetTag = kv.Val.(map[string]auditObjectOp)
   522  			break
   523  		}
   524  	}
   525  
   526  	if objectErasureSetTag == nil {
   527  		objectErasureSetTag = make(map[string]auditObjectOp)
   528  	}
   529  
   530  	objectErasureSetTag[object] = op
   531  	reqInfo.SetTags(objectErasureMapKey, objectErasureSetTag)
   532  }
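
// The audit entry for a request therefore carries, per object, the 1-based
// pool and set indices plus the endpoints of the set that served it, keyed
// under "objectErasureMap". A sketch of the resulting tag value, with
// hypothetical endpoints (illustrative only):
//
//	"objectErasureMap": {
//		"bucket/object.txt": {
//			"poolId": 1,
//			"setId": 3,
//			"disks": ["http://node1:9000/data1", "http://node2:9000/data1"]
//		}
//	}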
   533  
   534  // NewNSLock - initialize a new namespace RWLocker instance.
   535  func (s *erasureSets) NewNSLock(bucket string, objects ...string) RWLocker {
   536  	if len(objects) == 1 {
   537  		return s.getHashedSet(objects[0]).NewNSLock(bucket, objects...)
   538  	}
   539  	return s.getHashedSet("").NewNSLock(bucket, objects...)
   540  }
   541  
    542  // SetDriveCount returns the current number of drives per set.
   543  func (s *erasureSets) SetDriveCount() int {
   544  	return s.setDriveCount
   545  }
   546  
   547  // ParityCount returns the default parity count used while erasure
   548  // coding objects
   549  func (s *erasureSets) ParityCount() int {
   550  	return s.defaultParityCount
   551  }
   552  
    553  // StorageUsageInfo - combines the output of StorageInfo across all erasure coded object sets.
    554  // This only returns disk usage info for ServerPools to perform placement decisions; this call
    555  // is not implemented in the Object interface and is not meant to be used by other object
    556  // layer implementations.
   557  func (s *erasureSets) StorageUsageInfo(ctx context.Context) StorageInfo {
   558  	storageUsageInfo := func() StorageInfo {
   559  		var storageInfo StorageInfo
   560  		storageInfos := make([]StorageInfo, len(s.sets))
   561  		storageInfo.Backend.Type = madmin.Erasure
   562  
   563  		g := errgroup.WithNErrs(len(s.sets))
   564  		for index := range s.sets {
   565  			index := index
   566  			g.Go(func() error {
   567  				// ignoring errors on purpose
   568  				storageInfos[index], _ = s.sets[index].StorageInfo(ctx)
   569  				return nil
   570  			}, index)
   571  		}
   572  
   573  		// Wait for the go routines.
   574  		g.Wait()
   575  
   576  		for _, lstorageInfo := range storageInfos {
   577  			storageInfo.Disks = append(storageInfo.Disks, lstorageInfo.Disks...)
   578  		}
   579  
   580  		return storageInfo
   581  	}
   582  
   583  	s.disksStorageInfoCache.Once.Do(func() {
   584  		s.disksStorageInfoCache.TTL = time.Second
   585  		s.disksStorageInfoCache.Update = func() (interface{}, error) {
   586  			return storageUsageInfo(), nil
   587  		}
   588  	})
   589  
   590  	v, _ := s.disksStorageInfoCache.Get()
   591  	return v.(StorageInfo)
   592  }
   593  
   594  // StorageInfo - combines output of StorageInfo across all erasure coded object sets.
   595  func (s *erasureSets) StorageInfo(ctx context.Context) (StorageInfo, []error) {
   596  	var storageInfo madmin.StorageInfo
   597  
   598  	storageInfos := make([]madmin.StorageInfo, len(s.sets))
   599  	storageInfoErrs := make([][]error, len(s.sets))
   600  
   601  	g := errgroup.WithNErrs(len(s.sets))
   602  	for index := range s.sets {
   603  		index := index
   604  		g.Go(func() error {
   605  			storageInfos[index], storageInfoErrs[index] = s.sets[index].StorageInfo(ctx)
   606  			return nil
   607  		}, index)
   608  	}
   609  
   610  	// Wait for the go routines.
   611  	g.Wait()
   612  
   613  	for _, lstorageInfo := range storageInfos {
   614  		storageInfo.Disks = append(storageInfo.Disks, lstorageInfo.Disks...)
   615  	}
   616  
   617  	errs := make([]error, 0, len(s.sets)*s.setDriveCount)
   618  	for i := range s.sets {
   619  		errs = append(errs, storageInfoErrs[i]...)
   620  	}
   621  
   622  	return storageInfo, errs
   623  }
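
// The errgroup.WithNErrs / g.Go(fn, index) / g.Wait() pattern used above (and
// in several methods below) fans out one goroutine per set and collects
// per-set errors by index. A minimal sketch under the same assumptions, with
// a hypothetical perSet helper:
//
//	g := errgroup.WithNErrs(len(s.sets))
//	for index := range s.sets {
//		index := index // capture the loop variable for the closure
//		g.Go(func() error { return perSet(s.sets[index]) }, index)
//	}
//	errs := g.Wait() // errs[i] is the error from s.sets[i]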
   624  
    625  // LocalStorageInfo - combines the output of LocalStorageInfo across all erasure coded object sets.
   626  func (s *erasureSets) LocalStorageInfo(ctx context.Context) (StorageInfo, []error) {
   627  	var storageInfo StorageInfo
   628  
   629  	storageInfos := make([]StorageInfo, len(s.sets))
   630  	storageInfoErrs := make([][]error, len(s.sets))
   631  
   632  	g := errgroup.WithNErrs(len(s.sets))
   633  	for index := range s.sets {
   634  		index := index
   635  		g.Go(func() error {
   636  			storageInfos[index], storageInfoErrs[index] = s.sets[index].LocalStorageInfo(ctx)
   637  			return nil
   638  		}, index)
   639  	}
   640  
   641  	// Wait for the go routines.
   642  	g.Wait()
   643  
   644  	for _, lstorageInfo := range storageInfos {
   645  		storageInfo.Disks = append(storageInfo.Disks, lstorageInfo.Disks...)
   646  	}
   647  
   648  	var errs []error
   649  	for i := range s.sets {
   650  		errs = append(errs, storageInfoErrs[i]...)
   651  	}
   652  
   653  	return storageInfo, errs
   654  }
   655  
    656  // Shutdown shuts down all erasure coded sets in parallel and
    657  // returns the first error encountered.
   658  func (s *erasureSets) Shutdown(ctx context.Context) error {
   659  	g := errgroup.WithNErrs(len(s.sets))
   660  
   661  	for index := range s.sets {
   662  		index := index
   663  		g.Go(func() error {
   664  			return s.sets[index].Shutdown(ctx)
   665  		}, index)
   666  	}
   667  
   668  	for _, err := range g.Wait() {
   669  		if err != nil {
   670  			return err
   671  		}
   672  	}
   673  	select {
   674  	case _, ok := <-s.setReconnectEvent:
   675  		if ok {
   676  			close(s.setReconnectEvent)
   677  		}
   678  	default:
   679  		close(s.setReconnectEvent)
   680  	}
   681  	return nil
   682  }
   683  
    684  // MakeBucketWithLocation - creates a new bucket across all sets simultaneously,
    685  // then returns the first encountered error.
   686  func (s *erasureSets) MakeBucketWithLocation(ctx context.Context, bucket string, opts BucketOptions) error {
   687  	g := errgroup.WithNErrs(len(s.sets))
   688  
   689  	// Create buckets in parallel across all sets.
   690  	for index := range s.sets {
   691  		index := index
   692  		g.Go(func() error {
   693  			return s.sets[index].MakeBucketWithLocation(ctx, bucket, opts)
   694  		}, index)
   695  	}
   696  
   697  	errs := g.Wait()
   698  
   699  	// Return the first encountered error
   700  	for _, err := range errs {
   701  		if err != nil {
   702  			return err
   703  		}
   704  	}
   705  
   706  	// Success.
   707  	return nil
   708  }
   709  
    710  // sipHashMod hashes the key with SipHash keyed by the deployment ID and
    711  // returns an index in [0, cardinality). Algorithm dispatch between
    712  // CRCMOD (the v1 distribution algo) and SIPMOD (v2/v3) happens in
    713  // hashKey below; unknown algorithms and non-positive cardinality
    714  // yield -1.
   715  func sipHashMod(key string, cardinality int, id [16]byte) int {
   716  	if cardinality <= 0 {
   717  		return -1
   718  	}
   719  	// use the faster version as per siphash docs
   720  	// https://github.com/dchest/siphash#usage
   721  	k0, k1 := binary.LittleEndian.Uint64(id[0:8]), binary.LittleEndian.Uint64(id[8:16])
   722  	sum64 := siphash.Hash(k0, k1, []byte(key))
   723  	return int(sum64 % uint64(cardinality))
   724  }
   725  
   726  func crcHashMod(key string, cardinality int) int {
   727  	if cardinality <= 0 {
   728  		return -1
   729  	}
   730  	keyCrc := crc32.Checksum([]byte(key), crc32.IEEETable)
   731  	return int(keyCrc % uint32(cardinality))
   732  }
   733  
   734  func hashKey(algo string, key string, cardinality int, id [16]byte) int {
   735  	switch algo {
   736  	case formatErasureVersionV2DistributionAlgoV1:
   737  		return crcHashMod(key, cardinality)
   738  	case formatErasureVersionV3DistributionAlgoV2, formatErasureVersionV3DistributionAlgoV3:
   739  		return sipHashMod(key, cardinality, id)
   740  	default:
    741  		// Unknown algorithms return -1; the same applies when cardinality is not positive.
   742  		return -1
   743  	}
   744  }
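
// A quick illustration of how an object key is mapped to a set, assuming a
// deployment with 16 sets using the v3 (SipHash) distribution algorithm; the
// key and deployment ID below are hypothetical:
//
//	var depID [16]byte // in practice: uuid.MustParse(format.ID)
//	setIdx := hashKey(formatErasureVersionV3DistributionAlgoV3, "photos/2021/beach.jpg", 16, depID)
//	// setIdx is deterministic for the same key, cardinality and deployment
//	// ID, so a given object always hashes to the same erasure set.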
   745  
    746  // Always returns the same erasure coded set index for a given input.
   747  func (s *erasureSets) getHashedSetIndex(input string) int {
   748  	return hashKey(s.distributionAlgo, input, len(s.sets), s.deploymentID)
   749  }
   750  
    751  // Always returns the same erasure coded set for a given input.
   752  func (s *erasureSets) getHashedSet(input string) (set *erasureObjects) {
   753  	return s.sets[s.getHashedSetIndex(input)]
   754  }
   755  
   756  // GetBucketInfo - returns bucket info from one of the erasure coded set.
   757  func (s *erasureSets) GetBucketInfo(ctx context.Context, bucket string) (bucketInfo BucketInfo, err error) {
   758  	return s.getHashedSet("").GetBucketInfo(ctx, bucket)
   759  }
   760  
   761  // IsNotificationSupported returns whether bucket notification is applicable for this layer.
   762  func (s *erasureSets) IsNotificationSupported() bool {
   763  	return s.getHashedSet("").IsNotificationSupported()
   764  }
   765  
   766  // IsListenSupported returns whether listen bucket notification is applicable for this layer.
   767  func (s *erasureSets) IsListenSupported() bool {
   768  	return true
   769  }
   770  
   771  // IsEncryptionSupported returns whether server side encryption is implemented for this layer.
   772  func (s *erasureSets) IsEncryptionSupported() bool {
   773  	return s.getHashedSet("").IsEncryptionSupported()
   774  }
   775  
   776  // IsCompressionSupported returns whether compression is applicable for this layer.
   777  func (s *erasureSets) IsCompressionSupported() bool {
   778  	return s.getHashedSet("").IsCompressionSupported()
   779  }
   780  
   781  func (s *erasureSets) IsTaggingSupported() bool {
   782  	return true
   783  }
   784  
    785  // DeleteBucket - deletes a bucket on all sets simultaneously;
    786  // if any of the sets fails to delete the bucket, we undo the
    787  // delete on the sets where it succeeded.
   788  func (s *erasureSets) DeleteBucket(ctx context.Context, bucket string, forceDelete bool) error {
   789  	g := errgroup.WithNErrs(len(s.sets))
   790  
   791  	// Delete buckets in parallel across all sets.
   792  	for index := range s.sets {
   793  		index := index
   794  		g.Go(func() error {
   795  			return s.sets[index].DeleteBucket(ctx, bucket, forceDelete)
   796  		}, index)
   797  	}
   798  
   799  	errs := g.Wait()
    800  	// For any failure, we attempt to undo the delete bucket operation
    801  	// by re-creating the bucket on all sets where the delete succeeded.
   802  	for _, err := range errs {
   803  		if err != nil {
   804  			undoDeleteBucketSets(ctx, bucket, s.sets, errs)
   805  			return err
   806  		}
   807  	}
   808  
   809  	// Delete all bucket metadata.
   810  	deleteBucketMetadata(ctx, s, bucket)
   811  
   812  	// Success.
   813  	return nil
   814  }
   815  
   816  // This function is used to undo a successful DeleteBucket operation.
   817  func undoDeleteBucketSets(ctx context.Context, bucket string, sets []*erasureObjects, errs []error) {
   818  	g := errgroup.WithNErrs(len(sets))
   819  
   820  	// Undo previous delete bucket on all underlying sets.
   821  	for index := range sets {
   822  		index := index
   823  		g.Go(func() error {
   824  			if errs[index] == nil {
   825  				return sets[index].MakeBucketWithLocation(ctx, bucket, BucketOptions{})
   826  			}
   827  			return nil
   828  		}, index)
   829  	}
   830  
   831  	g.Wait()
   832  }
   833  
    834  // List all buckets from all sets and deduplicate them; we are not doing a
    835  // merge sort here just for simplification. As per design it is assumed
    836  // that all buckets are present on all sets.
   837  func (s *erasureSets) ListBuckets(ctx context.Context) (buckets []BucketInfo, err error) {
   838  	var listBuckets []BucketInfo
   839  	var healBuckets = map[string]VolInfo{}
   840  	for _, set := range s.sets {
   841  		// lists all unique buckets across drives.
   842  		if err := listAllBuckets(ctx, set.getDisks(), healBuckets); err != nil {
   843  			return nil, err
   844  		}
   845  	}
   846  
   847  	for _, v := range healBuckets {
   848  		listBuckets = append(listBuckets, BucketInfo(v))
   849  	}
   850  
   851  	sort.Slice(listBuckets, func(i, j int) bool {
   852  		return listBuckets[i].Name < listBuckets[j].Name
   853  	})
   854  
   855  	return listBuckets, nil
   856  }
   857  
   858  // --- Object Operations ---
   859  
   860  // GetObjectNInfo - returns object info and locked object ReadCloser
   861  func (s *erasureSets) GetObjectNInfo(ctx context.Context, bucket, object string, rs *HTTPRangeSpec, h http.Header, lockType LockType, opts ObjectOptions) (gr *GetObjectReader, err error) {
   862  	set := s.getHashedSet(object)
   863  	auditObjectErasureSet(ctx, object, set)
   864  	return set.GetObjectNInfo(ctx, bucket, object, rs, h, lockType, opts)
   865  }
   866  
   867  func (s *erasureSets) parentDirIsObject(ctx context.Context, bucket, parent string) bool {
   868  	if parent == "." {
   869  		return false
   870  	}
   871  	return s.getHashedSet(parent).parentDirIsObject(ctx, bucket, parent)
   872  }
   873  
   874  // PutObject - writes an object to hashedSet based on the object name.
   875  func (s *erasureSets) PutObject(ctx context.Context, bucket string, object string, data *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) {
   876  	set := s.getHashedSet(object)
   877  	auditObjectErasureSet(ctx, object, set)
   878  	opts.ParentIsObject = s.parentDirIsObject
   879  	return set.PutObject(ctx, bucket, object, data, opts)
   880  }
   881  
   882  // GetObjectInfo - reads object metadata from the hashedSet based on the object name.
   883  func (s *erasureSets) GetObjectInfo(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) {
   884  	set := s.getHashedSet(object)
   885  	auditObjectErasureSet(ctx, object, set)
   886  	return set.GetObjectInfo(ctx, bucket, object, opts)
   887  }
   888  
   889  // DeleteObject - deletes an object from the hashedSet based on the object name.
   890  func (s *erasureSets) DeleteObject(ctx context.Context, bucket string, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) {
   891  	set := s.getHashedSet(object)
   892  	auditObjectErasureSet(ctx, object, set)
   893  	return set.DeleteObject(ctx, bucket, object, opts)
   894  }
   895  
    896  // DeleteObjects - bulk delete of objects.
    897  // Bulk delete is only possible within one set. For that purpose
    898  // objects are grouped by set first, and then bulk delete is invoked
    899  // for each set; the error response of each delete is returned.
   900  func (s *erasureSets) DeleteObjects(ctx context.Context, bucket string, objects []ObjectToDelete, opts ObjectOptions) ([]DeletedObject, []error) {
   901  	type delObj struct {
   902  		// Set index associated to this object
   903  		setIndex int
   904  		// Original index from the list of arguments
   905  		// where this object is passed
   906  		origIndex int
   907  		// object to delete
   908  		object ObjectToDelete
   909  	}
   910  
   911  	// Transform []delObj to the list of object names
   912  	toNames := func(delObjs []delObj) []ObjectToDelete {
   913  		objs := make([]ObjectToDelete, len(delObjs))
   914  		for i, obj := range delObjs {
   915  			objs[i] = obj.object
   916  		}
   917  		return objs
   918  	}
   919  
   920  	// The result of delete operation on all passed objects
   921  	var delErrs = make([]error, len(objects))
   922  
   923  	// The result of delete objects
   924  	var delObjects = make([]DeletedObject, len(objects))
   925  
   926  	// A map between a set and its associated objects
   927  	var objSetMap = make(map[int][]delObj)
   928  
   929  	// Group objects by set index
   930  	for i, object := range objects {
   931  		index := s.getHashedSetIndex(object.ObjectName)
   932  		objSetMap[index] = append(objSetMap[index], delObj{setIndex: index, origIndex: i, object: object})
   933  	}
   934  
   935  	// Invoke bulk delete on objects per set and save
   936  	// the result of the delete operation
   937  	for _, objsGroup := range objSetMap {
   938  		set := s.getHashedSet(objsGroup[0].object.ObjectName)
   939  		dobjects, errs := set.DeleteObjects(ctx, bucket, toNames(objsGroup), opts)
   940  		for i, obj := range objsGroup {
   941  			delErrs[obj.origIndex] = errs[i]
   942  			delObjects[obj.origIndex] = dobjects[i]
   943  			if errs[i] == nil {
   944  				auditObjectErasureSet(ctx, obj.object.ObjectName, set)
   945  			}
   946  		}
   947  	}
   948  
   949  	return delObjects, delErrs
   950  }
   951  
   952  // CopyObject - copies objects from one hashedSet to another hashedSet, on server side.
   953  func (s *erasureSets) CopyObject(ctx context.Context, srcBucket, srcObject, dstBucket, dstObject string, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (objInfo ObjectInfo, err error) {
   954  	srcSet := s.getHashedSet(srcObject)
   955  	dstSet := s.getHashedSet(dstObject)
   956  
   957  	auditObjectErasureSet(ctx, dstObject, dstSet)
   958  
   959  	cpSrcDstSame := srcSet == dstSet
   960  	// Check if this request is only metadata update.
   961  	if cpSrcDstSame && srcInfo.metadataOnly {
   962  		// Version ID is set for the destination and source == destination version ID.
   963  		// perform an in-place update.
   964  		if dstOpts.VersionID != "" && srcOpts.VersionID == dstOpts.VersionID {
   965  			return srcSet.CopyObject(ctx, srcBucket, srcObject, dstBucket, dstObject, srcInfo, srcOpts, dstOpts)
   966  		}
   967  		// Destination is not versioned and source version ID is empty
   968  		// perform an in-place update.
   969  		if !dstOpts.Versioned && srcOpts.VersionID == "" {
   970  			return srcSet.CopyObject(ctx, srcBucket, srcObject, dstBucket, dstObject, srcInfo, srcOpts, dstOpts)
   971  		}
    972  		// CopyObject optimization where we don't create an entire copy
    973  		// of the content; instead we add a reference. We disallow legacy
    974  		// objects from being self-referenced in this manner, so make sure
    975  		// that we actually create a new dataDir for legacy objects.
   976  		if dstOpts.Versioned && srcOpts.VersionID != dstOpts.VersionID && !srcInfo.Legacy {
   977  			srcInfo.versionOnly = true
   978  			return srcSet.CopyObject(ctx, srcBucket, srcObject, dstBucket, dstObject, srcInfo, srcOpts, dstOpts)
   979  		}
   980  	}
   981  
   982  	putOpts := ObjectOptions{
   983  		ServerSideEncryption: dstOpts.ServerSideEncryption,
   984  		UserDefined:          srcInfo.UserDefined,
   985  		Versioned:            dstOpts.Versioned,
   986  		VersionID:            dstOpts.VersionID,
   987  		MTime:                dstOpts.MTime,
   988  	}
   989  
   990  	return dstSet.putObject(ctx, dstBucket, dstObject, srcInfo.PutObjReader, putOpts)
   991  }
   992  
   993  func (s *erasureSets) ListMultipartUploads(ctx context.Context, bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (result ListMultipartsInfo, err error) {
    994  	// In ListMultipartUploads we treat the input prefix as the object;
    995  	// this means that we do not support directory navigation.
   996  	set := s.getHashedSet(prefix)
   997  	auditObjectErasureSet(ctx, prefix, set)
   998  	return set.ListMultipartUploads(ctx, bucket, prefix, keyMarker, uploadIDMarker, delimiter, maxUploads)
   999  }
  1000  
  1001  // Initiate a new multipart upload on a hashedSet based on object name.
  1002  func (s *erasureSets) NewMultipartUpload(ctx context.Context, bucket, object string, opts ObjectOptions) (uploadID string, err error) {
  1003  	set := s.getHashedSet(object)
  1004  	auditObjectErasureSet(ctx, object, set)
  1005  	return set.NewMultipartUpload(ctx, bucket, object, opts)
  1006  }
  1007  
  1008  // Copies a part of an object from source hashedSet to destination hashedSet.
  1009  func (s *erasureSets) CopyObjectPart(ctx context.Context, srcBucket, srcObject, destBucket, destObject string, uploadID string, partID int,
  1010  	startOffset int64, length int64, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (partInfo PartInfo, err error) {
  1011  	destSet := s.getHashedSet(destObject)
  1012  	auditObjectErasureSet(ctx, destObject, destSet)
  1013  	return destSet.PutObjectPart(ctx, destBucket, destObject, uploadID, partID, NewPutObjReader(srcInfo.Reader), dstOpts)
  1014  }
  1015  
  1016  // PutObjectPart - writes part of an object to hashedSet based on the object name.
  1017  func (s *erasureSets) PutObjectPart(ctx context.Context, bucket, object, uploadID string, partID int, data *PutObjReader, opts ObjectOptions) (info PartInfo, err error) {
  1018  	set := s.getHashedSet(object)
  1019  	auditObjectErasureSet(ctx, object, set)
  1020  	return set.PutObjectPart(ctx, bucket, object, uploadID, partID, data, opts)
  1021  }
  1022  
  1023  // GetMultipartInfo - return multipart metadata info uploaded at hashedSet.
  1024  func (s *erasureSets) GetMultipartInfo(ctx context.Context, bucket, object, uploadID string, opts ObjectOptions) (result MultipartInfo, err error) {
  1025  	set := s.getHashedSet(object)
  1026  	auditObjectErasureSet(ctx, object, set)
  1027  	return set.GetMultipartInfo(ctx, bucket, object, uploadID, opts)
  1028  }
  1029  
  1030  // ListObjectParts - lists all uploaded parts to an object in hashedSet.
  1031  func (s *erasureSets) ListObjectParts(ctx context.Context, bucket, object, uploadID string, partNumberMarker int, maxParts int, opts ObjectOptions) (result ListPartsInfo, err error) {
  1032  	set := s.getHashedSet(object)
  1033  	auditObjectErasureSet(ctx, object, set)
  1034  	return set.ListObjectParts(ctx, bucket, object, uploadID, partNumberMarker, maxParts, opts)
  1035  }
  1036  
  1037  // Aborts an in-progress multipart operation on hashedSet based on the object name.
  1038  func (s *erasureSets) AbortMultipartUpload(ctx context.Context, bucket, object, uploadID string, opts ObjectOptions) error {
  1039  	set := s.getHashedSet(object)
  1040  	auditObjectErasureSet(ctx, object, set)
  1041  	return set.AbortMultipartUpload(ctx, bucket, object, uploadID, opts)
  1042  }
  1043  
  1044  // CompleteMultipartUpload - completes a pending multipart transaction, on hashedSet based on object name.
  1045  func (s *erasureSets) CompleteMultipartUpload(ctx context.Context, bucket, object, uploadID string, uploadedParts []CompletePart, opts ObjectOptions) (objInfo ObjectInfo, err error) {
  1046  	set := s.getHashedSet(object)
  1047  	auditObjectErasureSet(ctx, object, set)
  1048  	opts.ParentIsObject = s.parentDirIsObject
  1049  	return set.CompleteMultipartUpload(ctx, bucket, object, uploadID, uploadedParts, opts)
  1050  }
  1051  
  1052  /*
  1053  
  1054  All disks online
  1055  -----------------
  1056  - All Unformatted - format all and return success.
  1057  - Some Unformatted - format all and return success.
  1058  - Any JBOD inconsistent - return failure
  1059  - Some are corrupt (missing format.json) - return failure
  1060  - Any unrecognized disks - return failure
  1061  
  1062  Some disks are offline and we have quorum.
  1063  -----------------
  1064  - Some unformatted - format all and return success,
  1065    treat disks offline as corrupted.
  1066  - Any JBOD inconsistent - return failure
  1067  - Some are corrupt (missing format.json)
  1068  - Any unrecognized disks - return failure
  1069  
  1070  No read quorum
  1071  -----------------
  1072  failure for all cases.
  1073  
  1074  // Pseudo code for managing `format.json`.
  1075  
  1076  // Generic checks.
  1077  if (no quorum) return error
  1078  if (any disk is corrupt) return error // Always error
  1079  if (jbod inconsistent) return error // Always error.
  1080  if (disks not recognized) // Always error.
  1081  
  1082  // Specific checks.
  1083  if (all disks online)
  1084    if (all disks return format.json)
  1085       if (jbod consistent)
  1086          if (all disks recognized)
  1087            return
  1088    else
  1089       if (all disks return format.json not found)
  1090          return error
  1091       else (some disks return format.json not found)
  1092          (heal format)
  1093          return
  1094       fi
  1095     fi
  1096  else
  1097     if (some disks return format.json not found)
  1098          // Offline disks are marked as dead.
  1099          (heal format) // Offline disks should be marked as dead.
  1100          return success
  1101     fi
  1102  fi
  1103  */
  1104  
  1105  func formatsToDrivesInfo(endpoints Endpoints, formats []*formatErasureV3, sErrs []error) (beforeDrives []madmin.HealDriveInfo) {
  1106  	beforeDrives = make([]madmin.HealDriveInfo, len(endpoints))
   1107  	// Existing formats are available (i.e. ok), so save them in the
   1108  	// result; also populate the disks to be healed.
  1109  	for i, format := range formats {
  1110  		drive := endpoints.GetString(i)
  1111  		var state = madmin.DriveStateCorrupt
  1112  		switch {
  1113  		case format != nil:
  1114  			state = madmin.DriveStateOk
  1115  		case sErrs[i] == errUnformattedDisk:
  1116  			state = madmin.DriveStateMissing
  1117  		case sErrs[i] == errDiskNotFound:
  1118  			state = madmin.DriveStateOffline
  1119  		}
  1120  		beforeDrives[i] = madmin.HealDriveInfo{
  1121  			UUID: func() string {
  1122  				if format != nil {
  1123  					return format.Erasure.This
  1124  				}
  1125  				return ""
  1126  			}(),
  1127  			Endpoint: drive,
  1128  			State:    state,
  1129  		}
  1130  	}
  1131  
  1132  	return beforeDrives
  1133  }
  1134  
  1135  // If it is a single node Erasure and all disks are root disks, it is most likely a test setup, else it is a production setup.
  1136  // On a test setup we allow creation of format.json on root disks to help with dev/testing.
  1137  func isTestSetup(infos []DiskInfo, errs []error) bool {
  1138  	rootDiskCount := 0
  1139  	for i := range errs {
  1140  		if errs[i] == nil || errs[i] == errUnformattedDisk {
  1141  			if infos[i].RootDisk {
  1142  				rootDiskCount++
  1143  			}
  1144  		}
  1145  	}
   1146  	// It is a test setup if a quorum of the disks are root disks.
  1147  	return rootDiskCount >= len(infos)/2+1
  1148  }
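
// For example, with 4 drives the threshold is 4/2+1 = 3: the deployment is
// treated as a test setup only when at least 3 of the 4 drives (among those
// that responded without error) are root disks.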
  1149  
  1150  func getHealDiskInfos(storageDisks []StorageAPI, errs []error) ([]DiskInfo, []error) {
  1151  	infos := make([]DiskInfo, len(storageDisks))
  1152  	g := errgroup.WithNErrs(len(storageDisks))
  1153  	for index := range storageDisks {
  1154  		index := index
  1155  		g.Go(func() error {
  1156  			if errs[index] != nil && errs[index] != errUnformattedDisk {
  1157  				return errs[index]
  1158  			}
  1159  			if storageDisks[index] == nil {
  1160  				return errDiskNotFound
  1161  			}
  1162  			var err error
  1163  			infos[index], err = storageDisks[index].DiskInfo(context.TODO())
  1164  			return err
  1165  		}, index)
  1166  	}
  1167  	return infos, g.Wait()
  1168  }
  1169  
  1170  // Mark root disks as down so as not to heal them.
  1171  func markRootDisksAsDown(storageDisks []StorageAPI, errs []error) {
  1172  	var infos []DiskInfo
  1173  	infos, errs = getHealDiskInfos(storageDisks, errs)
  1174  	if !isTestSetup(infos, errs) {
  1175  		for i := range storageDisks {
  1176  			if storageDisks[i] != nil && infos[i].RootDisk {
   1177  				// We should not heal on a root disk, i.e. in a situation where the minio administrator has unmounted a
   1178  				// defective drive we should not heal a path on the root disk.
   1179  				logger.Info("Disk `%s` is the same as the system root disk.\n"+
  1180  					"Disk will not be used. Please supply a separate disk and restart the server.",
  1181  					storageDisks[i].String())
  1182  				storageDisks[i] = nil
  1183  			}
  1184  		}
  1185  	}
  1186  }
  1187  
  1188  // HealFormat - heals missing `format.json` on fresh unformatted disks.
  1189  func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.HealResultItem, err error) {
  1190  	storageDisks, errs := initStorageDisksWithErrorsWithoutHealthCheck(s.endpoints)
  1191  	for i, derr := range errs {
  1192  		if derr != nil && derr != errDiskNotFound {
  1193  			return madmin.HealResultItem{}, fmt.Errorf("Disk %s: %w", s.endpoints[i], derr)
  1194  		}
  1195  	}
  1196  
  1197  	defer func(storageDisks []StorageAPI) {
  1198  		if err != nil {
  1199  			closeStorageDisks(storageDisks)
  1200  		}
  1201  	}(storageDisks)
  1202  
  1203  	formats, sErrs := loadFormatErasureAll(storageDisks, true)
  1204  	if err = checkFormatErasureValues(formats, storageDisks, s.setDriveCount); err != nil {
  1205  		return madmin.HealResultItem{}, err
  1206  	}
  1207  
  1208  	// Mark all root disks down
  1209  	markRootDisksAsDown(storageDisks, sErrs)
  1210  
  1211  	refFormat, err := getFormatErasureInQuorum(formats)
  1212  	if err != nil {
  1213  		return res, err
  1214  	}
  1215  
  1216  	// Prepare heal-result
  1217  	res = madmin.HealResultItem{
  1218  		Type:      madmin.HealItemMetadata,
  1219  		Detail:    "disk-format",
  1220  		DiskCount: s.setCount * s.setDriveCount,
  1221  		SetCount:  s.setCount,
  1222  	}
  1223  
  1224  	// Fetch all the drive info status.
  1225  	beforeDrives := formatsToDrivesInfo(s.endpoints, formats, sErrs)
  1226  
  1227  	res.After.Drives = make([]madmin.HealDriveInfo, len(beforeDrives))
  1228  	res.Before.Drives = make([]madmin.HealDriveInfo, len(beforeDrives))
  1229  	// Copy "after" drive state too from before.
  1230  	for k, v := range beforeDrives {
  1231  		res.Before.Drives[k] = v
  1232  		res.After.Drives[k] = v
  1233  	}
  1234  
  1235  	if countErrs(sErrs, errUnformattedDisk) == 0 {
  1236  		return res, errNoHealRequired
  1237  	}
  1238  
  1239  	// Initialize a new set of set formats which will be written to disk.
  1240  	newFormatSets := newHealFormatSets(refFormat, s.setCount, s.setDriveCount, formats, sErrs)
  1241  
  1242  	if !dryRun {
  1243  		var tmpNewFormats = make([]*formatErasureV3, s.setCount*s.setDriveCount)
  1244  		for i := range newFormatSets {
  1245  			for j := range newFormatSets[i] {
  1246  				if newFormatSets[i][j] == nil {
  1247  					continue
  1248  				}
  1249  				res.After.Drives[i*s.setDriveCount+j].UUID = newFormatSets[i][j].Erasure.This
  1250  				res.After.Drives[i*s.setDriveCount+j].State = madmin.DriveStateOk
  1251  				tmpNewFormats[i*s.setDriveCount+j] = newFormatSets[i][j]
  1252  			}
  1253  		}
  1254  
  1255  		// Save new formats `format.json` on unformatted disks.
  1256  		if err = saveUnformattedFormat(ctx, storageDisks, tmpNewFormats); err != nil {
  1257  			return madmin.HealResultItem{}, err
  1258  		}
  1259  
  1260  		s.erasureDisksMu.Lock()
  1261  
  1262  		for index, format := range tmpNewFormats {
  1263  			if format == nil {
  1264  				continue
  1265  			}
  1266  
  1267  			m, n, err := findDiskIndexByDiskID(refFormat, format.Erasure.This)
  1268  			if err != nil {
  1269  				continue
  1270  			}
  1271  
  1272  			if s.erasureDisks[m][n] != nil {
  1273  				s.erasureDisks[m][n].Close()
  1274  			}
  1275  			storageDisks[index].SetDiskLoc(s.poolIndex, m, n)
  1276  			s.erasureDisks[m][n] = storageDisks[index]
  1277  			s.endpointStrings[m*s.setDriveCount+n] = storageDisks[index].String()
  1278  		}
  1279  
  1280  		// Replace reference format with what was loaded from disks.
  1281  		s.format = refFormat
  1282  
  1283  		s.erasureDisksMu.Unlock()
  1284  	}
  1285  
  1286  	return res, nil
  1287  }
  1288  
  1289  // HealBucket - heals inconsistent buckets and bucket metadata on all sets.
  1290  func (s *erasureSets) HealBucket(ctx context.Context, bucket string, opts madmin.HealOpts) (result madmin.HealResultItem, err error) {
  1291  	// Initialize heal result info
  1292  	result = madmin.HealResultItem{
  1293  		Type:      madmin.HealItemBucket,
  1294  		Bucket:    bucket,
  1295  		DiskCount: s.setCount * s.setDriveCount,
  1296  		SetCount:  s.setCount,
  1297  	}
  1298  
  1299  	for _, set := range s.sets {
  1300  		var healResult madmin.HealResultItem
  1301  		healResult, err = set.HealBucket(ctx, bucket, opts)
  1302  		if err != nil {
  1303  			return result, toObjectErr(err, bucket)
  1304  		}
  1305  		result.Before.Drives = append(result.Before.Drives, healResult.Before.Drives...)
  1306  		result.After.Drives = append(result.After.Drives, healResult.After.Drives...)
  1307  	}
  1308  
  1309  	// Check if we had quorum to write, if not return an appropriate error.
  1310  	_, afterDriveOnline := result.GetOnlineCounts()
  1311  	if afterDriveOnline < ((s.setCount*s.setDriveCount)/2)+1 {
  1312  		return result, toObjectErr(errErasureWriteQuorum, bucket)
  1313  	}
  1314  
  1315  	return result, nil
  1316  }
  1317  
  1318  // HealObject - heals inconsistent object on a hashedSet based on object name.
  1319  func (s *erasureSets) HealObject(ctx context.Context, bucket, object, versionID string, opts madmin.HealOpts) (madmin.HealResultItem, error) {
  1320  	return s.getHashedSet(object).HealObject(ctx, bucket, object, versionID, opts)
  1321  }
  1322  
  1323  // PutObjectMetadata - replace or add metadata to an existing object/version
  1324  func (s *erasureSets) PutObjectMetadata(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) {
  1325  	er := s.getHashedSet(object)
  1326  	return er.PutObjectMetadata(ctx, bucket, object, opts)
  1327  }
  1328  
  1329  // PutObjectTags - replace or add tags to an existing object
  1330  func (s *erasureSets) PutObjectTags(ctx context.Context, bucket, object string, tags string, opts ObjectOptions) (ObjectInfo, error) {
  1331  	er := s.getHashedSet(object)
  1332  	return er.PutObjectTags(ctx, bucket, object, tags, opts)
  1333  }
  1334  
  1335  // DeleteObjectTags - delete object tags from an existing object
  1336  func (s *erasureSets) DeleteObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) {
  1337  	er := s.getHashedSet(object)
  1338  	return er.DeleteObjectTags(ctx, bucket, object, opts)
  1339  }
  1340  
  1341  // GetObjectTags - get object tags from an existing object
  1342  func (s *erasureSets) GetObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (*tags.Tags, error) {
  1343  	er := s.getHashedSet(object)
  1344  	return er.GetObjectTags(ctx, bucket, object, opts)
  1345  }
  1346  
   1347  // maintainMRFList gathers the list of partially successful uploads
   1348  // from all underlying erasure sets and puts them in a global map which
   1349  // should not have more than 10000 entries.
  1350  func (s *erasureSets) maintainMRFList() {
  1351  	var agg = make(chan partialOperation, 10000)
  1352  	for i, er := range s.sets {
  1353  		go func(c <-chan partialOperation, setIndex int) {
  1354  			for msg := range c {
  1355  				msg.failedSet = setIndex
  1356  				select {
  1357  				case agg <- msg:
  1358  				default:
  1359  				}
  1360  			}
  1361  		}(er.mrfOpCh, i)
  1362  	}
  1363  
  1364  	for fOp := range agg {
  1365  		s.mrfMU.Lock()
  1366  		if len(s.mrfOperations) > 10000 {
  1367  			s.mrfMU.Unlock()
  1368  			continue
  1369  		}
  1370  		s.mrfOperations[healSource{
  1371  			bucket:    fOp.bucket,
  1372  			object:    fOp.object,
  1373  			versionID: fOp.versionID,
  1374  			opts:      &madmin.HealOpts{Remove: true},
  1375  		}] = fOp.failedSet
  1376  		s.mrfMU.Unlock()
  1377  	}
  1378  }
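
// The fan-in above is intentionally lossy: each per-set channel forwards into
// agg with a non-blocking send, and the global map is capped at 10000
// entries, so MRF bookkeeping degrades gracefully under load instead of
// applying backpressure. A minimal sketch of the lossy forward (illustrative
// only):
//
//	select {
//	case agg <- msg: // recorded for later healing
//	default: // aggregator is full; drop rather than block
//	}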
  1379  
   1380  // healMRFRoutine monitors new disk connections and sweeps the MRF list
   1381  // to find objects related to the newly connected disk that need to be healed.
  1382  func (s *erasureSets) healMRFRoutine() {
  1383  	// Wait until background heal state is initialized
  1384  	bgSeq := mustGetHealSequence(GlobalContext)
  1385  
  1386  	for setIndex := range s.setReconnectEvent {
   1387  		// Get the list of objects related to the erasure set
   1388  		// to which the connected disk belongs.
  1389  		var mrfOperations []healSource
  1390  		s.mrfMU.Lock()
  1391  		for k, v := range s.mrfOperations {
  1392  			if v == setIndex {
  1393  				mrfOperations = append(mrfOperations, k)
  1394  			}
  1395  		}
  1396  		s.mrfMU.Unlock()
  1397  
  1398  		// Heal objects
  1399  		for _, u := range mrfOperations {
  1400  			waitForLowHTTPReq(globalHealConfig.IOCount, globalHealConfig.Sleep)
  1401  
  1402  			// Send an object to background heal
  1403  			bgSeq.sourceCh <- u
  1404  
  1405  			s.mrfMU.Lock()
  1406  			delete(s.mrfOperations, u)
  1407  			s.mrfMU.Unlock()
  1408  		}
  1409  	}
  1410  }