github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure-server-pool-rebalance.go

     1  // Copyright (c) 2015-2022 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package cmd
    19  
    20  import (
    21  	"context"
    22  	"encoding/binary"
    23  	"errors"
    24  	"fmt"
    25  	"io"
    26  	"math"
    27  	"math/rand"
    28  	"net/http"
    29  	"strings"
    30  	"time"
    31  
    32  	"github.com/dustin/go-humanize"
    33  	"github.com/lithammer/shortuuid/v4"
    34  	"github.com/minio/madmin-go/v3"
    35  	"github.com/minio/minio/internal/hash"
    36  	xioutil "github.com/minio/minio/internal/ioutil"
    37  	"github.com/minio/minio/internal/logger"
    38  	"github.com/minio/pkg/v2/env"
    39  	"github.com/minio/pkg/v2/workers"
    40  )
    41  
    42  //go:generate msgp -file $GOFILE -unexported
    43  
    44  // rebalanceStats contains per-pool rebalance statistics like number of objects,
    45  // versions and bytes rebalanced out of a pool
    46  type rebalanceStats struct {
    47  	InitFreeSpace uint64 `json:"initFreeSpace" msg:"ifs"` // Pool free space at the start of rebalance
    48  	InitCapacity  uint64 `json:"initCapacity" msg:"ic"`   // Pool capacity at the start of rebalance
    49  
    50  	Buckets           []string      `json:"buckets" msg:"bus"`           // buckets being rebalanced or to be rebalanced
    51  	RebalancedBuckets []string      `json:"rebalancedBuckets" msg:"rbs"` // buckets rebalanced
    52  	Bucket            string        `json:"bucket" msg:"bu"`             // Last rebalanced bucket
    53  	Object            string        `json:"object" msg:"ob"`             // Last rebalanced object
    54  	NumObjects        uint64        `json:"numObjects" msg:"no"`         // Number of objects rebalanced
    55  	NumVersions       uint64        `json:"numVersions" msg:"nv"`        // Number of versions rebalanced
    56  	Bytes             uint64        `json:"bytes" msg:"bs"`              // Number of bytes rebalanced
    57  	Participating     bool          `json:"participating" msg:"par"`
    58  	Info              rebalanceInfo `json:"info" msg:"inf"`
    59  }
    60  
    61  func (rs *rebalanceStats) update(bucket string, fi FileInfo) {
    62  	if fi.IsLatest {
    63  		rs.NumObjects++
    64  	}
    65  
    66  	rs.NumVersions++
    67  	onDiskSz := int64(0)
    68  	if !fi.Deleted {
    69  		onDiskSz = fi.Size * int64(fi.Erasure.DataBlocks+fi.Erasure.ParityBlocks) / int64(fi.Erasure.DataBlocks)
    70  	}
    71  	rs.Bytes += uint64(onDiskSz)
    72  	rs.Bucket = bucket
    73  	rs.Object = fi.Name
    74  }
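
// rebalancedOnDiskSize is a small illustrative sketch (a hypothetical helper, not
// called anywhere in the server) mirroring the accounting in (*rebalanceStats).update:
// the Bytes counter tracks the erasure-coded on-disk footprint of a version rather
// than its logical size. For example, an 8 MiB version stored with 8 data and 4
// parity blocks accounts for 8 MiB * (8+4)/8 = 12 MiB, while a delete marker
// accounts for 0 bytes.
func rebalancedOnDiskSize(fi FileInfo) uint64 {
	if fi.Deleted {
		return 0
	}
	return uint64(fi.Size * int64(fi.Erasure.DataBlocks+fi.Erasure.ParityBlocks) / int64(fi.Erasure.DataBlocks))
}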
    75  
    76  type rstats []*rebalanceStats
    77  
    78  //go:generate stringer -type=rebalStatus -trimprefix=rebal $GOFILE
    79  type rebalStatus uint8
    80  
    81  const (
    82  	rebalNone rebalStatus = iota
    83  	rebalStarted
    84  	rebalCompleted
    85  	rebalStopped
    86  	rebalFailed
    87  )
    88  
    89  type rebalanceInfo struct {
    90  	StartTime time.Time   `msg:"startTs"` // Time at which rebalance-start was issued
    91  	EndTime   time.Time   `msg:"stopTs"`  // Time at which rebalance operation completed or rebalance-stop was called
    92  	Status    rebalStatus `msg:"status"`  // Current state of rebalance operation. One of Started|Stopped|Completed|Failed.
    93  }
    94  
    95  // rebalanceMeta contains information pertaining to an ongoing rebalance operation.
    96  type rebalanceMeta struct {
    97  	cancel          context.CancelFunc `msg:"-"` // to be invoked on rebalance-stop
    98  	lastRefreshedAt time.Time          `msg:"-"`
    99  	StoppedAt       time.Time          `msg:"stopTs"` // Time when rebalance-stop was issued.
   100  	ID              string             `msg:"id"`     // ID of the ongoing rebalance operation
   101  	PercentFreeGoal float64            `msg:"pf"`     // Computed from total free space and capacity at the start of rebalance
   102  	PoolStats       []*rebalanceStats  `msg:"rss"`    // Per-pool rebalance stats keyed by pool index
   103  }
   104  
   105  var errRebalanceNotStarted = errors.New("rebalance not started")
   106  
   107  func (z *erasureServerPools) loadRebalanceMeta(ctx context.Context) error {
   108  	r := &rebalanceMeta{}
   109  	err := r.load(ctx, z.serverPools[0])
   110  	if err != nil {
   111  		if errors.Is(err, errConfigNotFound) {
   112  			return nil
   113  		}
   114  		return err
   115  	}
   116  
   117  	z.rebalMu.Lock()
   118  	if len(r.PoolStats) == len(z.serverPools) {
   119  		z.rebalMeta = r
   120  	} else {
   121  		z.updateRebalanceStats(ctx)
   122  	}
   123  	z.rebalMu.Unlock()
   124  
   125  	return nil
   126  }
   127  
   128  // updateRebalanceStats handles the case where a setup (say, with 2 pools)
   129  // was expanded in the middle of a rebalance: z.rebalMeta can become outdated
   130  // because it has no entry for the newly added pool. This function handles that
   131  // scenario, which, although rare, has occurred in the wild.
   132  //
   133  // Since we do not explicitly disallow expanding a setup mid-rebalance, we add
   134  // stats entries for the new pool(s) and continue rebalancing.
   135  func (z *erasureServerPools) updateRebalanceStats(ctx context.Context) error {
   136  	var ok bool
   137  	for i := range z.serverPools {
   138  		if z.findIndex(i) == -1 {
   139  			// Also initialize rebalanceStats to indicate that
   140  			// it's a new pool that can receive rebalanced data.
   141  			z.rebalMeta.PoolStats = append(z.rebalMeta.PoolStats, &rebalanceStats{})
   142  			ok = true
   143  		}
   144  	}
   145  	if ok {
   146  		lock := z.serverPools[0].NewNSLock(minioMetaBucket, rebalMetaName)
   147  		lkCtx, err := lock.GetLock(ctx, globalOperationTimeout)
   148  		if err != nil {
   149  			logger.LogIf(ctx, fmt.Errorf("failed to acquire write lock on %s/%s: %w", minioMetaBucket, rebalMetaName, err))
   150  			return err
   151  		}
   152  		defer lock.Unlock(lkCtx)
   153  
   154  		ctx = lkCtx.Context()
   155  
   156  		noLockOpts := ObjectOptions{NoLock: true}
   157  		return z.rebalMeta.saveWithOpts(ctx, z.serverPools[0], noLockOpts)
   158  	}
   159  
   160  	return nil
   161  }
   162  
   163  func (z *erasureServerPools) findIndex(index int) int {
   164  	for i := 0; i < len(z.rebalMeta.PoolStats); i++ {
   165  		if i == index {
   166  			return index
   167  		}
   168  	}
   169  	return -1
   170  }
   171  
   172  // initRebalanceMeta initializes rebalance metadata for a new rebalance
   173  // operation and saves it in the object store.
   174  func (z *erasureServerPools) initRebalanceMeta(ctx context.Context, buckets []string) (arn string, err error) {
   175  	r := &rebalanceMeta{
   176  		ID:        shortuuid.New(),
   177  		PoolStats: make([]*rebalanceStats, len(z.serverPools)),
   178  	}
   179  
   180  	// Fetch disk capacity and available space.
   181  	si := z.StorageInfo(ctx, true)
   182  	diskStats := make([]struct {
   183  		AvailableSpace uint64
   184  		TotalSpace     uint64
   185  	}, len(z.serverPools))
   186  	var totalCap, totalFree uint64
   187  	for _, disk := range si.Disks {
   188  		// Ignore invalid.
   189  		if disk.PoolIndex < 0 || len(diskStats) <= disk.PoolIndex {
   190  			// https://github.com/minio/minio/issues/16500
   191  			continue
   192  		}
   193  		totalCap += disk.TotalSpace
   194  		totalFree += disk.AvailableSpace
   195  
   196  		diskStats[disk.PoolIndex].AvailableSpace += disk.AvailableSpace
   197  		diskStats[disk.PoolIndex].TotalSpace += disk.TotalSpace
   198  	}
   199  	r.PercentFreeGoal = float64(totalFree) / float64(totalCap)
   200  
   201  	now := time.Now()
   202  	for idx := range z.serverPools {
   203  		r.PoolStats[idx] = &rebalanceStats{
   204  			Buckets:           make([]string, len(buckets)),
   205  			RebalancedBuckets: make([]string, 0, len(buckets)),
   206  			InitFreeSpace:     diskStats[idx].AvailableSpace,
   207  			InitCapacity:      diskStats[idx].TotalSpace,
   208  		}
   209  		copy(r.PoolStats[idx].Buckets, buckets)
   210  
   211  		if pfi := float64(diskStats[idx].AvailableSpace) / float64(diskStats[idx].TotalSpace); pfi < r.PercentFreeGoal {
   212  			r.PoolStats[idx].Participating = true
   213  			r.PoolStats[idx].Info = rebalanceInfo{
   214  				StartTime: now,
   215  				Status:    rebalStarted,
   216  			}
   217  		}
   218  	}
   219  
   220  	err = r.save(ctx, z.serverPools[0])
   221  	if err != nil {
   222  		return arn, err
   223  	}
   224  
   225  	z.rebalMeta = r
   226  	return r.ID, nil
   227  }
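
// participatesInRebalance is an illustrative sketch (a hypothetical helper, not part
// of the server) of the participation rule applied above: a pool gives up data only
// when its free-space fraction is below the cluster-wide goal. For example, with two
// equally sized pools at 20% and 80% free, PercentFreeGoal is 0.50 and only the
// fuller (20% free) pool participates and rebalances data out; the emptier pool
// simply receives data.
func participatesInRebalance(poolFreeSpace, poolCapacity uint64, percentFreeGoal float64) bool {
	return float64(poolFreeSpace)/float64(poolCapacity) < percentFreeGoal
}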
   228  
   229  func (z *erasureServerPools) updatePoolStats(poolIdx int, bucket string, fi FileInfo) {
   230  	z.rebalMu.Lock()
   231  	defer z.rebalMu.Unlock()
   232  
   233  	r := z.rebalMeta
   234  	if r == nil {
   235  		return
   236  	}
   237  
   238  	r.PoolStats[poolIdx].update(bucket, fi)
   239  }
   240  
   241  const (
   242  	rebalMetaName = "rebalance.bin"
   243  	rebalMetaFmt  = 1
   244  	rebalMetaVer  = 1
   245  )
   246  
   247  func (z *erasureServerPools) nextRebalBucket(poolIdx int) (string, bool) {
   248  	z.rebalMu.RLock()
   249  	defer z.rebalMu.RUnlock()
   250  
   251  	r := z.rebalMeta
   252  	if r == nil {
   253  		return "", false
   254  	}
   255  
   256  	ps := r.PoolStats[poolIdx]
   257  	if ps == nil {
   258  		return "", false
   259  	}
   260  
   261  	if ps.Info.Status == rebalCompleted || !ps.Participating {
   262  		return "", false
   263  	}
   264  
   265  	if len(ps.Buckets) == 0 {
   266  		return "", false
   267  	}
   268  
   269  	return ps.Buckets[0], true
   270  }
   271  
   272  func (z *erasureServerPools) bucketRebalanceDone(bucket string, poolIdx int) {
   273  	z.rebalMu.Lock()
   274  	defer z.rebalMu.Unlock()
   275  
   276  	ps := z.rebalMeta.PoolStats[poolIdx]
   277  	if ps == nil {
   278  		return
   279  	}
   280  
   281  	for i, b := range ps.Buckets {
   282  		if b == bucket {
   283  			ps.Buckets = append(ps.Buckets[:i], ps.Buckets[i+1:]...)
   284  			ps.RebalancedBuckets = append(ps.RebalancedBuckets, bucket)
   285  			break
   286  		}
   287  	}
   288  }
   289  
   290  func (r *rebalanceMeta) load(ctx context.Context, store objectIO) error {
   291  	return r.loadWithOpts(ctx, store, ObjectOptions{})
   292  }
   293  
   294  func (r *rebalanceMeta) loadWithOpts(ctx context.Context, store objectIO, opts ObjectOptions) error {
   295  	data, _, err := readConfigWithMetadata(ctx, store, rebalMetaName, opts)
   296  	if err != nil {
   297  		return err
   298  	}
   299  
   300  	if len(data) == 0 {
   301  		return nil
   302  	}
   303  	if len(data) <= 4 {
   304  		return fmt.Errorf("rebalanceMeta: no data")
   305  	}
   306  
   307  	// Read header
   308  	switch binary.LittleEndian.Uint16(data[0:2]) {
   309  	case rebalMetaFmt:
   310  	default:
   311  		return fmt.Errorf("rebalanceMeta: unknown format: %d", binary.LittleEndian.Uint16(data[0:2]))
   312  	}
   313  	switch binary.LittleEndian.Uint16(data[2:4]) {
   314  	case rebalMetaVer:
   315  	default:
   316  		return fmt.Errorf("rebalanceMeta: unknown version: %d", binary.LittleEndian.Uint16(data[2:4]))
   317  	}
   318  
   319  	// OK, parse data.
   320  	if _, err = r.UnmarshalMsg(data[4:]); err != nil {
   321  		return err
   322  	}
   323  
   324  	r.lastRefreshedAt = time.Now()
   325  
   326  	return nil
   327  }
   328  
   329  func (r *rebalanceMeta) saveWithOpts(ctx context.Context, store objectIO, opts ObjectOptions) error {
   330  	data := make([]byte, 4, r.Msgsize()+4)
   331  
   332  	// Initialize the header.
   333  	binary.LittleEndian.PutUint16(data[0:2], rebalMetaFmt)
   334  	binary.LittleEndian.PutUint16(data[2:4], rebalMetaVer)
   335  
   336  	buf, err := r.MarshalMsg(data)
   337  	if err != nil {
   338  		return err
   339  	}
   340  
   341  	return saveConfigWithOpts(ctx, store, rebalMetaName, buf, opts)
   342  }
   343  
   344  func (r *rebalanceMeta) save(ctx context.Context, store objectIO) error {
   345  	return r.saveWithOpts(ctx, store, ObjectOptions{})
   346  }
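
// encodeRebalanceMetaHeader is a minimal sketch (not used by the server) of the
// on-disk layout of rebalance.bin as written by saveWithOpts: a 4-byte little-endian
// header carrying the format and version, followed by the msgp-encoded rebalanceMeta.
func encodeRebalanceMetaHeader() []byte {
	hdr := make([]byte, 4)
	binary.LittleEndian.PutUint16(hdr[0:2], rebalMetaFmt) // bytes 0-1: format (currently 1)
	binary.LittleEndian.PutUint16(hdr[2:4], rebalMetaVer) // bytes 2-3: version (currently 1)
	return hdr
}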
   347  
   348  func (z *erasureServerPools) IsRebalanceStarted() bool {
   349  	z.rebalMu.RLock()
   350  	defer z.rebalMu.RUnlock()
   351  
   352  	if r := z.rebalMeta; r != nil {
   353  		if r.StoppedAt.IsZero() {
   354  			return true
   355  		}
   356  	}
   357  	return false
   358  }
   359  
   360  func (z *erasureServerPools) IsPoolRebalancing(poolIndex int) bool {
   361  	z.rebalMu.RLock()
   362  	defer z.rebalMu.RUnlock()
   363  
   364  	if r := z.rebalMeta; r != nil {
   365  		if !r.StoppedAt.IsZero() {
   366  			return false
   367  		}
   368  		ps := z.rebalMeta.PoolStats[poolIndex]
   369  		return ps.Participating && ps.Info.Status == rebalStarted
   370  	}
   371  	return false
   372  }
   373  
   374  func (z *erasureServerPools) rebalanceBuckets(ctx context.Context, poolIdx int) (err error) {
   375  	doneCh := make(chan struct{})
   376  	defer xioutil.SafeClose(doneCh)
   377  
   378  	// Save rebalance.bin periodically.
   379  	go func() {
   380  		// Update rebalance.bin once every 5-10s, with the interval chosen randomly
   381  		// to avoid multiple pool leaders herding to update around the same
   382  		// time.
   383  		r := rand.New(rand.NewSource(time.Now().UnixNano()))
   384  		randSleepFor := func() time.Duration {
   385  			return 5*time.Second + time.Duration(float64(5*time.Second)*r.Float64())
   386  		}
   387  
   388  		timer := time.NewTimer(randSleepFor())
   389  		defer timer.Stop()
   390  		var rebalDone bool
   391  		var traceMsg string
   392  
   393  		for {
   394  			select {
   395  			case <-doneCh:
   396  				// rebalance completed for poolIdx
   397  				now := time.Now()
   398  				z.rebalMu.Lock()
   399  				z.rebalMeta.PoolStats[poolIdx].Info.Status = rebalCompleted
   400  				z.rebalMeta.PoolStats[poolIdx].Info.EndTime = now
   401  				z.rebalMu.Unlock()
   402  
   403  				rebalDone = true
   404  				traceMsg = fmt.Sprintf("completed at %s", now)
   405  
   406  			case <-ctx.Done():
   407  
   408  				// rebalance stopped for poolIdx
   409  				now := time.Now()
   410  				z.rebalMu.Lock()
   411  				z.rebalMeta.PoolStats[poolIdx].Info.Status = rebalStopped
   412  				z.rebalMeta.PoolStats[poolIdx].Info.EndTime = now
   413  				z.rebalMeta.cancel = nil // remove the already used context.CancelFunc
   414  				z.rebalMu.Unlock()
   415  
   416  				rebalDone = true
   417  				traceMsg = fmt.Sprintf("stopped at %s", now)
   418  
   419  			case <-timer.C:
   420  				traceMsg = fmt.Sprintf("saved at %s", time.Now())
   421  			}
   422  
   423  			stopFn := globalRebalanceMetrics.log(rebalanceMetricSaveMetadata, poolIdx, traceMsg)
   424  			err := z.saveRebalanceStats(ctx, poolIdx, rebalSaveStats)
   425  			stopFn(err)
   426  			logger.LogIf(ctx, err)
   427  			timer.Reset(randSleepFor())
   428  
   429  			if rebalDone {
   430  				return
   431  			}
   432  		}
   433  	}()
   434  
   435  	logger.Event(ctx, "Pool %d rebalancing is started", poolIdx+1)
   436  
   437  	for {
   438  		select {
   439  		case <-ctx.Done():
   440  			return
   441  		default:
   442  		}
   443  
   444  		bucket, ok := z.nextRebalBucket(poolIdx)
   445  		if !ok {
   446  			// no more buckets to rebalance or target free_space/capacity reached
   447  			break
   448  		}
   449  
   450  		stopFn := globalRebalanceMetrics.log(rebalanceMetricRebalanceBucket, poolIdx, bucket)
   451  		err = z.rebalanceBucket(ctx, bucket, poolIdx)
   452  		if err != nil {
   453  			stopFn(err)
   454  			logger.LogIf(ctx, err)
   455  			return
   456  		}
   457  		stopFn(nil)
   458  		z.bucketRebalanceDone(bucket, poolIdx)
   459  	}
   460  
   461  	logger.Event(ctx, "Pool %d rebalancing is done", poolIdx+1)
   462  
   463  	return err
   464  }
   465  
   466  func (z *erasureServerPools) checkIfRebalanceDone(poolIdx int) bool {
   467  	z.rebalMu.Lock()
   468  	defer z.rebalMu.Unlock()
   469  
   470  	// check if enough objects have been rebalanced
   471  	r := z.rebalMeta
   472  	poolStats := r.PoolStats[poolIdx]
   473  	if poolStats.Info.Status == rebalCompleted {
   474  		return true
   475  	}
   476  
   477  	pfi := float64(poolStats.InitFreeSpace+poolStats.Bytes) / float64(poolStats.InitCapacity)
   478  	// Mark pool rebalance as done if pfi is within 5% of PercentFreeGoal.
   479  	if diff := math.Abs(pfi - r.PercentFreeGoal); diff <= 0.05 {
   480  		r.PoolStats[poolIdx].Info.Status = rebalCompleted
   481  		r.PoolStats[poolIdx].Info.EndTime = time.Now()
   482  		return true
   483  	}
   484  
   485  	return false
   486  }
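
// A worked example of the 5% cut-off above, with hypothetical numbers: for a pool
// with InitCapacity = 100 TiB, InitFreeSpace = 20 TiB and PercentFreeGoal = 0.50,
// rebalance is marked complete once roughly 25 TiB have been moved out, since
// e.g. Bytes = 27 TiB gives pfi = (20+27)/100 = 0.47, within 0.05 of the goal.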
   487  
   488  func (set *erasureObjects) listObjectsToRebalance(ctx context.Context, bucketName string, fn func(entry metaCacheEntry)) error {
   489  	disks, _ := set.getOnlineDisksWithHealing(false)
   490  	if len(disks) == 0 {
   491  		return fmt.Errorf("no online drives found for set with endpoints %s", set.getEndpoints())
   492  	}
   493  
   494  	// Regardless of how many drives we ask, versions must exist on ~50% of them.
   495  	listingQuorum := (set.setDriveCount + 1) / 2
   496  
   497  	// How to resolve partial results.
   498  	resolver := metadataResolutionParams{
   499  		dirQuorum: listingQuorum, // make sure to capture all quorum ratios
   500  		objQuorum: listingQuorum, // make sure to capture all quorum ratios
   501  		bucket:    bucketName,
   502  	}
   503  
   504  	err := listPathRaw(ctx, listPathRawOptions{
   505  		disks:          disks,
   506  		bucket:         bucketName,
   507  		recursive:      true,
   508  		forwardTo:      "",
   509  		minDisks:       listingQuorum,
   510  		reportNotFound: false,
   511  		agreed:         fn,
   512  		partial: func(entries metaCacheEntries, _ []error) {
   513  			entry, ok := entries.resolve(&resolver)
   514  			if ok {
   515  				fn(*entry)
   516  			}
   517  		},
   518  		finished: nil,
   519  	})
   520  	return err
   521  }
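
// For example (the drive count is hypothetical), a set with setDriveCount = 12 lists
// with listingQuorum = (12+1)/2 = 6: an entry is used when at least half of the
// drives agree on it, either directly via the agreed callback or after resolving
// partial results with the metadataResolutionParams above.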
   522  
   523  // rebalanceBucket rebalances objects under bucket in poolIdx pool
   524  func (z *erasureServerPools) rebalanceBucket(ctx context.Context, bucket string, poolIdx int) error {
   525  	ctx = logger.SetReqInfo(ctx, &logger.ReqInfo{})
   526  	vc, _ := globalBucketVersioningSys.Get(bucket)
   527  	// Check if the current bucket has a configured lifecycle policy
   528  	lc, _ := globalLifecycleSys.Get(bucket)
   529  	// Check if bucket is object locked.
   530  	lr, _ := globalBucketObjectLockSys.Get(bucket)
   531  	rcfg, _ := getReplicationConfig(ctx, bucket)
   532  
   533  	pool := z.serverPools[poolIdx]
   534  
   535  	const envRebalanceWorkers = "_MINIO_REBALANCE_WORKERS"
   536  	workerSize, err := env.GetInt(envRebalanceWorkers, len(pool.sets))
   537  	if err != nil {
   538  		logger.LogIf(ctx, fmt.Errorf("invalid workers value err: %v, defaulting to %d", err, len(pool.sets)))
   539  		workerSize = len(pool.sets)
   540  	}
   541  
   542  	// Each rebalance worker needs one List() goroutine/worker,
   543  	// so add that many extra workers.
   544  	workerSize += len(pool.sets)
   545  
   546  	wk, err := workers.New(workerSize)
   547  	if err != nil {
   548  		return err
   549  	}
   550  
   551  	for setIdx, set := range pool.sets {
   552  		set := set
   553  
   554  		filterLifecycle := func(bucket, object string, fi FileInfo) bool {
   555  			if lc == nil {
   556  				return false
   557  			}
   558  			versioned := vc != nil && vc.Versioned(object)
   559  			objInfo := fi.ToObjectInfo(bucket, object, versioned)
   560  
   561  			evt := evalActionFromLifecycle(ctx, *lc, lr, rcfg, objInfo)
   562  			if evt.Action.Delete() {
   563  				globalExpiryState.enqueueByDays(objInfo, evt, lcEventSrc_Rebal)
   564  				return true
   565  			}
   566  
   567  			return false
   568  		}
   569  
   570  		rebalanceEntry := func(entry metaCacheEntry) {
   571  			defer wk.Give()
   572  
   573  			if entry.isDir() {
   574  				return
   575  			}
   576  
   577  			// rebalance on poolIdx has reached its goal
   578  			if z.checkIfRebalanceDone(poolIdx) {
   579  				return
   580  			}
   581  
   582  			fivs, err := entry.fileInfoVersions(bucket)
   583  			if err != nil {
   584  				return
   585  			}
   586  
   587  			// We need a reversed order for rebalance,
   588  			// to create the appropriate stack.
   589  			versionsSorter(fivs.Versions).reverse()
   590  
   591  			var rebalanced, expired int
   592  			for _, version := range fivs.Versions {
   593  				// Skip transitioned objects for now. TBD
   594  				if version.IsRemote() {
   595  					continue
   596  				}
   597  
   598  				// Apply lifecycle rules on the objects that are expired.
   599  				if filterLifecycle(bucket, version.Name, version) {
   600  					expired++
   601  					continue
   602  				}
   603  
   604  				// Any object whose only remaining (non-expired) version is a
   605  				// DELETE marker doesn't need to be rebalanced; just skip it. This
   606  				// also covers any other versions that have already expired.
   607  				remainingVersions := len(fivs.Versions) - expired
   608  				if version.Deleted && remainingVersions == 1 {
   609  					rebalanced++
   610  					continue
   611  				}
   612  
   613  				versionID := version.VersionID
   614  				if versionID == "" {
   615  					versionID = nullVersionID
   616  				}
   617  
   618  				if version.Deleted {
   619  					_, err := z.DeleteObject(ctx,
   620  						bucket,
   621  						version.Name,
   622  						ObjectOptions{
   623  							Versioned:         true,
   624  							VersionID:         versionID,
   625  							MTime:             version.ModTime,
   626  							DeleteReplication: version.ReplicationState,
   627  							DeleteMarker:      true, // make sure we create a delete marker
   628  							SkipRebalancing:   true, // make sure we skip the decommissioned pool
   629  							NoAuditLog:        true,
   630  						})
   631  					var failure bool
   632  					if err != nil && !isErrObjectNotFound(err) && !isErrVersionNotFound(err) {
   633  						logger.LogIf(ctx, err)
   634  						failure = true
   635  					}
   636  
   637  					if !failure {
   638  						z.updatePoolStats(poolIdx, bucket, version)
   639  						rebalanced++
   640  					}
   641  					auditLogRebalance(ctx, "Rebalance:DeleteMarker", bucket, version.Name, versionID, err)
   642  					continue
   643  				}
   644  
   645  				var failure, ignore bool
   646  				for try := 0; try < 3; try++ {
   647  					// GetObjectReader.Close is called by rebalanceObject
   648  					stopFn := globalRebalanceMetrics.log(rebalanceMetricRebalanceObject, poolIdx, bucket, version.Name, version.VersionID)
   649  					gr, err := set.GetObjectNInfo(ctx,
   650  						bucket,
   651  						encodeDirObject(version.Name),
   652  						nil,
   653  						http.Header{},
   654  						ObjectOptions{
   655  							VersionID:    versionID,
   656  							NoDecryption: true,
   657  							NoLock:       true,
   658  							NoAuditLog:   true,
   659  						})
   660  					if isErrObjectNotFound(err) || isErrVersionNotFound(err) {
   661  					// object deleted by the application, nothing to do here; we move on.
   662  						ignore = true
   663  						stopFn(nil)
   664  						break
   665  					}
   666  					if err != nil {
   667  						failure = true
   668  						logger.LogIf(ctx, err)
   669  						stopFn(err)
   670  						continue
   671  					}
   672  
   673  					if err = z.rebalanceObject(ctx, bucket, gr); err != nil {
   674  						failure = true
   675  						logger.LogIf(ctx, err)
   676  						stopFn(err)
   677  						continue
   678  					}
   679  
   680  					stopFn(nil)
   681  					failure = false
   682  					break
   683  				}
   684  				if ignore {
   685  					continue
   686  				}
   687  				if failure {
   688  					break // break out on first error
   689  				}
   690  				z.updatePoolStats(poolIdx, bucket, version)
   691  				rebalanced++
   692  			}
   693  
   694  			// if all versions were rebalanced, we can delete the object versions.
   695  			if rebalanced == len(fivs.Versions) {
   696  				stopFn := globalRebalanceMetrics.log(rebalanceMetricRebalanceRemoveObject, poolIdx, bucket, entry.name)
   697  				_, err := set.DeleteObject(ctx,
   698  					bucket,
   699  					encodeDirObject(entry.name),
   700  					ObjectOptions{
   701  						DeletePrefix:       true, // use prefix delete to delete all versions at once.
   702  						DeletePrefixObject: true, // use prefix delete on exact object (this is an optimization to avoid fan-out calls)
   703  						NoAuditLog:         true,
   704  					},
   705  				)
   706  				stopFn(err)
   707  				auditLogRebalance(ctx, "Rebalance:DeleteObject", bucket, entry.name, "", err)
   708  				if err != nil {
   709  					logger.LogIf(ctx, err)
   710  				}
   711  			}
   712  		}
   713  
   714  		wk.Take()
   715  		go func(setIdx int) {
   716  			defer wk.Give()
   717  			err := set.listObjectsToRebalance(ctx, bucket,
   718  				func(entry metaCacheEntry) {
   719  					wk.Take()
   720  					go rebalanceEntry(entry)
   721  				},
   722  			)
   723  			if err == nil || errors.Is(err, context.Canceled) {
   724  				return
   725  			}
   726  			setN := humanize.Ordinal(setIdx + 1)
   727  			logger.LogOnceIf(ctx, fmt.Errorf("listing objects from %s set failed with %v", setN, err), "rebalance-listing-failed"+setN)
   728  		}(setIdx)
   729  	}
   730  
   731  	wk.Wait()
   732  	return nil
   733  }
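
// Worker sizing example for rebalanceBucket (values are hypothetical): with 4 erasure
// sets and _MINIO_REBALANCE_WORKERS unset, workerSize defaults to 4 and is then bumped
// to 8 so that each set gets a listing goroutine in addition to the goroutines that
// drain rebalanceEntry callbacks; setting _MINIO_REBALANCE_WORKERS=16 yields 16+4 = 20.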
   734  
   735  type rebalSaveOpts uint8
   736  
   737  const (
   738  	rebalSaveStats rebalSaveOpts = iota
   739  	rebalSaveStoppedAt
   740  )
   741  
   742  func (z *erasureServerPools) saveRebalanceStats(ctx context.Context, poolIdx int, opts rebalSaveOpts) error {
   743  	lock := z.serverPools[0].NewNSLock(minioMetaBucket, rebalMetaName)
   744  	lkCtx, err := lock.GetLock(ctx, globalOperationTimeout)
   745  	if err != nil {
   746  		logger.LogIf(ctx, fmt.Errorf("failed to acquire write lock on %s/%s: %w", minioMetaBucket, rebalMetaName, err))
   747  		return err
   748  	}
   749  	defer lock.Unlock(lkCtx)
   750  
   751  	ctx = lkCtx.Context()
   752  	noLockOpts := ObjectOptions{NoLock: true}
   753  	r := &rebalanceMeta{}
   754  	if err := r.loadWithOpts(ctx, z.serverPools[0], noLockOpts); err != nil {
   755  		return err
   756  	}
   757  
   758  	z.rebalMu.Lock()
   759  	defer z.rebalMu.Unlock()
   760  
   761  	switch opts {
   762  	case rebalSaveStoppedAt:
   763  		r.StoppedAt = time.Now()
   764  	case rebalSaveStats:
   765  		r.PoolStats[poolIdx] = z.rebalMeta.PoolStats[poolIdx]
   766  	}
   767  	z.rebalMeta = r
   768  
   769  	return z.rebalMeta.saveWithOpts(ctx, z.serverPools[0], noLockOpts)
   770  }
   771  
   772  func auditLogRebalance(ctx context.Context, apiName, bucket, object, versionID string, err error) {
   773  	errStr := ""
   774  	if err != nil {
   775  		errStr = err.Error()
   776  	}
   777  	auditLogInternal(ctx, AuditLogOptions{
   778  		Event:     "rebalance",
   779  		APIName:   apiName,
   780  		Bucket:    bucket,
   781  		Object:    object,
   782  		VersionID: versionID,
   783  		Error:     errStr,
   784  	})
   785  }
   786  
   787  func (z *erasureServerPools) rebalanceObject(ctx context.Context, bucket string, gr *GetObjectReader) (err error) {
   788  	oi := gr.ObjInfo
   789  
   790  	defer func() {
   791  		gr.Close()
   792  		auditLogRebalance(ctx, "RebalanceCopyData", oi.Bucket, oi.Name, oi.VersionID, err)
   793  	}()
   794  
   795  	actualSize, err := oi.GetActualSize()
   796  	if err != nil {
   797  		return err
   798  	}
   799  
   800  	if oi.isMultipart() {
   801  		res, err := z.NewMultipartUpload(ctx, bucket, oi.Name, ObjectOptions{
   802  			VersionID:   oi.VersionID,
   803  			UserDefined: oi.UserDefined,
   804  			NoAuditLog:  true,
   805  		})
   806  		if err != nil {
   807  			return fmt.Errorf("rebalanceObject: NewMultipartUpload() %w", err)
   808  		}
   809  		defer z.AbortMultipartUpload(ctx, bucket, oi.Name, res.UploadID, ObjectOptions{NoAuditLog: true})
   810  
   811  		parts := make([]CompletePart, len(oi.Parts))
   812  		for i, part := range oi.Parts {
   813  			hr, err := hash.NewReader(ctx, io.LimitReader(gr, part.Size), part.Size, "", "", part.ActualSize)
   814  			if err != nil {
   815  				return fmt.Errorf("rebalanceObject: hash.NewReader() %w", err)
   816  			}
   817  			pi, err := z.PutObjectPart(ctx, bucket, oi.Name, res.UploadID,
   818  				part.Number,
   819  				NewPutObjReader(hr),
   820  				ObjectOptions{
   821  					PreserveETag: part.ETag, // Preserve original ETag to ensure same metadata.
   822  					IndexCB: func() []byte {
   823  						return part.Index // Preserve part Index to ensure decompression works.
   824  					},
   825  					NoAuditLog: true,
   826  				})
   827  			if err != nil {
   828  				return fmt.Errorf("rebalanceObject: PutObjectPart() %w", err)
   829  			}
   830  			parts[i] = CompletePart{
   831  				ETag:       pi.ETag,
   832  				PartNumber: pi.PartNumber,
   833  			}
   834  		}
   835  		_, err = z.CompleteMultipartUpload(ctx, bucket, oi.Name, res.UploadID, parts, ObjectOptions{
   836  			DataMovement: true,
   837  			MTime:        oi.ModTime,
   838  			NoAuditLog:   true,
   839  		})
   840  		if err != nil {
   841  			err = fmt.Errorf("rebalanceObject: CompleteMultipartUpload() %w", err)
   842  		}
   843  		return err
   844  	}
   845  
   846  	hr, err := hash.NewReader(ctx, gr, oi.Size, "", "", actualSize)
   847  	if err != nil {
   848  		return fmt.Errorf("rebalanceObject: hash.NewReader() %w", err)
   849  	}
   850  
   851  	_, err = z.PutObject(ctx,
   852  		bucket,
   853  		oi.Name,
   854  		NewPutObjReader(hr),
   855  		ObjectOptions{
   856  			DataMovement: true,
   857  			VersionID:    oi.VersionID,
   858  			MTime:        oi.ModTime,
   859  			UserDefined:  oi.UserDefined,
   860  			PreserveETag: oi.ETag, // Preserve original ETag to ensure same metadata.
   861  			IndexCB: func() []byte {
   862  				return oi.Parts[0].Index // Preserve part Index to ensure decompression works.
   863  			},
   864  			NoAuditLog: true,
   865  		})
   866  	if err != nil {
   867  		err = fmt.Errorf("rebalanceObject: PutObject() %w", err)
   868  	}
   869  	return err
   870  }
   871  
   872  func (z *erasureServerPools) StartRebalance() {
   873  	z.rebalMu.Lock()
   874  	if z.rebalMeta == nil || !z.rebalMeta.StoppedAt.IsZero() { // rebalance not running, nothing to do
   875  		z.rebalMu.Unlock()
   876  		return
   877  	}
   878  	ctx, cancel := context.WithCancel(GlobalContext)
   879  	z.rebalMeta.cancel = cancel // to be used when rebalance-stop is called
   880  	z.rebalMu.Unlock()
   881  
   882  	z.rebalMu.RLock()
   883  	participants := make([]bool, len(z.rebalMeta.PoolStats))
   884  	for i, ps := range z.rebalMeta.PoolStats {
   885  		// skip pools which have completed rebalancing
   886  		if ps.Info.Status != rebalStarted {
   887  			continue
   888  		}
   889  
   890  		participants[i] = ps.Participating
   891  	}
   892  	z.rebalMu.RUnlock()
   893  
   894  	for poolIdx, doRebalance := range participants {
   895  		if !doRebalance {
   896  			continue
   897  		}
   898  		// nothing to do if this node is not the pool's first node (i.e. the pool's rebalance 'leader').
   899  		if !globalEndpoints[poolIdx].Endpoints[0].IsLocal {
   900  			continue
   901  		}
   902  
   903  		go func(idx int) {
   904  			stopfn := globalRebalanceMetrics.log(rebalanceMetricRebalanceBuckets, idx)
   905  			err := z.rebalanceBuckets(ctx, idx)
   906  			stopfn(err)
   907  		}(poolIdx)
   908  	}
   909  }
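
// Note on leadership, as implemented above: a pool's rebalance loop runs only on the
// node hosting that pool's first endpoint. In a two-pool cluster, for example, the
// first server of pool 1 and the first server of pool 2 each drive their own pool's
// rebalance, while the remaining nodes return from StartRebalance without spawning
// workers.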
   910  
   911  // StopRebalance signals the rebalance goroutine running on this node (if any)
   912  // to stop, using the context.CancelFunc(s) saved at the time of StartRebalance.
   913  func (z *erasureServerPools) StopRebalance() error {
   914  	z.rebalMu.Lock()
   915  	defer z.rebalMu.Unlock()
   916  
   917  	r := z.rebalMeta
   918  	if r == nil { // rebalance not running on this node, nothing to do
   919  		return nil
   920  	}
   921  
   922  	if cancel := r.cancel; cancel != nil {
   923  		// cancel != nil only on pool leaders
   924  		r.cancel = nil
   925  		cancel()
   926  	}
   927  	return nil
   928  }
   929  
   930  // for rebalance trace support
   931  type rebalanceMetrics struct{}
   932  
   933  var globalRebalanceMetrics rebalanceMetrics
   934  
   935  //go:generate stringer -type=rebalanceMetric -trimprefix=rebalanceMetric $GOFILE
   936  type rebalanceMetric uint8
   937  
   938  const (
   939  	rebalanceMetricRebalanceBuckets rebalanceMetric = iota
   940  	rebalanceMetricRebalanceBucket
   941  	rebalanceMetricRebalanceObject
   942  	rebalanceMetricRebalanceRemoveObject
   943  	rebalanceMetricSaveMetadata
   944  )
   945  
   946  func rebalanceTrace(r rebalanceMetric, poolIdx int, startTime time.Time, duration time.Duration, err error, path string) madmin.TraceInfo {
   947  	var errStr string
   948  	if err != nil {
   949  		errStr = err.Error()
   950  	}
   951  	return madmin.TraceInfo{
   952  		TraceType: madmin.TraceRebalance,
   953  		Time:      startTime,
   954  		NodeName:  globalLocalNodeName,
   955  		FuncName:  fmt.Sprintf("rebalance.%s (pool-id=%d)", r.String(), poolIdx),
   956  		Duration:  duration,
   957  		Path:      path,
   958  		Error:     errStr,
   959  	}
   960  }
   961  
   962  func (p *rebalanceMetrics) log(r rebalanceMetric, poolIdx int, paths ...string) func(err error) {
   963  	startTime := time.Now()
   964  	return func(err error) {
   965  		duration := time.Since(startTime)
   966  		if globalTrace.NumSubscribers(madmin.TraceRebalance) > 0 {
   967  			globalTrace.Publish(rebalanceTrace(r, poolIdx, startTime, duration, err, strings.Join(paths, " ")))
   968  		}
   969  	}
   970  }
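
// Typical usage of the tracing hook above, mirroring the call sites earlier in this
// file: obtain a stop function before the operation and invoke it with the result,
// which publishes a trace entry only when someone is subscribed to rebalance traces.
//
//	stopFn := globalRebalanceMetrics.log(rebalanceMetricRebalanceBucket, poolIdx, bucket)
//	err := z.rebalanceBucket(ctx, bucket, poolIdx)
//	stopFn(err)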