github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure-server-pool-decom.go

     1  // Copyright (c) 2015-2023 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package cmd
    19  
    20  import (
    21  	"context"
    22  	"encoding/binary"
    23  	"errors"
    24  	"fmt"
    25  	"io"
    26  	"math/rand"
    27  	"net/http"
    28  	"sort"
    29  	"strings"
    30  	"time"
    31  
    32  	"github.com/dustin/go-humanize"
    33  	"github.com/minio/madmin-go/v3"
    34  	"github.com/minio/minio/internal/hash"
    35  	"github.com/minio/minio/internal/logger"
    36  	"github.com/minio/pkg/v2/console"
    37  	"github.com/minio/pkg/v2/env"
    38  	"github.com/minio/pkg/v2/workers"
    39  )
    40  
     41  // PoolDecommissionInfo captures the current decommissioning information
    42  type PoolDecommissionInfo struct {
    43  	StartTime   time.Time `json:"startTime" msg:"st"`
    44  	StartSize   int64     `json:"startSize" msg:"ss"`
    45  	TotalSize   int64     `json:"totalSize" msg:"ts"`
    46  	CurrentSize int64     `json:"currentSize" msg:"cs"`
    47  
    48  	Complete bool `json:"complete" msg:"cmp"`
    49  	Failed   bool `json:"failed" msg:"fl"`
    50  	Canceled bool `json:"canceled" msg:"cnl"`
    51  
    52  	// Internal information.
    53  	QueuedBuckets         []string `json:"-" msg:"bkts"`
    54  	DecommissionedBuckets []string `json:"-" msg:"dbkts"`
    55  
    56  	// Last bucket/object decommissioned.
    57  	Bucket string `json:"-" msg:"bkt"`
    58  	// Captures prefix that is currently being
    59  	// decommissioned inside the 'Bucket'
    60  	Prefix string `json:"-" msg:"pfx"`
    61  	Object string `json:"-" msg:"obj"`
    62  
    63  	// Verbose information
    64  	ItemsDecommissioned     int64 `json:"objectsDecommissioned" msg:"id"`
    65  	ItemsDecommissionFailed int64 `json:"objectsDecommissionedFailed" msg:"idf"`
    66  	BytesDone               int64 `json:"bytesDecommissioned" msg:"bd"`
    67  	BytesFailed             int64 `json:"bytesDecommissionedFailed" msg:"bf"`
    68  }
    69  
     70  // Clone makes a copy of PoolDecommissionInfo
    71  func (pd *PoolDecommissionInfo) Clone() *PoolDecommissionInfo {
    72  	if pd == nil {
    73  		return nil
    74  	}
    75  	return &PoolDecommissionInfo{
    76  		StartTime:               pd.StartTime,
    77  		StartSize:               pd.StartSize,
    78  		TotalSize:               pd.TotalSize,
    79  		CurrentSize:             pd.CurrentSize,
    80  		Complete:                pd.Complete,
    81  		Failed:                  pd.Failed,
    82  		Canceled:                pd.Canceled,
    83  		QueuedBuckets:           pd.QueuedBuckets,
    84  		DecommissionedBuckets:   pd.DecommissionedBuckets,
    85  		Bucket:                  pd.Bucket,
    86  		Prefix:                  pd.Prefix,
    87  		Object:                  pd.Object,
    88  		ItemsDecommissioned:     pd.ItemsDecommissioned,
    89  		ItemsDecommissionFailed: pd.ItemsDecommissionFailed,
    90  		BytesDone:               pd.BytesDone,
    91  		BytesFailed:             pd.BytesFailed,
    92  	}
    93  }
    94  
    95  // bucketPop should be called when a bucket is done decommissioning.
    96  // Adds the bucket to the list of decommissioned buckets and updates resume numbers.
    97  func (pd *PoolDecommissionInfo) bucketPop(bucket string) bool {
    98  	pd.DecommissionedBuckets = append(pd.DecommissionedBuckets, bucket)
    99  	for i, b := range pd.QueuedBuckets {
   100  		if b == bucket {
   101  			// Bucket is done.
   102  			pd.QueuedBuckets = append(pd.QueuedBuckets[:i], pd.QueuedBuckets[i+1:]...)
   103  			// Clear tracker info.
   104  			if pd.Bucket == bucket {
   105  				pd.Bucket = "" // empty this out for next bucket
   106  				pd.Prefix = "" // empty this out for the next bucket
   107  				pd.Object = "" // empty this out for next object
   108  			}
   109  			return true
   110  		}
   111  	}
   112  	return false
   113  }
   114  
   115  func (pd *PoolDecommissionInfo) isBucketDecommissioned(bucket string) bool {
   116  	for _, b := range pd.DecommissionedBuckets {
   117  		if b == bucket {
   118  			return true
   119  		}
   120  	}
   121  	return false
   122  }
   123  
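         // bucketPush queues a bucket for decommissioning unless it is already
         // queued or already decommissioned. Queued entries are stored as
         // "bucket/prefix" strings (see decomBucketInfo.String) and decoded
         // back in PendingBuckets via path2BucketObject.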
   124  func (pd *PoolDecommissionInfo) bucketPush(bucket decomBucketInfo) {
   125  	for _, b := range pd.QueuedBuckets {
   126  		if pd.isBucketDecommissioned(b) {
   127  			return
   128  		}
   129  		if b == bucket.String() {
   130  			return
   131  		}
   132  	}
   133  	pd.QueuedBuckets = append(pd.QueuedBuckets, bucket.String())
   134  	pd.Bucket = bucket.Name
   135  	pd.Prefix = bucket.Prefix
   136  }
   137  
   138  // PoolStatus captures current pool status
   139  type PoolStatus struct {
   140  	ID           int                   `json:"id" msg:"id"`
   141  	CmdLine      string                `json:"cmdline" msg:"cl"`
   142  	LastUpdate   time.Time             `json:"lastUpdate" msg:"lu"`
   143  	Decommission *PoolDecommissionInfo `json:"decommissionInfo,omitempty" msg:"dec"`
   144  }
   145  
   146  // Clone returns a copy of PoolStatus
   147  func (ps PoolStatus) Clone() PoolStatus {
   148  	return PoolStatus{
   149  		ID:           ps.ID,
   150  		CmdLine:      ps.CmdLine,
   151  		LastUpdate:   ps.LastUpdate,
   152  		Decommission: ps.Decommission.Clone(),
   153  	}
   154  }
   155  
   156  //go:generate msgp -file $GOFILE -unexported
   157  type poolMeta struct {
   158  	Version int          `msg:"v"`
   159  	Pools   []PoolStatus `msg:"pls"`
   160  
   161  	// Value should not be saved when we have not loaded anything yet.
   162  	dontSave bool `msg:"-"`
   163  }
   164  
    165  // returnResumablePools returns the pools whose decommission is worth
    166  // resuming upon restart of a cluster.
   167  func (p *poolMeta) returnResumablePools() []PoolStatus {
   168  	var newPools []PoolStatus
   169  	for _, pool := range p.Pools {
   170  		if pool.Decommission == nil {
   171  			continue
   172  		}
   173  		if pool.Decommission.Complete || pool.Decommission.Canceled {
   174  			// Do not resume decommission upon startup for
   175  			// - decommission complete
   176  			// - decommission canceled
   177  			continue
   178  		} // In all other situations we need to resume
   179  		newPools = append(newPools, pool)
   180  	}
   181  	return newPools
   182  }
   183  
   184  func (p *poolMeta) DecommissionComplete(idx int) bool {
   185  	if p.Pools[idx].Decommission != nil && !p.Pools[idx].Decommission.Complete {
   186  		p.Pools[idx].LastUpdate = UTCNow()
   187  		p.Pools[idx].Decommission.Complete = true
   188  		p.Pools[idx].Decommission.Failed = false
   189  		p.Pools[idx].Decommission.Canceled = false
   190  		return true
   191  	}
   192  	return false
   193  }
   194  
   195  func (p *poolMeta) DecommissionFailed(idx int) bool {
   196  	if p.Pools[idx].Decommission != nil && !p.Pools[idx].Decommission.Failed {
   197  		p.Pools[idx].LastUpdate = UTCNow()
   198  		p.Pools[idx].Decommission.StartTime = time.Time{}
   199  		p.Pools[idx].Decommission.Complete = false
   200  		p.Pools[idx].Decommission.Failed = true
   201  		p.Pools[idx].Decommission.Canceled = false
   202  		return true
   203  	}
   204  	return false
   205  }
   206  
   207  func (p *poolMeta) DecommissionCancel(idx int) bool {
   208  	if p.Pools[idx].Decommission != nil && !p.Pools[idx].Decommission.Canceled {
   209  		p.Pools[idx].LastUpdate = UTCNow()
   210  		p.Pools[idx].Decommission.StartTime = time.Time{}
   211  		p.Pools[idx].Decommission.Complete = false
   212  		p.Pools[idx].Decommission.Failed = false
   213  		p.Pools[idx].Decommission.Canceled = true
   214  		return true
   215  	}
   216  	return false
   217  }
   218  
   219  func (p poolMeta) isBucketDecommissioned(idx int, bucket string) bool {
   220  	return p.Pools[idx].Decommission.isBucketDecommissioned(bucket)
   221  }
   222  
   223  func (p *poolMeta) BucketDone(idx int, bucket decomBucketInfo) bool {
   224  	if p.Pools[idx].Decommission == nil {
   225  		// Decommission not in progress.
   226  		return false
   227  	}
   228  	return p.Pools[idx].Decommission.bucketPop(bucket.String())
   229  }
   230  
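         // ResumeBucketObject returns the last bucket and object that were being
         // decommissioned for this pool, so a restarted decommission can resume
         // from where it left off.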
   231  func (p poolMeta) ResumeBucketObject(idx int) (bucket, object string) {
   232  	if p.Pools[idx].Decommission != nil {
   233  		bucket = p.Pools[idx].Decommission.Bucket
   234  		object = p.Pools[idx].Decommission.Object
   235  	}
   236  	return
   237  }
   238  
   239  func (p *poolMeta) TrackCurrentBucketObject(idx int, bucket string, object string) {
   240  	if p.Pools[idx].Decommission == nil {
   241  		// Decommission not in progress.
   242  		return
   243  	}
   244  	p.Pools[idx].Decommission.Bucket = bucket
   245  	p.Pools[idx].Decommission.Object = object
   246  }
   247  
   248  func (p *poolMeta) PendingBuckets(idx int) []decomBucketInfo {
   249  	if p.Pools[idx].Decommission == nil {
   250  		// Decommission not in progress.
   251  		return nil
   252  	}
   253  
   254  	decomBuckets := make([]decomBucketInfo, len(p.Pools[idx].Decommission.QueuedBuckets))
   255  	for i := range decomBuckets {
   256  		bucket, prefix := path2BucketObject(p.Pools[idx].Decommission.QueuedBuckets[i])
   257  		decomBuckets[i] = decomBucketInfo{
   258  			Name:   bucket,
   259  			Prefix: prefix,
   260  		}
   261  	}
   262  
   263  	return decomBuckets
   264  }
   265  
   266  //msgp:ignore decomBucketInfo
   267  type decomBucketInfo struct {
   268  	Name   string
   269  	Prefix string
   270  }
   271  
   272  func (db decomBucketInfo) String() string {
   273  	return pathJoin(db.Name, db.Prefix)
   274  }
   275  
   276  func (p *poolMeta) QueueBuckets(idx int, buckets []decomBucketInfo) {
   277  	// add new queued buckets
   278  	for _, bucket := range buckets {
   279  		p.Pools[idx].Decommission.bucketPush(bucket)
   280  	}
   281  }
   282  
   283  var (
   284  	errDecommissionAlreadyRunning = errors.New("decommission is already in progress")
   285  	errDecommissionComplete       = errors.New("decommission is complete, please remove the servers from command-line")
   286  	errDecommissionNotStarted     = errors.New("decommission is not in progress")
   287  )
   288  
   289  func (p *poolMeta) Decommission(idx int, pi poolSpaceInfo) error {
    290  	// Return an error when a decommission is ongoing - the user needs
    291  	// to explicitly cancel it first in order to restart decommissioning.
   292  	if p.Pools[idx].Decommission != nil &&
   293  		!p.Pools[idx].Decommission.Complete &&
   294  		!p.Pools[idx].Decommission.Failed &&
   295  		!p.Pools[idx].Decommission.Canceled {
   296  		return errDecommissionAlreadyRunning
   297  	}
   298  
   299  	now := UTCNow()
   300  	p.Pools[idx].LastUpdate = now
   301  	p.Pools[idx].Decommission = &PoolDecommissionInfo{
   302  		StartTime:   now,
   303  		StartSize:   pi.Free,
   304  		CurrentSize: pi.Free,
   305  		TotalSize:   pi.Total,
   306  	}
   307  	return nil
   308  }
   309  
   310  func (p poolMeta) IsSuspended(idx int) bool {
   311  	if idx >= len(p.Pools) {
   312  		// We don't really know if the pool is suspended or not, since it doesn't exist.
   313  		return false
   314  	}
   315  	return p.Pools[idx].Decommission != nil
   316  }
   317  
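         // validate compares the pools remembered in pool.bin against the pools
         // specified on the command line. It reports whether pool.bin needs to be
         // updated (for example a pool was added, removed or reordered) and errors
         // out if the command line still includes a pool that has already been
         // fully decommissioned.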
   318  func (p *poolMeta) validate(pools []*erasureSets) (bool, error) {
   319  	type poolInfo struct {
   320  		position     int
   321  		completed    bool
   322  		decomStarted bool // started but not finished yet
   323  	}
   324  
   325  	rememberedPools := make(map[string]poolInfo)
   326  	for idx, pool := range p.Pools {
   327  		complete := false
   328  		decomStarted := false
   329  		if pool.Decommission != nil {
   330  			if pool.Decommission.Complete {
   331  				complete = true
   332  			}
   333  			decomStarted = true
   334  		}
   335  		rememberedPools[pool.CmdLine] = poolInfo{
   336  			position:     idx,
   337  			completed:    complete,
   338  			decomStarted: decomStarted,
   339  		}
   340  	}
   341  
   342  	specifiedPools := make(map[string]int)
   343  	for idx, pool := range pools {
   344  		specifiedPools[pool.endpoints.CmdLine] = idx
   345  	}
   346  
   347  	var update bool
    348  	// Check whether any specified pool was already decommissioned (an error) or is newly added (requires an update).
   349  	for k := range specifiedPools {
   350  		pi, ok := rememberedPools[k]
   351  		if !ok {
    352  			// this pool is not in the previously remembered set; since the rest of
    353  			// the command line checks out, allow the update - we are most likely adding a new pool here.
   354  			update = true
   355  		}
   356  		if ok && pi.completed {
   357  			return false, fmt.Errorf("pool(%s) = %s is decommissioned, please remove from server command line", humanize.Ordinal(pi.position+1), k)
   358  		}
   359  	}
   360  
   361  	if len(specifiedPools) == len(rememberedPools) {
   362  		for k, pi := range rememberedPools {
   363  			pos, ok := specifiedPools[k]
   364  			if ok && pos != pi.position {
    365  				update = true // pool order is changing, it's okay to allow it.
   366  			}
   367  		}
   368  	}
   369  
   370  	if !update {
   371  		update = len(specifiedPools) != len(rememberedPools)
   372  	}
   373  
   374  	return update, nil
   375  }
   376  
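         // load reads pool.bin from the given pool and decodes it into p. A missing
         // or empty pool.bin is not an error (fresh setup); an unknown header format
         // or version is.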
   377  func (p *poolMeta) load(ctx context.Context, pool *erasureSets, pools []*erasureSets) error {
   378  	data, err := readConfig(ctx, pool, poolMetaName)
   379  	if err != nil {
   380  		if errors.Is(err, errConfigNotFound) || isErrObjectNotFound(err) {
   381  			return nil
   382  		}
   383  		return err
   384  	}
   385  	if len(data) == 0 {
    386  		// Seems to be empty; create a new poolMeta object.
   387  		return nil
   388  	}
   389  	if len(data) <= 4 {
   390  		return fmt.Errorf("poolMeta: no data")
   391  	}
   392  	// Read header
   393  	switch binary.LittleEndian.Uint16(data[0:2]) {
   394  	case poolMetaFormat:
   395  	default:
   396  		return fmt.Errorf("poolMeta: unknown format: %d", binary.LittleEndian.Uint16(data[0:2]))
   397  	}
   398  	switch binary.LittleEndian.Uint16(data[2:4]) {
   399  	case poolMetaVersion:
   400  	default:
   401  		return fmt.Errorf("poolMeta: unknown version: %d", binary.LittleEndian.Uint16(data[2:4]))
   402  	}
   403  
   404  	// OK, parse data.
   405  	if _, err = p.UnmarshalMsg(data[4:]); err != nil {
   406  		return err
   407  	}
   408  
   409  	switch p.Version {
   410  	case poolMetaVersionV1:
   411  	default:
   412  		return fmt.Errorf("unexpected pool meta version: %d", p.Version)
   413  	}
   414  
   415  	return nil
   416  }
   417  
   418  func (p *poolMeta) CountItem(idx int, size int64, failed bool) {
   419  	pd := p.Pools[idx].Decommission
   420  	if pd == nil {
   421  		return
   422  	}
   423  	if failed {
   424  		pd.ItemsDecommissionFailed++
   425  		pd.BytesFailed += size
   426  	} else {
   427  		pd.ItemsDecommissioned++
   428  		pd.BytesDone += size
   429  	}
   430  	p.Pools[idx].Decommission = pd
   431  }
   432  
   433  func (p *poolMeta) updateAfter(ctx context.Context, idx int, pools []*erasureSets, duration time.Duration) (bool, error) {
   434  	if p.Pools[idx].Decommission == nil {
   435  		return false, errInvalidArgument
   436  	}
   437  	now := UTCNow()
   438  	if now.Sub(p.Pools[idx].LastUpdate) >= duration {
   439  		if serverDebugLog {
   440  			console.Debugf("decommission: persisting poolMeta on drive: threshold:%s, poolMeta:%#v\n", now.Sub(p.Pools[idx].LastUpdate), p.Pools[idx])
   441  		}
   442  		p.Pools[idx].LastUpdate = now
   443  		if err := p.save(ctx, pools); err != nil {
   444  			return false, err
   445  		}
   446  		return true, nil
   447  	}
   448  	return false, nil
   449  }
   450  
   451  func (p poolMeta) save(ctx context.Context, pools []*erasureSets) error {
   452  	if p.dontSave {
   453  		return nil
   454  	}
   455  	data := make([]byte, 4, p.Msgsize()+4)
   456  
   457  	// Initialize the header.
   458  	binary.LittleEndian.PutUint16(data[0:2], poolMetaFormat)
   459  	binary.LittleEndian.PutUint16(data[2:4], poolMetaVersion)
   460  
   461  	buf, err := p.MarshalMsg(data)
   462  	if err != nil {
   463  		return err
   464  	}
   465  
    466  	// Save on all pools to make sure decommissioning of the first pool is allowed.
   467  	for i, eset := range pools {
   468  		if err = saveConfig(ctx, eset, poolMetaName, buf); err != nil {
   469  			if !errors.Is(err, context.Canceled) {
   470  				logger.LogIf(ctx, fmt.Errorf("saving pool.bin for pool index %d failed with: %v", i, err))
   471  			}
   472  			return err
   473  		}
   474  	}
   475  	return nil
   476  }
   477  
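         // pool.bin on-disk layout: a 4-byte header of two little-endian uint16
         // values (format, version) followed by the msgp-encoded poolMeta.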
   478  const (
   479  	poolMetaName      = "pool.bin"
   480  	poolMetaFormat    = 1
   481  	poolMetaVersionV1 = 1
   482  	poolMetaVersion   = poolMetaVersionV1
   483  )
   484  
    485  // Init initializes pools and saves additional information about them
    486  // in 'pool.bin'; this is eventually used for decommissioning pools.
   487  func (z *erasureServerPools) Init(ctx context.Context) error {
   488  	// Load rebalance metadata if present
   489  	err := z.loadRebalanceMeta(ctx)
   490  	if err != nil {
   491  		return fmt.Errorf("failed to load rebalance data: %w", err)
   492  	}
   493  
   494  	// Start rebalance routine
   495  	z.StartRebalance()
   496  
   497  	meta := poolMeta{}
   498  	if err := meta.load(ctx, z.serverPools[0], z.serverPools); err != nil {
   499  		return err
   500  	}
   501  
   502  	update, err := meta.validate(z.serverPools)
   503  	if err != nil {
   504  		return err
   505  	}
   506  
   507  	// if no update is needed return right away.
   508  	if !update {
   509  		z.poolMetaMutex.Lock()
   510  		z.poolMeta = meta
   511  		z.poolMetaMutex.Unlock()
   512  	} else {
   513  		newMeta := newPoolMeta(z, meta)
   514  		if err = newMeta.save(ctx, z.serverPools); err != nil {
   515  			return err
   516  		}
   517  		z.poolMetaMutex.Lock()
   518  		z.poolMeta = newMeta
   519  		z.poolMetaMutex.Unlock()
   520  	}
   521  
   522  	pools := meta.returnResumablePools()
   523  	poolIndices := make([]int, 0, len(pools))
   524  	for _, pool := range pools {
   525  		idx := globalEndpoints.GetPoolIdx(pool.CmdLine)
   526  		if idx == -1 {
   527  			return fmt.Errorf("unexpected state present for decommission status pool(%s) not found", pool.CmdLine)
   528  		}
   529  		poolIndices = append(poolIndices, idx)
   530  	}
   531  
   532  	if len(poolIndices) > 0 && globalEndpoints[poolIndices[0]].Endpoints[0].IsLocal {
   533  		go func() {
   534  			r := rand.New(rand.NewSource(time.Now().UnixNano()))
   535  			for {
   536  				if err := z.Decommission(ctx, poolIndices...); err != nil {
   537  					if errors.Is(err, errDecommissionAlreadyRunning) {
    538  						// A previous decommission run was found; restart it.
   539  						for _, idx := range poolIndices {
   540  							z.doDecommissionInRoutine(ctx, idx)
   541  						}
   542  						return
   543  					}
   544  					if configRetriableErrors(err) {
   545  						logger.LogIf(ctx, fmt.Errorf("Unable to resume decommission of pools %v: %w: retrying..", pools, err))
   546  						time.Sleep(time.Second + time.Duration(r.Float64()*float64(5*time.Second)))
   547  						continue
   548  					}
   549  					logger.LogIf(ctx, fmt.Errorf("Unable to resume decommission of pool %v: %w", pools, err))
   550  					return
   551  				}
   552  			}
   553  		}()
   554  	}
   555  
   556  	return nil
   557  }
   558  
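         // newPoolMeta builds a fresh poolMeta for the currently configured server
         // pools, carrying over any previously recorded status for pools whose
         // command line is unchanged.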
   559  func newPoolMeta(z *erasureServerPools, prevMeta poolMeta) poolMeta {
    560  	newMeta := poolMeta{} // write out a fresh poolMeta.
    561  	// Looks like a new pool was added and we need to update,
    562  	// or this is a fresh installation (or an existing
    563  	// installation with a pool removed).
   564  	newMeta.Version = poolMetaVersion
   565  	for idx, pool := range z.serverPools {
   566  		var skip bool
   567  		for _, currentPool := range prevMeta.Pools {
   568  			// Preserve any current pool status.
   569  			if currentPool.CmdLine == pool.endpoints.CmdLine {
   570  				newMeta.Pools = append(newMeta.Pools, currentPool)
   571  				skip = true
   572  				break
   573  			}
   574  		}
   575  		if skip {
   576  			continue
   577  		}
   578  		newMeta.Pools = append(newMeta.Pools, PoolStatus{
   579  			CmdLine:    pool.endpoints.CmdLine,
   580  			ID:         idx,
   581  			LastUpdate: UTCNow(),
   582  		})
   583  	}
   584  	return newMeta
   585  }
   586  
   587  func (z *erasureServerPools) IsDecommissionRunning() bool {
   588  	z.poolMetaMutex.RLock()
   589  	defer z.poolMetaMutex.RUnlock()
   590  	meta := z.poolMeta
   591  	for _, pool := range meta.Pools {
   592  		if pool.Decommission != nil &&
   593  			!pool.Decommission.Complete &&
   594  			!pool.Decommission.Failed &&
   595  			!pool.Decommission.Canceled {
   596  			return true
   597  		}
   598  	}
   599  
   600  	return false
   601  }
   602  
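         // decommissionObject copies a single object version read from the pool
         // being decommissioned onto the remaining pools. Multipart objects are
         // re-uploaded part by part, preserving ETags and part indexes so that
         // metadata and decompression behavior stay identical.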
   603  func (z *erasureServerPools) decommissionObject(ctx context.Context, bucket string, gr *GetObjectReader) (err error) {
   604  	objInfo := gr.ObjInfo
   605  
   606  	defer func() {
   607  		gr.Close()
   608  		auditLogDecom(ctx, "DecomCopyData", objInfo.Bucket, objInfo.Name, objInfo.VersionID, err)
   609  	}()
   610  
   611  	actualSize, err := objInfo.GetActualSize()
   612  	if err != nil {
   613  		return err
   614  	}
   615  
   616  	if objInfo.isMultipart() {
   617  		res, err := z.NewMultipartUpload(ctx, bucket, objInfo.Name, ObjectOptions{
   618  			VersionID:   objInfo.VersionID,
   619  			UserDefined: objInfo.UserDefined,
   620  			NoAuditLog:  true,
   621  		})
   622  		if err != nil {
   623  			return fmt.Errorf("decommissionObject: NewMultipartUpload() %w", err)
   624  		}
   625  		defer z.AbortMultipartUpload(ctx, bucket, objInfo.Name, res.UploadID, ObjectOptions{NoAuditLog: true})
   626  		parts := make([]CompletePart, len(objInfo.Parts))
   627  		for i, part := range objInfo.Parts {
   628  			hr, err := hash.NewReader(ctx, io.LimitReader(gr, part.Size), part.Size, "", "", part.ActualSize)
   629  			if err != nil {
   630  				return fmt.Errorf("decommissionObject: hash.NewReader() %w", err)
   631  			}
   632  			pi, err := z.PutObjectPart(ctx, bucket, objInfo.Name, res.UploadID,
   633  				part.Number,
   634  				NewPutObjReader(hr),
   635  				ObjectOptions{
   636  					PreserveETag: part.ETag, // Preserve original ETag to ensure same metadata.
   637  					IndexCB: func() []byte {
   638  						return part.Index // Preserve part Index to ensure decompression works.
   639  					},
   640  					NoAuditLog: true,
   641  				})
   642  			if err != nil {
   643  				return fmt.Errorf("decommissionObject: PutObjectPart() %w", err)
   644  			}
   645  			parts[i] = CompletePart{
   646  				ETag:           pi.ETag,
   647  				PartNumber:     pi.PartNumber,
   648  				ChecksumCRC32:  pi.ChecksumCRC32,
   649  				ChecksumCRC32C: pi.ChecksumCRC32C,
   650  				ChecksumSHA256: pi.ChecksumSHA256,
   651  				ChecksumSHA1:   pi.ChecksumSHA1,
   652  			}
   653  		}
   654  		_, err = z.CompleteMultipartUpload(ctx, bucket, objInfo.Name, res.UploadID, parts, ObjectOptions{
   655  			DataMovement: true,
   656  			MTime:        objInfo.ModTime,
   657  			NoAuditLog:   true,
   658  		})
   659  		if err != nil {
   660  			err = fmt.Errorf("decommissionObject: CompleteMultipartUpload() %w", err)
   661  		}
   662  		return err
   663  	}
   664  
   665  	hr, err := hash.NewReader(ctx, io.LimitReader(gr, objInfo.Size), objInfo.Size, "", "", actualSize)
   666  	if err != nil {
   667  		return fmt.Errorf("decommissionObject: hash.NewReader() %w", err)
   668  	}
   669  
   670  	_, err = z.PutObject(ctx,
   671  		bucket,
   672  		objInfo.Name,
   673  		NewPutObjReader(hr),
   674  		ObjectOptions{
   675  			DataMovement: true,
   676  			VersionID:    objInfo.VersionID,
   677  			MTime:        objInfo.ModTime,
   678  			UserDefined:  objInfo.UserDefined,
   679  			PreserveETag: objInfo.ETag, // Preserve original ETag to ensure same metadata.
   680  			IndexCB: func() []byte {
   681  				return objInfo.Parts[0].Index // Preserve part Index to ensure decompression works.
   682  			},
   683  			NoAuditLog: true,
   684  		})
   685  	if err != nil {
   686  		err = fmt.Errorf("decommissionObject: PutObject() %w", err)
   687  	}
   688  	return err
   689  }
   690  
    691  // versionsSorter sorts FileInfo slices by modification time, oldest first.
   692  //
   693  //msgp:ignore versionsSorter
   694  type versionsSorter []FileInfo
   695  
   696  func (v versionsSorter) reverse() {
   697  	sort.Slice(v, func(i, j int) bool {
   698  		return v[i].ModTime.Before(v[j].ModTime)
   699  	})
   700  }
   701  
   702  func (set *erasureObjects) listObjectsToDecommission(ctx context.Context, bi decomBucketInfo, fn func(entry metaCacheEntry)) error {
   703  	disks, _ := set.getOnlineDisksWithHealing(false)
   704  	if len(disks) == 0 {
   705  		return fmt.Errorf("no online drives found for set with endpoints %s", set.getEndpoints())
   706  	}
   707  
   708  	// However many we ask, versions must exist on ~50%
   709  	listingQuorum := (set.setDriveCount + 1) / 2
   710  
   711  	// How to resolve partial results.
   712  	resolver := metadataResolutionParams{
   713  		dirQuorum: listingQuorum, // make sure to capture all quorum ratios
   714  		objQuorum: listingQuorum, // make sure to capture all quorum ratios
   715  		bucket:    bi.Name,
   716  	}
   717  
   718  	err := listPathRaw(ctx, listPathRawOptions{
   719  		disks:          disks,
   720  		bucket:         bi.Name,
   721  		path:           bi.Prefix,
   722  		recursive:      true,
   723  		forwardTo:      "",
   724  		minDisks:       listingQuorum,
   725  		reportNotFound: false,
   726  		agreed:         fn,
   727  		partial: func(entries metaCacheEntries, _ []error) {
   728  			entry, ok := entries.resolve(&resolver)
   729  			if ok {
   730  				fn(*entry)
   731  			}
   732  		},
   733  		finished: nil,
   734  	})
   735  	return err
   736  }
   737  
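         // decommissionPool drains one bucket/prefix from every erasure set in the
         // pool: each set gets a listing goroutine, and a shared worker pool copies
         // each listed entry (all of its versions) to the remaining pools, deleting
         // it from this pool once every version has been moved.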
   738  func (z *erasureServerPools) decommissionPool(ctx context.Context, idx int, pool *erasureSets, bi decomBucketInfo) error {
   739  	ctx = logger.SetReqInfo(ctx, &logger.ReqInfo{})
   740  
   741  	const envDecomWorkers = "_MINIO_DECOMMISSION_WORKERS"
   742  	workerSize, err := env.GetInt(envDecomWorkers, len(pool.sets))
   743  	if err != nil {
   744  		logger.LogIf(ctx, fmt.Errorf("invalid workers value err: %v, defaulting to %d", err, len(pool.sets)))
   745  		workerSize = len(pool.sets)
   746  	}
   747  
    748  	// Each decom worker needs one List() goroutine/worker;
    749  	// add that many extra workers.
   750  	workerSize += len(pool.sets)
   751  
   752  	wk, err := workers.New(workerSize)
   753  	if err != nil {
   754  		return err
   755  	}
   756  
   757  	vc, _ := globalBucketVersioningSys.Get(bi.Name)
   758  
   759  	// Check if the current bucket has a configured lifecycle policy
   760  	lc, _ := globalLifecycleSys.Get(bi.Name)
   761  
   762  	// Check if bucket is object locked.
   763  	lr, _ := globalBucketObjectLockSys.Get(bi.Name)
   764  	rcfg, _ := getReplicationConfig(ctx, bi.Name)
   765  
   766  	for setIdx, set := range pool.sets {
   767  		set := set
   768  
   769  		filterLifecycle := func(bucket, object string, fi FileInfo) bool {
   770  			if lc == nil {
   771  				return false
   772  			}
   773  			versioned := vc != nil && vc.Versioned(object)
   774  			objInfo := fi.ToObjectInfo(bucket, object, versioned)
   775  
   776  			evt := evalActionFromLifecycle(ctx, *lc, lr, rcfg, objInfo)
   777  			switch {
   778  			case evt.Action.DeleteRestored(): // if restored copy has expired, delete it synchronously
   779  				applyExpiryOnTransitionedObject(ctx, z, objInfo, evt, lcEventSrc_Decom)
   780  				return false
   781  			case evt.Action.Delete():
   782  				globalExpiryState.enqueueByDays(objInfo, evt, lcEventSrc_Decom)
   783  				return true
   784  			default:
   785  				return false
   786  			}
   787  		}
   788  
   789  		decommissionEntry := func(entry metaCacheEntry) {
   790  			defer wk.Give()
   791  
   792  			if entry.isDir() {
   793  				return
   794  			}
   795  
   796  			fivs, err := entry.fileInfoVersions(bi.Name)
   797  			if err != nil {
   798  				return
   799  			}
   800  
   801  			// We need a reversed order for decommissioning,
   802  			// to create the appropriate stack.
   803  			versionsSorter(fivs.Versions).reverse()
   804  
   805  			var decommissioned, expired int
   806  			for _, version := range fivs.Versions {
   807  				stopFn := globalDecommissionMetrics.log(decomMetricDecommissionObject, idx, bi.Name, version.Name, version.VersionID)
   808  				// Apply lifecycle rules on the objects that are expired.
   809  				if filterLifecycle(bi.Name, version.Name, version) {
   810  					expired++
   811  					decommissioned++
   812  					stopFn(errors.New("ILM expired object/version will be skipped"))
   813  					continue
   814  				}
   815  
    816  				// Any object with only a single DELETE marker does not need
    817  				// to be decommissioned - just skip it. This also accounts for
    818  				// any other versions that have already expired.
   819  				remainingVersions := len(fivs.Versions) - expired
   820  				if version.Deleted && remainingVersions == 1 {
   821  					decommissioned++
   822  					stopFn(errors.New("DELETE marked object with no other non-current versions will be skipped"))
   823  					continue
   824  				}
   825  
   826  				versionID := version.VersionID
   827  				if versionID == "" {
   828  					versionID = nullVersionID
   829  				}
   830  
   831  				if version.Deleted {
   832  					_, err := z.DeleteObject(ctx,
   833  						bi.Name,
   834  						version.Name,
   835  						ObjectOptions{
    836  							// Since we are preserving a delete marker, we have to make sure this is always true:
    837  							// regardless of the bucket's current configuration, we must preserve all versions
    838  							// on the pool being decommissioned.
   839  							Versioned:          true,
   840  							VersionID:          versionID,
   841  							MTime:              version.ModTime,
   842  							DeleteReplication:  version.ReplicationState,
   843  							DeleteMarker:       true, // make sure we create a delete marker
   844  							SkipDecommissioned: true, // make sure we skip the decommissioned pool
   845  							NoAuditLog:         true,
   846  						})
   847  					var failure bool
   848  					if err != nil {
   849  						if isErrObjectNotFound(err) || isErrVersionNotFound(err) {
   850  							err = nil
   851  						}
   852  					}
   853  					stopFn(err)
   854  					if err != nil {
   855  						logger.LogIf(ctx, err)
   856  						failure = true
   857  					}
   858  					z.poolMetaMutex.Lock()
   859  					z.poolMeta.CountItem(idx, 0, failure)
   860  					z.poolMetaMutex.Unlock()
   861  					if !failure {
   862  						// Success keep a count.
   863  						decommissioned++
   864  					}
   865  					auditLogDecom(ctx, "DecomCopyDeleteMarker", bi.Name, version.Name, versionID, err)
   866  					continue
   867  				}
   868  
   869  				var failure, ignore bool
   870  				// gr.Close() is ensured by decommissionObject().
   871  				for try := 0; try < 3; try++ {
   872  					if version.IsRemote() {
   873  						if err := z.DecomTieredObject(ctx, bi.Name, version.Name, version, ObjectOptions{
   874  							VersionID:   versionID,
   875  							MTime:       version.ModTime,
   876  							UserDefined: version.Metadata,
   877  						}); err != nil {
   878  							stopFn(err)
   879  							failure = true
   880  							logger.LogIf(ctx, err)
   881  							continue
   882  						}
   883  						stopFn(nil)
   884  						failure = false
   885  						break
   886  					}
   887  					gr, err := set.GetObjectNInfo(ctx,
   888  						bi.Name,
   889  						encodeDirObject(version.Name),
   890  						nil,
   891  						http.Header{},
   892  						ObjectOptions{
   893  							VersionID:    versionID,
   894  							NoDecryption: true,
   895  							NoLock:       true,
   896  							NoAuditLog:   true,
   897  						})
   898  					if isErrObjectNotFound(err) || isErrVersionNotFound(err) {
    899  						// Object deleted by the application; nothing to do here, we move on.
   900  						ignore = true
   901  						stopFn(nil)
   902  						break
   903  					}
   904  					if err != nil && !ignore {
    905  						// if usage-cache.bin is not readable, log and ignore it.
   906  						if bi.Name == minioMetaBucket && strings.Contains(version.Name, dataUsageCacheName) {
   907  							ignore = true
   908  							stopFn(err)
   909  							logger.LogIf(ctx, err)
   910  							break
   911  						}
   912  					}
   913  					if err != nil {
   914  						failure = true
   915  						logger.LogIf(ctx, err)
   916  						stopFn(err)
   917  						continue
   918  					}
   919  					if err = z.decommissionObject(ctx, bi.Name, gr); err != nil {
   920  						stopFn(err)
   921  						failure = true
   922  						logger.LogIf(ctx, err)
   923  						continue
   924  					}
   925  					stopFn(nil)
   926  					failure = false
   927  					break
   928  				}
   929  				if ignore {
   930  					continue
   931  				}
   932  				z.poolMetaMutex.Lock()
   933  				z.poolMeta.CountItem(idx, version.Size, failure)
   934  				z.poolMetaMutex.Unlock()
   935  				if failure {
   936  					break // break out on first error
   937  				}
   938  				decommissioned++
   939  			}
   940  
   941  			// if all versions were decommissioned, then we can delete the object versions.
   942  			if decommissioned == len(fivs.Versions) {
   943  				stopFn := globalDecommissionMetrics.log(decomMetricDecommissionRemoveObject, idx, bi.Name, entry.name)
   944  				_, err := set.DeleteObject(ctx,
   945  					bi.Name,
   946  					encodeDirObject(entry.name),
   947  					ObjectOptions{
   948  						DeletePrefix:       true, // use prefix delete to delete all versions at once.
   949  						DeletePrefixObject: true, // use prefix delete on exact object (this is an optimization to avoid fan-out calls)
   950  						NoAuditLog:         true,
   951  					},
   952  				)
   953  				stopFn(err)
   954  				auditLogDecom(ctx, "DecomDeleteObject", bi.Name, entry.name, "", err)
   955  				if err != nil {
   956  					logger.LogIf(ctx, err)
   957  				}
   958  			}
   959  			z.poolMetaMutex.Lock()
   960  			z.poolMeta.TrackCurrentBucketObject(idx, bi.Name, entry.name)
   961  			ok, err := z.poolMeta.updateAfter(ctx, idx, z.serverPools, 30*time.Second)
   962  			logger.LogIf(ctx, err)
   963  			if ok {
   964  				globalNotificationSys.ReloadPoolMeta(ctx)
   965  			}
   966  			z.poolMetaMutex.Unlock()
   967  		}
   968  
   969  		wk.Take()
   970  		go func(setIdx int) {
   971  			defer wk.Give()
    972  			// We will perpetually retry listing if it fails, since we cannot
    973  			// possibly give up here.
   974  			for {
   975  				if contextCanceled(ctx) {
   976  					break
   977  				}
   978  
   979  				err := set.listObjectsToDecommission(ctx, bi,
   980  					func(entry metaCacheEntry) {
   981  						wk.Take()
   982  						go decommissionEntry(entry)
   983  					},
   984  				)
   985  				if err == nil || errors.Is(err, context.Canceled) {
   986  					break
   987  				}
   988  				setN := humanize.Ordinal(setIdx + 1)
   989  				retryDur := time.Duration(rand.Float64() * float64(5*time.Second))
   990  				logger.LogOnceIf(ctx, fmt.Errorf("listing objects from %s set failed with %v, retrying in %v", setN, err, retryDur), "decom-listing-failed"+setN)
   991  				time.Sleep(retryDur)
   992  			}
   993  		}(setIdx)
   994  	}
   995  	wk.Wait()
   996  	return nil
   997  }
   998  
   999  //msgp:ignore decomMetrics
  1000  type decomMetrics struct{}
  1001  
  1002  var globalDecommissionMetrics decomMetrics
  1003  
  1004  //msgp:ignore decomMetric
  1005  //go:generate stringer -type=decomMetric -trimprefix=decomMetric $GOFILE
  1006  type decomMetric uint8
  1007  
  1008  const (
  1009  	decomMetricDecommissionBucket decomMetric = iota
  1010  	decomMetricDecommissionObject
  1011  	decomMetricDecommissionRemoveObject
  1012  )
  1013  
  1014  func decomTrace(d decomMetric, poolIdx int, startTime time.Time, duration time.Duration, path string, err error) madmin.TraceInfo {
  1015  	var errStr string
  1016  	if err != nil {
  1017  		errStr = err.Error()
  1018  	}
  1019  	return madmin.TraceInfo{
  1020  		TraceType: madmin.TraceDecommission,
  1021  		Time:      startTime,
  1022  		NodeName:  globalLocalNodeName,
  1023  		FuncName:  fmt.Sprintf("decommission.%s (pool-id=%d)", d.String(), poolIdx),
  1024  		Duration:  duration,
  1025  		Path:      path,
  1026  		Error:     errStr,
  1027  	}
  1028  }
  1029  
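         // log starts a timer for a decommission operation and returns a function
         // that, when called with the final error, publishes a trace event to any
         // madmin trace subscribers. Typical usage:
         //
         //	stopFn := globalDecommissionMetrics.log(decomMetricDecommissionObject, idx, bucket, object)
         //	// ... do the work ...
         //	stopFn(err)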
  1030  func (m *decomMetrics) log(d decomMetric, poolIdx int, paths ...string) func(err error) {
  1031  	startTime := time.Now()
  1032  	return func(err error) {
  1033  		duration := time.Since(startTime)
  1034  		if globalTrace.NumSubscribers(madmin.TraceDecommission) > 0 {
  1035  			globalTrace.Publish(decomTrace(d, poolIdx, startTime, duration, strings.Join(paths, " "), err))
  1036  		}
  1037  	}
  1038  }
  1039  
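         // decommissionInBackground walks the buckets queued for this pool, skips
         // any bucket already marked decommissioned, and persists pool.bin after
         // each bucket completes so progress survives restarts.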
  1040  func (z *erasureServerPools) decommissionInBackground(ctx context.Context, idx int) error {
  1041  	pool := z.serverPools[idx]
  1042  	z.poolMetaMutex.RLock()
  1043  	pending := z.poolMeta.PendingBuckets(idx)
  1044  	z.poolMetaMutex.RUnlock()
  1045  
  1046  	for _, bucket := range pending {
  1047  		z.poolMetaMutex.RLock()
  1048  		isDecommissioned := z.poolMeta.isBucketDecommissioned(idx, bucket.String())
  1049  		z.poolMetaMutex.RUnlock()
  1050  		if isDecommissioned {
  1051  			if serverDebugLog {
  1052  				console.Debugln("decommission: already done, moving on", bucket)
  1053  			}
  1054  
  1055  			z.poolMetaMutex.Lock()
  1056  			if z.poolMeta.BucketDone(idx, bucket) {
  1057  				// remove from pendingBuckets and persist.
  1058  				logger.LogIf(ctx, z.poolMeta.save(ctx, z.serverPools))
  1059  			}
  1060  			z.poolMetaMutex.Unlock()
  1061  			continue
  1062  		}
  1063  		if serverDebugLog {
  1064  			console.Debugln("decommission: currently on bucket", bucket.Name)
  1065  		}
  1066  		stopFn := globalDecommissionMetrics.log(decomMetricDecommissionBucket, idx, bucket.Name)
  1067  		if err := z.decommissionPool(ctx, idx, pool, bucket); err != nil {
  1068  			stopFn(err)
  1069  			return err
  1070  		}
  1071  		stopFn(nil)
  1072  
  1073  		z.poolMetaMutex.Lock()
  1074  		if z.poolMeta.BucketDone(idx, bucket) {
  1075  			logger.LogIf(ctx, z.poolMeta.save(ctx, z.serverPools))
  1076  		}
  1077  		z.poolMetaMutex.Unlock()
  1078  	}
  1079  	return nil
  1080  }
  1081  
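         // checkAfterDecom re-lists every bucket on the decommissioned pool and
         // returns an error if any object versions remain, other than versions
         // expired by lifecycle rules and the auto-generated usage cache entries.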
  1082  func (z *erasureServerPools) checkAfterDecom(ctx context.Context, idx int) error {
  1083  	buckets, err := z.getBucketsToDecommission(ctx)
  1084  	if err != nil {
  1085  		return err
  1086  	}
  1087  
  1088  	pool := z.serverPools[idx]
  1089  	for _, set := range pool.sets {
  1090  		for _, bi := range buckets {
  1091  			vc, _ := globalBucketVersioningSys.Get(bi.Name)
  1092  
  1093  			// Check if the current bucket has a configured lifecycle policy
  1094  			lc, _ := globalLifecycleSys.Get(bi.Name)
  1095  
  1096  			// Check if bucket is object locked.
  1097  			lr, _ := globalBucketObjectLockSys.Get(bi.Name)
  1098  			rcfg, _ := getReplicationConfig(ctx, bi.Name)
  1099  
  1100  			filterLifecycle := func(bucket, object string, fi FileInfo) bool {
  1101  				if lc == nil {
  1102  					return false
  1103  				}
  1104  				versioned := vc != nil && vc.Versioned(object)
  1105  				objInfo := fi.ToObjectInfo(bucket, object, versioned)
  1106  
  1107  				evt := evalActionFromLifecycle(ctx, *lc, lr, rcfg, objInfo)
  1108  				switch {
   1109  				case evt.Action.DeleteRestored(): // if restored copy has expired, delete it synchronously
  1110  					applyExpiryOnTransitionedObject(ctx, z, objInfo, evt, lcEventSrc_Decom)
  1111  					return false
  1112  				case evt.Action.Delete():
  1113  					globalExpiryState.enqueueByDays(objInfo, evt, lcEventSrc_Decom)
  1114  					return true
  1115  				default:
  1116  					return false
  1117  				}
  1118  			}
  1119  
  1120  			var versionsFound int
  1121  			err := set.listObjectsToDecommission(ctx, bi, func(entry metaCacheEntry) {
  1122  				if !entry.isObject() {
  1123  					return
  1124  				}
  1125  
  1126  				fivs, err := entry.fileInfoVersions(bi.Name)
  1127  				if err != nil {
  1128  					return
  1129  				}
  1130  
  1131  				// We need a reversed order for decommissioning,
  1132  				// to create the appropriate stack.
  1133  				versionsSorter(fivs.Versions).reverse()
  1134  
  1135  				for _, version := range fivs.Versions {
  1136  					// Apply lifecycle rules on the objects that are expired.
  1137  					if filterLifecycle(bi.Name, version.Name, version) {
  1138  						continue
  1139  					}
  1140  
   1141  					// `.usage-cache.bin` still exists; it must not be readable, so ignore it.
  1142  					if bi.Name == minioMetaBucket && strings.Contains(version.Name, dataUsageCacheName) {
   1143  						// Skip the bucket usage cache, as it is autogenerated.
  1144  						continue
  1145  					}
  1146  
  1147  					versionsFound++
  1148  				}
  1149  			})
  1150  			if err != nil {
  1151  				return err
  1152  			}
  1153  
  1154  			if versionsFound > 0 {
  1155  				return fmt.Errorf("at least %d object(s)/version(s) were found in bucket `%s` after decommissioning", versionsFound, bi.Name)
  1156  			}
  1157  		}
  1158  	}
  1159  
  1160  	return nil
  1161  }
  1162  
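         // doDecommissionInRoutine runs the decommission of a single pool to
         // completion: it performs the background copy, then marks the pool as
         // failed if any item failed (or the context was canceled), otherwise
         // verifies no objects are left behind and marks the decommission complete.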
  1163  func (z *erasureServerPools) doDecommissionInRoutine(ctx context.Context, idx int) {
  1164  	z.poolMetaMutex.Lock()
  1165  	var dctx context.Context
  1166  	dctx, z.decommissionCancelers[idx] = context.WithCancel(GlobalContext)
  1167  	z.poolMetaMutex.Unlock()
  1168  
  1169  	// Generate an empty request info so it can be directly modified later by audit
  1170  	dctx = logger.SetReqInfo(dctx, &logger.ReqInfo{})
  1171  
  1172  	if err := z.decommissionInBackground(dctx, idx); err != nil {
  1173  		logger.LogIf(GlobalContext, err)
  1174  		logger.LogIf(GlobalContext, z.DecommissionFailed(dctx, idx))
  1175  		return
  1176  	}
  1177  
  1178  	z.poolMetaMutex.Lock()
  1179  	failed := z.poolMeta.Pools[idx].Decommission.ItemsDecommissionFailed > 0 || contextCanceled(dctx)
  1180  	poolCmdLine := z.poolMeta.Pools[idx].CmdLine
  1181  	z.poolMetaMutex.Unlock()
  1182  
  1183  	if !failed {
  1184  		logger.Event(dctx, "Decommissioning complete for pool '%s', verifying for any pending objects", poolCmdLine)
  1185  		err := z.checkAfterDecom(dctx, idx)
  1186  		if err != nil {
  1187  			logger.LogIf(ctx, err)
  1188  			failed = true
  1189  		}
  1190  	}
  1191  
  1192  	if failed {
  1193  		// Decommission failed indicate as such.
  1194  		logger.LogIf(GlobalContext, z.DecommissionFailed(dctx, idx))
  1195  	} else {
   1196  		// Complete the decommission.
  1197  		logger.LogIf(GlobalContext, z.CompleteDecommission(dctx, idx))
  1198  	}
  1199  }
  1200  
  1201  func (z *erasureServerPools) IsSuspended(idx int) bool {
  1202  	z.poolMetaMutex.RLock()
  1203  	defer z.poolMetaMutex.RUnlock()
  1204  	return z.poolMeta.IsSuspended(idx)
  1205  }
  1206  
  1207  // Decommission - start decommission session.
  1208  func (z *erasureServerPools) Decommission(ctx context.Context, indices ...int) error {
  1209  	if len(indices) == 0 {
  1210  		return errInvalidArgument
  1211  	}
  1212  
  1213  	if z.SinglePool() {
  1214  		return errInvalidArgument
  1215  	}
  1216  
  1217  	// Make pool unwritable before decommissioning.
  1218  	if err := z.StartDecommission(ctx, indices...); err != nil {
  1219  		return err
  1220  	}
  1221  
  1222  	go func() {
  1223  		for _, idx := range indices {
  1224  			// decommission all pools serially one after
  1225  			// the other.
  1226  			z.doDecommissionInRoutine(ctx, idx)
  1227  		}
  1228  	}()
  1229  
  1230  	// Successfully started decommissioning.
  1231  	return nil
  1232  }
  1233  
  1234  type decomError struct {
  1235  	Err string
  1236  }
  1237  
  1238  func (d decomError) Error() string {
  1239  	return d.Err
  1240  }
  1241  
  1242  type poolSpaceInfo struct {
  1243  	Free  int64
  1244  	Total int64
  1245  	Used  int64
  1246  }
  1247  
  1248  func (z *erasureServerPools) getDecommissionPoolSpaceInfo(idx int) (pi poolSpaceInfo, err error) {
  1249  	if idx < 0 {
  1250  		return pi, errInvalidArgument
  1251  	}
  1252  	if idx+1 > len(z.serverPools) {
  1253  		return pi, errInvalidArgument
  1254  	}
  1255  
  1256  	info := z.serverPools[idx].StorageInfo(context.Background())
  1257  	info.Backend = z.BackendInfo()
  1258  
  1259  	usableTotal := int64(GetTotalUsableCapacity(info.Disks, info))
  1260  	usableFree := int64(GetTotalUsableCapacityFree(info.Disks, info))
  1261  	return poolSpaceInfo{
  1262  		Total: usableTotal,
  1263  		Free:  usableFree,
  1264  		Used:  usableTotal - usableFree,
  1265  	}, nil
  1266  }
  1267  
  1268  func (z *erasureServerPools) Status(ctx context.Context, idx int) (PoolStatus, error) {
  1269  	if idx < 0 {
  1270  		return PoolStatus{}, errInvalidArgument
  1271  	}
  1272  
  1273  	pi, err := z.getDecommissionPoolSpaceInfo(idx)
  1274  	if err != nil {
  1275  		return PoolStatus{}, err
  1276  	}
  1277  
  1278  	z.poolMetaMutex.RLock()
  1279  	defer z.poolMetaMutex.RUnlock()
  1280  
  1281  	poolInfo := z.poolMeta.Pools[idx].Clone()
  1282  	if poolInfo.Decommission != nil {
  1283  		poolInfo.Decommission.TotalSize = pi.Total
  1284  		if poolInfo.Decommission.Failed || poolInfo.Decommission.Canceled {
  1285  			poolInfo.Decommission.CurrentSize = pi.Free
  1286  		} else {
  1287  			poolInfo.Decommission.CurrentSize = poolInfo.Decommission.StartSize + poolInfo.Decommission.BytesDone
  1288  		}
  1289  	} else {
  1290  		poolInfo.Decommission = &PoolDecommissionInfo{
  1291  			TotalSize:   pi.Total,
  1292  			CurrentSize: pi.Free,
  1293  		}
  1294  	}
  1295  	return poolInfo, nil
  1296  }
  1297  
  1298  func (z *erasureServerPools) ReloadPoolMeta(ctx context.Context) (err error) {
  1299  	meta := poolMeta{}
  1300  
  1301  	if err = meta.load(ctx, z.serverPools[0], z.serverPools); err != nil {
  1302  		return err
  1303  	}
  1304  
  1305  	z.poolMetaMutex.Lock()
  1306  	defer z.poolMetaMutex.Unlock()
  1307  
  1308  	z.poolMeta = meta
  1309  	return nil
  1310  }
  1311  
  1312  func (z *erasureServerPools) DecommissionCancel(ctx context.Context, idx int) (err error) {
  1313  	if idx < 0 {
  1314  		return errInvalidArgument
  1315  	}
  1316  
  1317  	if z.SinglePool() {
  1318  		return errInvalidArgument
  1319  	}
  1320  
  1321  	z.poolMetaMutex.Lock()
  1322  	defer z.poolMetaMutex.Unlock()
  1323  
  1324  	fn := z.decommissionCancelers[idx]
  1325  	if fn == nil {
   1326  		// Canceling a decommission before it has started returns an error.
  1327  		return errDecommissionNotStarted
  1328  	}
  1329  
  1330  	defer fn() // cancel any active thread.
  1331  
  1332  	if z.poolMeta.DecommissionCancel(idx) {
  1333  		if err = z.poolMeta.save(ctx, z.serverPools); err != nil {
  1334  			return err
  1335  		}
  1336  		globalNotificationSys.ReloadPoolMeta(ctx)
  1337  	}
  1338  
  1339  	return nil
  1340  }
  1341  
  1342  func (z *erasureServerPools) DecommissionFailed(ctx context.Context, idx int) (err error) {
  1343  	if idx < 0 {
  1344  		return errInvalidArgument
  1345  	}
  1346  
  1347  	if z.SinglePool() {
  1348  		return errInvalidArgument
  1349  	}
  1350  
  1351  	z.poolMetaMutex.Lock()
  1352  	defer z.poolMetaMutex.Unlock()
  1353  
  1354  	if z.poolMeta.DecommissionFailed(idx) {
  1355  		if fn := z.decommissionCancelers[idx]; fn != nil {
  1356  			defer fn()
  1357  		} // cancel any active thread.
  1358  
  1359  		if err = z.poolMeta.save(ctx, z.serverPools); err != nil {
  1360  			return err
  1361  		}
  1362  		globalNotificationSys.ReloadPoolMeta(ctx)
  1363  	}
  1364  	return nil
  1365  }
  1366  
  1367  func (z *erasureServerPools) CompleteDecommission(ctx context.Context, idx int) (err error) {
  1368  	if idx < 0 {
  1369  		return errInvalidArgument
  1370  	}
  1371  
  1372  	if z.SinglePool() {
  1373  		return errInvalidArgument
  1374  	}
  1375  
  1376  	z.poolMetaMutex.Lock()
  1377  	defer z.poolMetaMutex.Unlock()
  1378  
  1379  	if z.poolMeta.DecommissionComplete(idx) {
  1380  		if fn := z.decommissionCancelers[idx]; fn != nil {
  1381  			defer fn()
  1382  		} // cancel any active thread.
  1383  
  1384  		if err = z.poolMeta.save(ctx, z.serverPools); err != nil {
  1385  			return err
  1386  		}
  1387  		globalNotificationSys.ReloadPoolMeta(ctx)
  1388  	}
  1389  	return nil
  1390  }
  1391  
  1392  func (z *erasureServerPools) getBucketsToDecommission(ctx context.Context) ([]decomBucketInfo, error) {
  1393  	buckets, err := z.ListBuckets(ctx, BucketOptions{})
  1394  	if err != nil {
  1395  		return nil, err
  1396  	}
  1397  
  1398  	decomBuckets := make([]decomBucketInfo, len(buckets))
  1399  	for i := range buckets {
  1400  		decomBuckets[i] = decomBucketInfo{
  1401  			Name: buckets[i].Name,
  1402  		}
  1403  	}
  1404  
   1405  	// Bucket data is dispersed across multiple zones/sets; make
   1406  	// sure to decommission the necessary metadata as well.
  1407  	decomBuckets = append(decomBuckets, decomBucketInfo{
  1408  		Name:   minioMetaBucket,
  1409  		Prefix: minioConfigPrefix,
  1410  	})
  1411  	decomBuckets = append(decomBuckets, decomBucketInfo{
  1412  		Name:   minioMetaBucket,
  1413  		Prefix: bucketMetaPrefix,
  1414  	})
  1415  
  1416  	return decomBuckets, nil
  1417  }
  1418  
  1419  func (z *erasureServerPools) StartDecommission(ctx context.Context, indices ...int) (err error) {
  1420  	if len(indices) == 0 {
  1421  		return errInvalidArgument
  1422  	}
  1423  
  1424  	if z.SinglePool() {
  1425  		return errInvalidArgument
  1426  	}
  1427  
  1428  	decomBuckets, err := z.getBucketsToDecommission(ctx)
  1429  	if err != nil {
  1430  		return err
  1431  	}
  1432  
  1433  	for _, bucket := range decomBuckets {
  1434  		z.HealBucket(ctx, bucket.Name, madmin.HealOpts{})
  1435  	}
  1436  
   1437  	// Create .minio.sys/config, .minio.sys/buckets paths if missing;
   1438  	// this code is present to avoid any missing meta buckets on other
   1439  	// pools.
  1440  	for _, metaBucket := range []string{
  1441  		pathJoin(minioMetaBucket, minioConfigPrefix),
  1442  		pathJoin(minioMetaBucket, bucketMetaPrefix),
  1443  	} {
  1444  		var bucketExists BucketExists
  1445  		if err = z.MakeBucket(ctx, metaBucket, MakeBucketOptions{}); err != nil {
  1446  			if !errors.As(err, &bucketExists) {
  1447  				return err
  1448  			}
  1449  		}
  1450  	}
  1451  
  1452  	z.poolMetaMutex.Lock()
  1453  	defer z.poolMetaMutex.Unlock()
  1454  
  1455  	for _, idx := range indices {
  1456  		pi, err := z.getDecommissionPoolSpaceInfo(idx)
  1457  		if err != nil {
  1458  			return err
  1459  		}
  1460  
  1461  		if err = z.poolMeta.Decommission(idx, pi); err != nil {
  1462  			return err
  1463  		}
  1464  
  1465  		z.poolMeta.QueueBuckets(idx, decomBuckets)
  1466  	}
  1467  
  1468  	if err = z.poolMeta.save(ctx, z.serverPools); err != nil {
  1469  		return err
  1470  	}
  1471  
  1472  	globalNotificationSys.ReloadPoolMeta(ctx)
  1473  
  1474  	return nil
  1475  }
  1476  
  1477  func auditLogDecom(ctx context.Context, apiName, bucket, object, versionID string, err error) {
  1478  	errStr := ""
  1479  	if err != nil {
  1480  		errStr = err.Error()
  1481  	}
  1482  	auditLogInternal(ctx, AuditLogOptions{
  1483  		Event:     "decommission",
  1484  		APIName:   apiName,
  1485  		Bucket:    bucket,
  1486  		Object:    object,
  1487  		VersionID: versionID,
  1488  		Error:     errStr,
  1489  	})
  1490  }