github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/metacache-set.go

     1  // Copyright (c) 2015-2021 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package cmd
    19  
    20  import (
    21  	"bytes"
    22  	"context"
    23  	"encoding/gob"
    24  	"encoding/json"
    25  	"errors"
    26  	"fmt"
    27  	"io"
    28  	"math/rand"
    29  	"strconv"
    30  	"strings"
    31  	"sync"
    32  	"time"
    33  
    34  	jsoniter "github.com/json-iterator/go"
    35  	"github.com/minio/minio/internal/bucket/lifecycle"
    36  	"github.com/minio/minio/internal/bucket/object/lock"
    37  	"github.com/minio/minio/internal/bucket/versioning"
    38  	"github.com/minio/minio/internal/color"
    39  	"github.com/minio/minio/internal/hash"
    40  	xioutil "github.com/minio/minio/internal/ioutil"
    41  	"github.com/minio/minio/internal/logger"
    42  	"github.com/minio/pkg/v2/console"
    43  )
    44  
    45  //go:generate msgp -file $GOFILE -unexported
    46  
    47  type listPathOptions struct {
    48  	// ID of the listing.
    49  	// This will be used to persist the list.
    50  	ID string
    51  
    52  	// Bucket of the listing.
    53  	Bucket string
    54  
    55  	// Directory inside the bucket.
     56  	// When unset, listPath will set this based on Prefix.
    57  	BaseDir string
    58  
    59  	// Scan/return only content with prefix.
    60  	Prefix string
    61  
    62  	// FilterPrefix will return only results with this prefix when scanning.
    63  	// Should never contain a slash.
    64  	// Prefix should still be set.
    65  	FilterPrefix string
    66  
    67  	// Marker to resume listing.
    68  	// The response will be the first entry >= this object name.
    69  	Marker string
    70  
    71  	// Limit the number of results.
    72  	Limit int
    73  
    74  	// The number of disks to ask.
    75  	AskDisks string
    76  
    77  	// InclDeleted will keep all entries where latest version is a delete marker.
    78  	InclDeleted bool
    79  
    80  	// Scan recursively.
    81  	// If false only main directory will be scanned.
     82  	// Should always be true if Separator is not SlashSeparator.
    83  	Recursive bool
    84  
    85  	// Separator to use.
    86  	Separator string
    87  
    88  	// Create indicates that the lister should not attempt to load an existing cache.
    89  	Create bool
    90  
    91  	// Include pure directories.
    92  	IncludeDirectories bool
    93  
    94  	// Transient is set if the cache is transient due to an error or being a reserved bucket.
    95  	// This means the cache metadata will not be persisted on disk.
    96  	// A transient result will never be returned from the cache so knowing the list id is required.
    97  	Transient bool
    98  
     99  	// Versioned is set if this is a ListObjectVersions call.
   100  	Versioned bool
   101  	// V1 listing type
   102  	V1 bool
   103  
    104  	// Versioning config is used to check whether the path
    105  	// has versioning enabled.
   106  	Versioning *versioning.Versioning `msg:"-"`
   107  
   108  	// Lifecycle performs filtering based on lifecycle.
   109  	// This will filter out objects if the most recent version should be deleted by lifecycle.
   110  	// Is not transferred across request calls.
   111  	Lifecycle *lifecycle.Lifecycle `msg:"-"`
   112  
   113  	// Retention configuration, needed to be passed along with lifecycle if set.
   114  	Retention lock.Retention `msg:"-"`
   115  
   116  	// Replication configuration
   117  	Replication replicationConfig `msg:"-"`
   118  
    119  	// StopDiskAtLimit will stop listing on each disk when the limit number of objects has been returned.
   120  	StopDiskAtLimit bool
   121  
    122  	// pool and set where the cache is located.
   123  	pool, set int
   124  }
   125  
   126  func init() {
   127  	gob.Register(listPathOptions{})
   128  }
   129  
   130  func (o *listPathOptions) setBucketMeta(ctx context.Context) {
   131  	lc, _ := globalLifecycleSys.Get(o.Bucket)
   132  	vc, _ := globalBucketVersioningSys.Get(o.Bucket)
   133  
   134  	// Check if bucket is object locked.
   135  	rcfg, _ := globalBucketObjectLockSys.Get(o.Bucket)
   136  	replCfg, _, _ := globalBucketMetadataSys.GetReplicationConfig(ctx, o.Bucket)
   137  	tgts, _ := globalBucketTargetSys.ListBucketTargets(ctx, o.Bucket)
   138  	o.Lifecycle = lc
   139  	o.Versioning = vc
   140  	o.Replication = replicationConfig{
   141  		Config:  replCfg,
   142  		remotes: tgts,
   143  	}
   144  	o.Retention = rcfg
   145  }
   146  
   147  // newMetacache constructs a new metacache from the options.
   148  func (o listPathOptions) newMetacache() metacache {
   149  	return metacache{
   150  		id:          o.ID,
   151  		bucket:      o.Bucket,
   152  		root:        o.BaseDir,
   153  		recursive:   o.Recursive,
   154  		status:      scanStateStarted,
   155  		error:       "",
   156  		started:     UTCNow(),
   157  		lastHandout: UTCNow(),
   158  		lastUpdate:  UTCNow(),
   159  		ended:       time.Time{},
   160  		dataVersion: metacacheStreamVersion,
   161  		filter:      o.FilterPrefix,
   162  	}
   163  }
   164  
   165  func (o *listPathOptions) debugf(format string, data ...interface{}) {
   166  	if serverDebugLog {
   167  		console.Debugf(format+"\n", data...)
   168  	}
   169  }
   170  
   171  func (o *listPathOptions) debugln(data ...interface{}) {
   172  	if serverDebugLog {
   173  		console.Debugln(data...)
   174  	}
   175  }
   176  
   177  // gatherResults will collect all results on the input channel and filter results according
   178  // to the options or to the current bucket ILM expiry rules.
   179  // Caller should close the channel when done.
   180  // The returned function will return the results once there is enough or input is closed,
   181  // or the context is canceled.
   182  func (o *listPathOptions) gatherResults(ctx context.Context, in <-chan metaCacheEntry) func() (metaCacheEntriesSorted, error) {
   183  	resultsDone := make(chan metaCacheEntriesSorted)
   184  	// Copy so we can mutate
   185  	resCh := resultsDone
   186  	var done bool
   187  	var mu sync.Mutex
   188  	resErr := io.EOF
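         	// resErr accompanies the returned results: io.EOF means the input was
         	// fully drained; it is cleared to nil below when the limit is reached
         	// while more entries remain.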
   189  
   190  	go func() {
   191  		var results metaCacheEntriesSorted
   192  		var returned bool
   193  		for entry := range in {
   194  			if returned {
   195  				// past limit
   196  				continue
   197  			}
   198  			mu.Lock()
   199  			returned = done
   200  			mu.Unlock()
   201  			if returned {
   202  				resCh = nil
   203  				continue
   204  			}
   205  			if !o.IncludeDirectories && (entry.isDir() || (!o.Versioned && entry.isObjectDir() && entry.isLatestDeletemarker())) {
   206  				continue
   207  			}
   208  			if o.Marker != "" && entry.name < o.Marker {
   209  				continue
   210  			}
   211  			if !strings.HasPrefix(entry.name, o.Prefix) {
   212  				continue
   213  			}
   214  			if !o.Recursive && !entry.isInDir(o.Prefix, o.Separator) {
   215  				continue
   216  			}
   217  			if !o.InclDeleted && entry.isObject() && entry.isLatestDeletemarker() && !entry.isObjectDir() {
   218  				continue
   219  			}
   220  			if o.Lifecycle != nil || o.Replication.Config != nil {
    221  			if skipped := triggerExpiryAndRepl(ctx, *o, entry); skipped {
   222  					results.lastSkippedEntry = entry.name
   223  					continue
   224  				}
   225  			}
   226  			if o.Limit > 0 && results.len() >= o.Limit {
   227  				// We have enough and we have more.
   228  				// Do not return io.EOF
   229  				if resCh != nil {
   230  					resErr = nil
   231  					select {
   232  					case resCh <- results:
   233  					case <-ctx.Done():
   234  					}
   235  					resCh = nil
   236  					returned = true
   237  				}
   238  				continue
   239  			}
   240  			results.o = append(results.o, entry)
   241  		}
   242  		if resCh != nil {
   243  			resErr = io.EOF
   244  			select {
   245  			case <-ctx.Done():
   246  				// Nobody wants it.
   247  			case resCh <- results:
   248  			}
   249  		}
   250  	}()
   251  	return func() (metaCacheEntriesSorted, error) {
   252  		select {
   253  		case <-ctx.Done():
   254  			mu.Lock()
   255  			done = true
   256  			mu.Unlock()
   257  			return metaCacheEntriesSorted{}, ctx.Err()
   258  		case r := <-resultsDone:
   259  			return r, resErr
   260  		}
   261  	}
   262  }
   263  
    264  // findFirstPart will find the part (0 being the first) that corresponds to the marker in the options.
   265  // io.ErrUnexpectedEOF is returned if the place containing the marker hasn't been scanned yet.
   266  // io.EOF indicates the marker is beyond the end of the stream and does not exist.
   267  func (o *listPathOptions) findFirstPart(fi FileInfo) (int, error) {
   268  	search := o.Marker
   269  	if search == "" {
   270  		search = o.Prefix
   271  	}
   272  	if search == "" {
   273  		return 0, nil
   274  	}
   275  	o.debugln("searching for ", search)
   276  	var tmp metacacheBlock
   277  	json := jsoniter.ConfigCompatibleWithStandardLibrary
   278  	i := 0
   279  	for {
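         		// Each part records the first and last entry it contains in the
         		// metadata of the first block; scan forward until a part covers
         		// the search key.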
   280  		partKey := fmt.Sprintf("%s-metacache-part-%d", ReservedMetadataPrefixLower, i)
   281  		v, ok := fi.Metadata[partKey]
   282  		if !ok {
   283  			o.debugln("no match in metadata, waiting")
   284  			return -1, io.ErrUnexpectedEOF
   285  		}
   286  		err := json.Unmarshal([]byte(v), &tmp)
    287  		if err != nil {
   288  			logger.LogIf(context.Background(), err)
   289  			return -1, err
   290  		}
   291  		if tmp.First == "" && tmp.Last == "" && tmp.EOS {
   292  			return 0, errFileNotFound
   293  		}
   294  		if tmp.First >= search {
   295  			o.debugln("First >= search", v)
   296  			return i, nil
   297  		}
   298  		if tmp.Last >= search {
   299  			o.debugln("Last >= search", v)
   300  			return i, nil
   301  		}
   302  		if tmp.EOS {
   303  			o.debugln("no match, at EOS", v)
   304  			return -3, io.EOF
   305  		}
   306  		o.debugln("First ", tmp.First, "<", search, " search", i)
   307  		i++
   308  	}
   309  }
   310  
   311  // updateMetacacheListing will update the metacache listing.
   312  func (o *listPathOptions) updateMetacacheListing(m metacache, rpc *peerRESTClient) (metacache, error) {
   313  	if rpc == nil {
   314  		return localMetacacheMgr.updateCacheEntry(m)
   315  	}
   316  	return rpc.UpdateMetacacheListing(context.Background(), m)
   317  }
   318  
   319  func getMetacacheBlockInfo(fi FileInfo, block int) (*metacacheBlock, error) {
   320  	var tmp metacacheBlock
   321  	partKey := fmt.Sprintf("%s-metacache-part-%d", ReservedMetadataPrefixLower, block)
   322  	v, ok := fi.Metadata[partKey]
   323  	if !ok {
   324  		return nil, io.ErrUnexpectedEOF
   325  	}
   326  	return &tmp, json.Unmarshal([]byte(v), &tmp)
   327  }
   328  
   329  const metacachePrefix = ".metacache"
   330  
   331  func metacachePrefixForID(bucket, id string) string {
   332  	return pathJoin(bucketMetaPrefix, bucket, metacachePrefix, id)
   333  }
   334  
   335  // objectPath returns the object path of the cache.
   336  func (o *listPathOptions) objectPath(block int) string {
   337  	return pathJoin(metacachePrefixForID(o.Bucket, o.ID), "block-"+strconv.Itoa(block)+".s2")
   338  }
   339  
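         // SetFilter sets FilterPrefix to the portion of Prefix below BaseDir so
         // that drive walks can skip unrelated entries. It is left empty when
         // caches are shared across prefixes or when the remainder would still
         // contain a slash.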
   340  func (o *listPathOptions) SetFilter() {
   341  	switch {
   342  	case metacacheSharePrefix:
   343  		return
   344  	case o.Prefix == o.BaseDir:
   345  		// No additional prefix
   346  		return
   347  	}
   348  	// Remove basedir.
   349  	o.FilterPrefix = strings.TrimPrefix(o.Prefix, o.BaseDir)
   350  	// Remove leading and trailing slashes.
   351  	o.FilterPrefix = strings.Trim(o.FilterPrefix, slashSeparator)
   352  
   353  	if strings.Contains(o.FilterPrefix, slashSeparator) {
   354  		// Sanity check, should not happen.
   355  		o.FilterPrefix = ""
   356  	}
   357  }
   358  
   359  // filter will apply the options and return the number of objects requested by the limit.
   360  // Will return io.EOF if there are no more entries with the same filter.
   361  // The last entry can be used as a marker to resume the listing.
   362  func (r *metacacheReader) filter(o listPathOptions) (entries metaCacheEntriesSorted, err error) {
   363  	// Forward to prefix, if any
   364  	err = r.forwardTo(o.Prefix)
   365  	if err != nil {
   366  		return entries, err
   367  	}
   368  	if o.Marker != "" {
   369  		err = r.forwardTo(o.Marker)
   370  		if err != nil {
   371  			return entries, err
   372  		}
   373  	}
   374  	o.debugln("forwarded to ", o.Prefix, "marker:", o.Marker, "sep:", o.Separator)
   375  
   376  	// Filter
   377  	if !o.Recursive {
   378  		entries.o = make(metaCacheEntries, 0, o.Limit)
   379  		pastPrefix := false
   380  		err := r.readFn(func(entry metaCacheEntry) bool {
   381  			if o.Prefix != "" && !strings.HasPrefix(entry.name, o.Prefix) {
   382  				// We are past the prefix, don't continue.
   383  				pastPrefix = true
   384  				return false
   385  			}
   386  			if !o.IncludeDirectories && (entry.isDir() || (!o.Versioned && entry.isObjectDir() && entry.isLatestDeletemarker())) {
   387  				return true
   388  			}
   389  			if !entry.isInDir(o.Prefix, o.Separator) {
   390  				return true
   391  			}
   392  			if !o.InclDeleted && entry.isObject() && entry.isLatestDeletemarker() && !entry.isObjectDir() {
   393  				return true
   394  			}
   395  			if entry.isAllFreeVersions() {
   396  				return true
   397  			}
   398  			entries.o = append(entries.o, entry)
   399  			return entries.len() < o.Limit
   400  		})
   401  		if (err != nil && errors.Is(err, io.EOF)) || pastPrefix || r.nextEOF() {
   402  			return entries, io.EOF
   403  		}
   404  		return entries, err
   405  	}
   406  
   407  	// We should not need to filter more.
   408  	return r.readN(o.Limit, o.InclDeleted, o.IncludeDirectories, o.Versioned, o.Prefix)
   409  }
   410  
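         // streamMetadataParts reads back the listing blocks written by
         // saveMetaCacheStream, part by part, retrying while the lister is still
         // producing them, and returns entries filtered by the listing options.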
   411  func (er *erasureObjects) streamMetadataParts(ctx context.Context, o listPathOptions) (entries metaCacheEntriesSorted, err error) {
   412  	retries := 0
   413  	rpc := globalNotificationSys.restClientFromHash(pathJoin(o.Bucket, o.Prefix))
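         	// The peer consulted for cache state is chosen by hashing bucket and
         	// prefix, so every node should resolve the same listing to the same
         	// peer.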
   414  
   415  	const (
   416  		retryDelay    = 50 * time.Millisecond
   417  		retryDelay250 = 250 * time.Millisecond
   418  	)
   419  
   420  	for {
   421  		if contextCanceled(ctx) {
   422  			return entries, ctx.Err()
   423  		}
   424  
   425  		// If many failures, check the cache state.
   426  		if retries > 10 {
   427  			err := o.checkMetacacheState(ctx, rpc)
   428  			if err != nil {
   429  				return entries, fmt.Errorf("remote listing canceled: %w", err)
   430  			}
   431  			retries = 1
   432  		}
   433  
   434  		// All operations are performed without locks, so we must be careful and allow for failures.
   435  		// Read metadata associated with the object from a disk.
   436  		if retries > 0 {
   437  			for _, disk := range er.getDisks() {
   438  				if disk == nil {
   439  					continue
   440  				}
   441  				if !disk.IsOnline() {
   442  					continue
   443  				}
   444  				_, err := disk.ReadVersion(ctx, "", minioMetaBucket,
   445  					o.objectPath(0), "", ReadOptions{})
   446  				if err != nil {
   447  					time.Sleep(retryDelay250)
   448  					retries++
   449  					continue
   450  				}
   451  				break
   452  			}
   453  		}
   454  		retryWait := func() {
   455  			retries++
   456  			if retries == 1 {
   457  				time.Sleep(retryDelay)
   458  			} else {
   459  				time.Sleep(retryDelay250)
   460  			}
   461  		}
   462  		// Load first part metadata...
   463  		// Read metadata associated with the object from all disks.
   464  		fi, metaArr, onlineDisks, err := er.getObjectFileInfo(ctx, minioMetaBucket, o.objectPath(0), ObjectOptions{}, true)
   465  		if err != nil {
   466  			switch toObjectErr(err, minioMetaBucket, o.objectPath(0)).(type) {
   467  			case ObjectNotFound, InsufficientReadQuorum:
   468  				retryWait()
   469  				continue
   470  			}
   471  			// Allow one fast retry for other errors.
   472  			if retries > 0 {
   473  				return entries, fmt.Errorf("reading first part metadata: %v", err)
   474  			}
   475  			retryWait()
   476  			continue
   477  		}
   478  
   479  		partN, err := o.findFirstPart(fi)
   480  		switch {
   481  		case err == nil:
   482  		case errors.Is(err, io.ErrUnexpectedEOF):
   483  			if retries == 10 {
   484  				err := o.checkMetacacheState(ctx, rpc)
   485  				if err != nil {
   486  					return entries, fmt.Errorf("remote listing canceled: %w", err)
   487  				}
   488  				retries = -1
   489  			}
   490  			retryWait()
   491  			continue
   492  		case errors.Is(err, io.EOF):
   493  			return entries, io.EOF
   494  		}
   495  
   496  		// We got a stream to start at.
   497  		loadedPart := 0
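         		// Stream parts sequentially from partN; each part is stored as a
         		// separate block object and filtered through the listing options.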
   498  		for {
   499  			if contextCanceled(ctx) {
   500  				return entries, ctx.Err()
   501  			}
   502  
   503  			if partN != loadedPart {
   504  				if retries > 10 {
   505  					err := o.checkMetacacheState(ctx, rpc)
   506  					if err != nil {
   507  						return entries, fmt.Errorf("waiting for next part %d: %w", partN, err)
   508  					}
   509  					retries = 1
   510  				}
   511  
   512  				if retries > 0 {
   513  					// Load from one disk only
   514  					for _, disk := range er.getDisks() {
   515  						if disk == nil {
   516  							continue
   517  						}
   518  						if !disk.IsOnline() {
   519  							continue
   520  						}
   521  						_, err := disk.ReadVersion(ctx, "", minioMetaBucket,
   522  							o.objectPath(partN), "", ReadOptions{})
   523  						if err != nil {
   524  							time.Sleep(retryDelay250)
   525  							retries++
   526  							continue
   527  						}
   528  						break
   529  					}
   530  				}
   531  
   532  				// Load partN metadata...
   533  				fi, metaArr, onlineDisks, err = er.getObjectFileInfo(ctx, minioMetaBucket, o.objectPath(partN), ObjectOptions{}, true)
   534  				if err != nil {
   535  					time.Sleep(retryDelay250)
   536  					retries++
   537  					continue
   538  				}
   539  				loadedPart = partN
   540  				bi, err := getMetacacheBlockInfo(fi, partN)
   541  				logger.LogIf(ctx, err)
   542  				if err == nil {
   543  					if bi.pastPrefix(o.Prefix) {
   544  						return entries, io.EOF
   545  					}
   546  				}
   547  			}
   548  
   549  			pr, pw := io.Pipe()
   550  			go func() {
   551  				werr := er.getObjectWithFileInfo(ctx, minioMetaBucket, o.objectPath(partN), 0,
   552  					fi.Size, pw, fi, metaArr, onlineDisks)
   553  				pw.CloseWithError(werr)
   554  			}()
   555  
   556  			tmp := newMetacacheReader(pr)
   557  			e, err := tmp.filter(o)
   558  			pr.CloseWithError(err)
   559  			tmp.Close()
   560  			entries.o = append(entries.o, e.o...)
   561  			if o.Limit > 0 && entries.len() > o.Limit {
   562  				entries.truncate(o.Limit)
   563  				return entries, nil
   564  			}
   565  			if err == nil {
   566  				// We stopped within the listing, we are done for now...
   567  				return entries, nil
   568  			}
   569  			if err != nil && !errors.Is(err, io.EOF) {
   570  				switch toObjectErr(err, minioMetaBucket, o.objectPath(partN)).(type) {
   571  				case ObjectNotFound:
   572  					retries++
   573  					time.Sleep(retryDelay250)
   574  					continue
   575  				case InsufficientReadQuorum:
   576  					retries++
   577  					time.Sleep(retryDelay250)
   578  					continue
   579  				default:
   580  					logger.LogIf(ctx, err)
   581  					return entries, err
   582  				}
   583  			}
   584  
   585  			// We finished at the end of the block.
   586  			// And should not expect any more results.
   587  			bi, err := getMetacacheBlockInfo(fi, partN)
   588  			logger.LogIf(ctx, err)
   589  			if err != nil || bi.EOS {
   590  				// We are done and there are no more parts.
   591  				return entries, io.EOF
   592  			}
   593  			if bi.endedPrefix(o.Prefix) {
   594  				// Nothing more for prefix.
   595  				return entries, io.EOF
   596  			}
   597  			partN++
   598  			retries = 0
   599  		}
   600  	}
   601  }
   602  
    603  // getListQuorum interprets list quorum values and returns the
    604  // acceptable quorum expected for list operations.
   605  func getListQuorum(quorum string, driveCount int) int {
   606  	switch quorum {
   607  	case "disk":
   608  		return 1
   609  	case "reduced":
   610  		return 2
   611  	case "optimal":
   612  		return (driveCount + 1) / 2
   613  	case "auto":
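         		// -1 is a sentinel; listPath resolves it by comparing drive
         		// mutation counters (see getQuorumDisks).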
   614  		return -1
   615  	}
   616  	// defaults to 'strict'
   617  	return driveCount
   618  }
   619  
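         // calcCommonWritesDeletes returns the write and delete counters shared by
         // at least readQuorum drives; a counter is reported as 0 when no value
         // reaches that quorum.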
   620  func calcCommonWritesDeletes(infos []DiskInfo, readQuorum int) (commonWrite, commonDelete uint64) {
   621  	deletes := make([]uint64, len(infos))
   622  	writes := make([]uint64, len(infos))
   623  	for index, di := range infos {
   624  		deletes[index] = di.Metrics.TotalDeletes
   625  		writes[index] = di.Metrics.TotalWrites
   626  	}
   627  
   628  	filter := func(list []uint64) (commonCount uint64) {
   629  		max := 0
   630  		signatureMap := map[uint64]int{}
   631  		for _, v := range list {
   632  			signatureMap[v]++
   633  		}
   634  		for ops, count := range signatureMap {
   635  			if max < count && commonCount < ops {
   636  				max = count
   637  				commonCount = ops
   638  			}
   639  		}
   640  		if max < readQuorum {
   641  			return 0
   642  		}
   643  		return commonCount
   644  	}
   645  
   646  	commonWrite = filter(writes)
   647  	commonDelete = filter(deletes)
   648  	return
   649  }
   650  
   651  func calcCommonCounter(infos []DiskInfo, readQuorum int) (commonCount uint64) {
   652  	filter := func() (commonCount uint64) {
   653  		max := 0
   654  		signatureMap := map[uint64]int{}
   655  		for _, info := range infos {
   656  			if info.Error != "" {
   657  				continue
   658  			}
   659  			mutations := info.Metrics.TotalDeletes + info.Metrics.TotalWrites
   660  			signatureMap[mutations]++
   661  		}
   662  		for ops, count := range signatureMap {
   663  			if max < count && commonCount < ops {
   664  				max = count
   665  				commonCount = ops
   666  			}
   667  		}
   668  		if max < readQuorum {
   669  			return 0
   670  		}
   671  		return commonCount
   672  	}
   673  
   674  	return filter()
   675  }
   676  
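         // getQuorumDiskInfos returns the disks whose combined write+delete counter
         // is at least the common value agreed on by a read quorum of drives
         // (see calcCommonCounter).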
   677  func getQuorumDiskInfos(disks []StorageAPI, infos []DiskInfo, readQuorum int) (newDisks []StorageAPI, newInfos []DiskInfo) {
   678  	commonMutations := calcCommonCounter(infos, readQuorum)
   679  	for i, info := range infos {
   680  		mutations := info.Metrics.TotalDeletes + info.Metrics.TotalWrites
   681  		if mutations >= commonMutations {
   682  			newDisks = append(newDisks, disks[i])
   683  			newInfos = append(newInfos, infos[i])
   684  		}
   685  	}
   686  
   687  	return newDisks, newInfos
   688  }
   689  
   690  func getQuorumDisks(disks []StorageAPI, infos []DiskInfo, readQuorum int) (newDisks []StorageAPI) {
   691  	newDisks, _ = getQuorumDiskInfos(disks, infos, readQuorum)
   692  	return newDisks
   693  }
   694  
   695  // Will return io.EOF if continuing would not yield more results.
   696  func (er *erasureObjects) listPath(ctx context.Context, o listPathOptions, results chan<- metaCacheEntry) (err error) {
   697  	defer xioutil.SafeClose(results)
   698  	o.debugf(color.Green("listPath:")+" with options: %#v", o)
   699  
   700  	// get prioritized non-healing disks for listing
   701  	disks, infos, _ := er.getOnlineDisksWithHealingAndInfo(true)
   702  	askDisks := getListQuorum(o.AskDisks, er.setDriveCount)
   703  	if askDisks == -1 {
   704  		newDisks := getQuorumDisks(disks, infos, (len(disks)+1)/2)
   705  		if newDisks != nil {
    706  			// If we found a drive signature quorum, we proceed to list
    707  			// from a single drive; the drives are shuffled further below.
   708  			disks = newDisks
   709  			askDisks = 1
   710  		} else {
   711  			// If we did not find suitable disks, perform strict quorum listing
   712  			// as no disk agrees on quorum anymore.
   713  			askDisks = getListQuorum("strict", er.setDriveCount)
   714  		}
   715  	}
   716  
   717  	var fallbackDisks []StorageAPI
   718  
   719  	// Special case: ask all disks if the drive count is 4
   720  	if er.setDriveCount == 4 || askDisks > len(disks) {
   721  		askDisks = len(disks) // use all available drives
   722  	}
   723  
   724  	// However many we ask, versions must exist on ~50%
   725  	listingQuorum := (askDisks + 1) / 2
   726  
   727  	if askDisks > 0 && len(disks) > askDisks {
   728  		rand.Shuffle(len(disks), func(i, j int) {
   729  			disks[i], disks[j] = disks[j], disks[i]
   730  		})
   731  		fallbackDisks = disks[askDisks:]
   732  		disks = disks[:askDisks]
   733  	}
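         	// Drives beyond askDisks are kept as fallbacks; listPathRaw switches to
         	// one of them if a selected drive returns a storage error.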
   734  
   735  	// How to resolve results.
   736  	resolver := metadataResolutionParams{
   737  		dirQuorum: listingQuorum,
   738  		objQuorum: listingQuorum,
   739  		bucket:    o.Bucket,
   740  	}
   741  
   742  	// Maximum versions requested for "latest" object
    743  	// resolution on versioned buckets. This is only to be
    744  	// used when o.Versioned is false.
   745  	if !o.Versioned {
   746  		resolver.requestedVersions = 1
   747  	}
   748  	var limit int
   749  	if o.Limit > 0 && o.StopDiskAtLimit {
    750  		// Over-read by 4, plus 1 for every 16 in the limit, to give the resolver some slack,
    751  		// allow truncating the list, and tell whether more results exist.
   752  		limit = o.Limit + 4 + (o.Limit / 16)
   753  	}
   754  	ctxDone := ctx.Done()
   755  	return listPathRaw(ctx, listPathRawOptions{
   756  		disks:         disks,
   757  		fallbackDisks: fallbackDisks,
   758  		bucket:        o.Bucket,
   759  		path:          o.BaseDir,
   760  		recursive:     o.Recursive,
   761  		filterPrefix:  o.FilterPrefix,
   762  		minDisks:      listingQuorum,
   763  		forwardTo:     o.Marker,
   764  		perDiskLimit:  limit,
   765  		agreed: func(entry metaCacheEntry) {
   766  			select {
   767  			case <-ctxDone:
   768  			case results <- entry:
   769  			}
   770  		},
   771  		partial: func(entries metaCacheEntries, errs []error) {
   772  			// Results Disagree :-(
   773  			entry, ok := entries.resolve(&resolver)
   774  			if ok {
   775  				select {
   776  				case <-ctxDone:
   777  				case results <- *entry:
   778  				}
   779  			}
   780  		},
   781  	})
   782  }
   783  
   784  //msgp:ignore metaCacheRPC
   785  type metaCacheRPC struct {
   786  	o      listPathOptions
   787  	mu     sync.Mutex
   788  	meta   *metacache
   789  	rpc    *peerRESTClient
   790  	cancel context.CancelFunc
   791  }
   792  
   793  func (m *metaCacheRPC) setErr(err string) {
   794  	m.mu.Lock()
   795  	defer m.mu.Unlock()
   796  	meta := *m.meta
   797  	if meta.status != scanStateError {
   798  		meta.error = err
   799  		meta.status = scanStateError
   800  	} else {
   801  		// An error is already set.
   802  		return
   803  	}
   804  	meta, _ = m.o.updateMetacacheListing(meta, m.rpc)
   805  	*m.meta = meta
   806  }
   807  
   808  func (er *erasureObjects) saveMetaCacheStream(ctx context.Context, mc *metaCacheRPC, entries <-chan metaCacheEntry) (err error) {
   809  	o := mc.o
   810  	o.debugf(color.Green("saveMetaCacheStream:")+" with options: %#v", o)
   811  
   812  	metaMu := &mc.mu
   813  	rpc := mc.rpc
   814  	cancel := mc.cancel
   815  	defer func() {
   816  		o.debugln(color.Green("saveMetaCacheStream:")+"err:", err)
   817  		if err != nil && !errors.Is(err, io.EOF) {
   818  			go mc.setErr(err.Error())
   819  			cancel()
   820  		}
   821  	}()
   822  
   823  	defer cancel()
   824  	// Save continuous updates
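         	// Roughly every 10s the cache entry is refreshed on the owning peer;
         	// if no client has collected results within metacacheMaxClientWait,
         	// the scan is marked as failed and canceled.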
   825  	go func() {
   826  		var err error
   827  		ticker := time.NewTicker(10 * time.Second)
   828  		defer ticker.Stop()
   829  		var exit bool
   830  		for !exit {
   831  			select {
   832  			case <-ticker.C:
   833  			case <-ctx.Done():
   834  				exit = true
   835  			}
   836  			metaMu.Lock()
   837  			meta := *mc.meta
   838  			meta, err = o.updateMetacacheListing(meta, rpc)
   839  			if err == nil && time.Since(meta.lastHandout) > metacacheMaxClientWait {
   840  				cancel()
   841  				exit = true
   842  				meta.status = scanStateError
   843  				meta.error = fmt.Sprintf("listing canceled since time since last handout was %v ago", time.Since(meta.lastHandout).Round(time.Second))
   844  				o.debugln(color.Green("saveMetaCacheStream: ") + meta.error)
   845  				meta, err = o.updateMetacacheListing(meta, rpc)
   846  			}
   847  			if err == nil {
   848  				*mc.meta = meta
   849  				if meta.status == scanStateError {
   850  					cancel()
   851  					exit = true
   852  				}
   853  			}
   854  			metaMu.Unlock()
   855  		}
   856  	}()
   857  
   858  	const retryDelay = 200 * time.Millisecond
   859  	const maxTries = 5
   860  
   861  	// Keep destination...
   862  	// Write results to disk.
   863  	bw := newMetacacheBlockWriter(entries, func(b *metacacheBlock) error {
    864  		// If the block is 0 bytes and it is the first block, skip it.
    865  		// Skip this only for Transient caches.
   866  		if len(b.data) == 0 && b.n == 0 && o.Transient {
   867  			return nil
   868  		}
   869  		o.debugln(color.Green("saveMetaCacheStream:")+" saving block", b.n, "to", o.objectPath(b.n))
   870  		r, err := hash.NewReader(ctx, bytes.NewReader(b.data), int64(len(b.data)), "", "", int64(len(b.data)))
   871  		logger.LogIf(ctx, err)
   872  		custom := b.headerKV()
   873  		_, err = er.putMetacacheObject(ctx, o.objectPath(b.n), NewPutObjReader(r), ObjectOptions{
   874  			UserDefined: custom,
   875  		})
   876  		if err != nil {
   877  			mc.setErr(err.Error())
   878  			cancel()
   879  			return err
   880  		}
   881  		if b.n == 0 {
   882  			return nil
   883  		}
   884  		// Update block 0 metadata.
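         		// Later blocks also record their range in block 0's metadata,
         		// which is what findFirstPart consults to locate the right part.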
   885  		var retries int
   886  		for {
   887  			meta := b.headerKV()
   888  			fi := FileInfo{
   889  				Metadata: make(map[string]string, len(meta)),
   890  			}
   891  			for k, v := range meta {
   892  				fi.Metadata[k] = v
   893  			}
   894  			err := er.updateObjectMetaWithOpts(ctx, minioMetaBucket, o.objectPath(0), fi, er.getDisks(), UpdateMetadataOpts{NoPersistence: true})
   895  			if err == nil {
   896  				break
   897  			}
   898  			switch err.(type) {
   899  			case ObjectNotFound:
   900  				return err
   901  			case StorageErr:
   902  				return err
   903  			case InsufficientReadQuorum:
   904  			default:
   905  				logger.LogIf(ctx, err)
   906  			}
   907  			if retries >= maxTries {
   908  				return err
   909  			}
   910  			retries++
   911  			time.Sleep(retryDelay)
   912  		}
   913  		return nil
   914  	})
   915  
   916  	// Blocks while consuming entries or an error occurs.
   917  	err = bw.Close()
   918  	if err != nil {
   919  		mc.setErr(err.Error())
   920  	}
   921  	metaMu.Lock()
   922  	defer metaMu.Unlock()
   923  	if mc.meta.error != "" {
   924  		return err
   925  	}
   926  	// Save success
   927  	mc.meta.status = scanStateSuccess
   928  	meta, err := o.updateMetacacheListing(*mc.meta, rpc)
   929  	if err == nil {
   930  		*mc.meta = meta
   931  	}
   932  	return nil
   933  }
   934  
   935  //msgp:ignore listPathRawOptions
   936  type listPathRawOptions struct {
   937  	disks         []StorageAPI
   938  	fallbackDisks []StorageAPI
   939  	bucket, path  string
   940  	recursive     bool
   941  
   942  	// Only return results with this prefix.
   943  	filterPrefix string
   944  
   945  	// Forward to this prefix before returning results.
   946  	forwardTo string
   947  
   948  	// Minimum number of good disks to continue.
    949  	// An error will be returned if fewer than this many disks remain without errors.
   950  	minDisks       int
   951  	reportNotFound bool
   952  
   953  	// perDiskLimit will limit each disk to return n objects.
   954  	// If <= 0 all results will be returned until canceled.
   955  	perDiskLimit int
   956  
   957  	// Callbacks with results:
   958  	// If set to nil, it will not be called.
   959  
   960  	// agreed is called if all disks agreed.
   961  	agreed func(entry metaCacheEntry)
   962  
   963  	// partial will be called when there is disagreement between disks.
    964  	// If a disk did not return any result, but also hasn't errored,
    965  	// the entry will be empty and its errs entry will be nil.
   966  	partial func(entries metaCacheEntries, errs []error)
   967  
   968  	// finished will be called when all streams have finished and
   969  	// more than one disk returned an error.
   970  	// Will not be called if everything operates as expected.
   971  	finished func(errs []error)
   972  }
   973  
   974  // listPathRaw will list a path on the provided drives.
   975  // See listPathRawOptions on how results are delivered.
   976  // Directories are always returned.
   977  // Cache will be bypassed.
    978  // Context cancellation will be respected but may take a while to take effect.
   979  func listPathRaw(ctx context.Context, opts listPathRawOptions) (err error) {
   980  	disks := opts.disks
   981  	if len(disks) == 0 {
   982  		return fmt.Errorf("listPathRaw: 0 drives provided")
   983  	}
   984  
   985  	// Cancel upstream if we finish before we expect.
   986  	ctx, cancel := context.WithCancel(ctx)
   987  	defer cancel()
   988  
   989  	// Keep track of fallback disks
   990  	var fdMu sync.Mutex
   991  	fds := opts.fallbackDisks
   992  	fallback := func(err error) StorageAPI {
   993  		if _, ok := err.(StorageErr); ok {
   994  			// Attempt to grab a fallback disk
   995  			fdMu.Lock()
   996  			defer fdMu.Unlock()
   997  			if len(fds) == 0 {
   998  				return nil
   999  			}
  1000  			fdsCopy := fds
  1001  			for _, fd := range fdsCopy {
  1002  				// Grab a fallback disk
  1003  				fds = fds[1:]
  1004  				if fd != nil && fd.IsOnline() {
  1005  					return fd
  1006  				}
  1007  			}
  1008  		}
  1009  		// Either no more disks for fallback or
  1010  		// not a storage error.
  1011  		return nil
  1012  	}
  1013  	askDisks := len(disks)
  1014  	readers := make([]*metacacheReader, askDisks)
  1015  	defer func() {
  1016  		for _, r := range readers {
  1017  			r.Close()
  1018  		}
  1019  	}()
  1020  	for i := range disks {
  1021  		r, w := io.Pipe()
   1022  		// Make sure we close the pipe so blocked writes don't stay around.
  1023  		defer r.CloseWithError(context.Canceled)
  1024  
  1025  		readers[i] = newMetacacheReader(r)
  1026  		d := disks[i]
  1027  
  1028  		// Send request to each disk.
  1029  		go func() {
  1030  			var werr error
  1031  			if d == nil {
  1032  				werr = errDiskNotFound
  1033  			} else {
  1034  				werr = d.WalkDir(ctx, WalkDirOptions{
  1035  					Limit:          opts.perDiskLimit,
  1036  					Bucket:         opts.bucket,
  1037  					BaseDir:        opts.path,
  1038  					Recursive:      opts.recursive,
  1039  					ReportNotFound: opts.reportNotFound,
  1040  					FilterPrefix:   opts.filterPrefix,
  1041  					ForwardTo:      opts.forwardTo,
  1042  				}, w)
  1043  			}
  1044  
  1045  			// fallback only when set.
  1046  			for {
  1047  				fd := fallback(werr)
  1048  				if fd == nil {
  1049  					break
  1050  				}
  1051  				// This fallback is only set when
  1052  				// askDisks is less than total
  1053  				// number of disks per set.
  1054  				werr = fd.WalkDir(ctx, WalkDirOptions{
  1055  					Limit:          opts.perDiskLimit,
  1056  					Bucket:         opts.bucket,
  1057  					BaseDir:        opts.path,
  1058  					Recursive:      opts.recursive,
  1059  					ReportNotFound: opts.reportNotFound,
  1060  					FilterPrefix:   opts.filterPrefix,
  1061  					ForwardTo:      opts.forwardTo,
  1062  				}, w)
  1063  				if werr == nil {
  1064  					break
  1065  				}
  1066  			}
  1067  			w.CloseWithError(werr)
  1068  		}()
  1069  	}
  1070  
  1071  	topEntries := make(metaCacheEntries, len(readers))
  1072  	errs := make([]error, len(readers))
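         	// Merge the per-drive streams: on each pass, peek every reader, pick the
         	// lexically smallest name, report it via agreed/partial, and advance only
         	// the readers that contributed that entry.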
  1073  	for {
  1074  		// Get the top entry from each
  1075  		var current metaCacheEntry
  1076  		var atEOF, fnf, vnf, hasErr, agree int
  1077  		for i := range topEntries {
  1078  			topEntries[i] = metaCacheEntry{}
  1079  		}
  1080  		if contextCanceled(ctx) {
  1081  			return ctx.Err()
  1082  		}
  1083  		for i, r := range readers {
  1084  			if errs[i] != nil {
  1085  				hasErr++
  1086  				continue
  1087  			}
  1088  			entry, err := r.peek()
  1089  			switch err {
  1090  			case io.EOF:
  1091  				atEOF++
  1092  				continue
  1093  			case nil:
  1094  			default:
  1095  				switch err.Error() {
  1096  				case errFileNotFound.Error(),
  1097  					errVolumeNotFound.Error(),
  1098  					errUnformattedDisk.Error(),
  1099  					errDiskNotFound.Error():
  1100  					atEOF++
  1101  					fnf++
   1102  					// This is a special case, to handle situations where
   1103  					// the bucket does not exist.
  1104  					if errors.Is(err, errVolumeNotFound) {
  1105  						vnf++
  1106  					}
  1107  					continue
  1108  				}
  1109  				hasErr++
  1110  				errs[i] = err
  1111  				continue
  1112  			}
  1113  			// If no current, add it.
  1114  			if current.name == "" {
  1115  				topEntries[i] = entry
  1116  				current = entry
  1117  				agree++
  1118  				continue
  1119  			}
  1120  			// If exact match, we agree.
  1121  			if _, ok := current.matches(&entry, true); ok {
  1122  				topEntries[i] = entry
  1123  				agree++
  1124  				continue
  1125  			}
  1126  			// If only the name matches we didn't agree, but add it for resolution.
  1127  			if entry.name == current.name {
  1128  				topEntries[i] = entry
  1129  				continue
  1130  			}
  1131  			// We got different entries
  1132  			if entry.name > current.name {
  1133  				continue
  1134  			}
  1135  			// We got a new, better current.
  1136  			// Clear existing entries.
  1137  			for i := range topEntries[:i] {
  1138  				topEntries[i] = metaCacheEntry{}
  1139  			}
  1140  			agree = 1
  1141  			current = entry
  1142  			topEntries[i] = entry
  1143  		}
  1144  
  1145  		// Stop if we exceed number of bad disks
  1146  		if hasErr > len(disks)-opts.minDisks && hasErr > 0 {
  1147  			if opts.finished != nil {
  1148  				opts.finished(errs)
  1149  			}
  1150  			var combinedErr []string
  1151  			for i, err := range errs {
  1152  				if err != nil {
  1153  					if disks[i] != nil {
  1154  						combinedErr = append(combinedErr,
  1155  							fmt.Sprintf("drive %s returned: %s", disks[i], err))
  1156  					} else {
  1157  						combinedErr = append(combinedErr, err.Error())
  1158  					}
  1159  				}
  1160  			}
  1161  			return errors.New(strings.Join(combinedErr, ", "))
  1162  		}
  1163  
  1164  		if vnf == len(readers) {
  1165  			return errVolumeNotFound
  1166  		}
  1167  
  1168  		// Break if all at EOF or error.
  1169  		if atEOF+hasErr == len(readers) {
  1170  			if hasErr > 0 && opts.finished != nil {
  1171  				opts.finished(errs)
  1172  			}
  1173  			break
  1174  		}
  1175  
  1176  		if fnf == len(readers) {
  1177  			return errFileNotFound
  1178  		}
  1179  
  1180  		if agree == len(readers) {
  1181  			// Everybody agreed
  1182  			for _, r := range readers {
  1183  				r.skip(1)
  1184  			}
  1185  			if opts.agreed != nil {
  1186  				opts.agreed(current)
  1187  			}
  1188  			continue
  1189  		}
  1190  		if opts.partial != nil {
  1191  			opts.partial(topEntries, errs)
  1192  		}
  1193  		// Skip the inputs we used.
  1194  		for i, r := range readers {
  1195  			if topEntries[i].name != "" {
  1196  				r.skip(1)
  1197  			}
  1198  		}
  1199  	}
  1200  	return nil
  1201  }