github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure-object.go

     1  // Copyright (c) 2015-2021 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package cmd
    19  
    20  import (
    21  	"bytes"
    22  	"context"
    23  	"errors"
    24  	"fmt"
    25  	"io"
    26  	"net/http"
    27  	"path"
    28  	"runtime"
    29  	"strconv"
    30  	"strings"
    31  	"sync"
    32  	"time"
    33  
    34  	"github.com/dustin/go-humanize"
    35  	"github.com/klauspost/readahead"
    36  	"github.com/minio/madmin-go/v3"
    37  	"github.com/minio/minio-go/v7/pkg/tags"
    38  	"github.com/minio/minio/internal/bucket/lifecycle"
    39  	"github.com/minio/minio/internal/bucket/object/lock"
    40  	"github.com/minio/minio/internal/bucket/replication"
    41  	"github.com/minio/minio/internal/config/storageclass"
    42  	"github.com/minio/minio/internal/crypto"
    43  	"github.com/minio/minio/internal/event"
    44  	"github.com/minio/minio/internal/hash"
    45  	xhttp "github.com/minio/minio/internal/http"
    46  	xioutil "github.com/minio/minio/internal/ioutil"
    47  	"github.com/minio/minio/internal/logger"
    48  	"github.com/minio/pkg/v2/mimedb"
    49  	"github.com/minio/pkg/v2/sync/errgroup"
    50  	"github.com/minio/pkg/v2/wildcard"
    51  )
    52  
    53  // List of all errors that can be ignored in object operations.
    54  var objectOpIgnoredErrs = append(baseIgnoredErrs, errDiskAccessDenied, errUnformattedDisk, errDiskOngoingReq)
    55  
    56  // Object Operations
    57  
    58  func countOnlineDisks(onlineDisks []StorageAPI) (online int) {
    59  	for _, onlineDisk := range onlineDisks {
    60  		if onlineDisk != nil && onlineDisk.IsOnline() {
    61  			online++
    62  		}
    63  	}
    64  	return online
    65  }
    66  
    67  // CopyObject - copies the source object to the destination object.
    68  // If the source and destination objects are the same, we only
    69  // update the metadata.
    70  func (er erasureObjects) CopyObject(ctx context.Context, srcBucket, srcObject, dstBucket, dstObject string, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (oi ObjectInfo, err error) {
    71  	if !dstOpts.NoAuditLog {
    72  		auditObjectErasureSet(ctx, dstObject, &er)
    73  	}
    74  
    75  	// This call shouldn't be used for anything other than metadata updates or adding self referential versions.
    76  	if !srcInfo.metadataOnly {
    77  		return oi, NotImplemented{}
    78  	}
    79  
    80  	if !dstOpts.NoLock {
    81  		lk := er.NewNSLock(dstBucket, dstObject)
    82  		lkctx, err := lk.GetLock(ctx, globalOperationTimeout)
    83  		if err != nil {
    84  			return oi, err
    85  		}
    86  		ctx = lkctx.Context()
    87  		defer lk.Unlock(lkctx)
    88  	}
    89  	// Read metadata associated with the object from all disks.
    90  	storageDisks := er.getDisks()
    91  
    92  	var metaArr []FileInfo
    93  	var errs []error
    94  
    95  	// Read metadata associated with the object from all disks.
    96  	if srcOpts.VersionID != "" {
    97  		metaArr, errs = readAllFileInfo(ctx, storageDisks, "", srcBucket, srcObject, srcOpts.VersionID, true, false)
    98  	} else {
    99  		metaArr, errs = readAllXL(ctx, storageDisks, srcBucket, srcObject, true, false, true)
   100  	}
   101  
   102  	readQuorum, writeQuorum, err := objectQuorumFromMeta(ctx, metaArr, errs, er.defaultParityCount)
   103  	if err != nil {
   104  		if errors.Is(err, errErasureReadQuorum) && !strings.HasPrefix(srcBucket, minioMetaBucket) {
   105  			_, derr := er.deleteIfDangling(context.Background(), srcBucket, srcObject, metaArr, errs, nil, srcOpts)
   106  			if derr != nil {
   107  				err = derr
   108  			}
   109  		}
   110  		return ObjectInfo{}, toObjectErr(err, srcBucket, srcObject)
   111  	}
   112  
   113  	// List all online disks.
   114  	onlineDisks, modTime, etag := listOnlineDisks(storageDisks, metaArr, errs, readQuorum)
   115  
   116  	// Pick latest valid metadata.
   117  	fi, err := pickValidFileInfo(ctx, metaArr, modTime, etag, readQuorum)
   118  	if err != nil {
   119  		return oi, toObjectErr(err, srcBucket, srcObject)
   120  	}
   121  	if fi.Deleted {
   122  		if srcOpts.VersionID == "" {
   123  			return oi, toObjectErr(errFileNotFound, srcBucket, srcObject)
   124  		}
   125  		return fi.ToObjectInfo(srcBucket, srcObject, srcOpts.Versioned || srcOpts.VersionSuspended), toObjectErr(errMethodNotAllowed, srcBucket, srcObject)
   126  	}
   127  
   128  	filterOnlineDisksInplace(fi, metaArr, onlineDisks)
   129  
   130  	versionID := srcInfo.VersionID
   131  	if srcInfo.versionOnly {
   132  		versionID = dstOpts.VersionID
   133  		// preserve destination versionId if specified.
   134  		if versionID == "" {
   135  			versionID = mustGetUUID()
   136  			fi.IsLatest = true // we are creating a new version so this is latest.
   137  		}
   138  	}
   139  
   140  	modTime = UTCNow() // We only preserve modTime if dstOpts.MTime is set.
   141  	// In all other cases mtime is the current time.
   142  
   143  	fi.VersionID = versionID // set any new versionID we might have created
   144  	fi.ModTime = modTime     // set modTime for the new versionID
   145  	if !dstOpts.MTime.IsZero() {
   146  		modTime = dstOpts.MTime
   147  		fi.ModTime = dstOpts.MTime
   148  	}
   149  
   150  	fi.Metadata = srcInfo.UserDefined
   151  	srcInfo.UserDefined["etag"] = srcInfo.ETag
   152  
   153  	inlineData := fi.InlineData()
   154  	freeVersionID := fi.TierFreeVersionID()
   155  	freeVersionMarker := fi.TierFreeVersion()
   156  
   157  	// Update `xl.meta` content on each disk.
   158  	for index := range metaArr {
   159  		if metaArr[index].IsValid() {
   160  			metaArr[index].ModTime = modTime
   161  			metaArr[index].VersionID = versionID
   162  			if !metaArr[index].InlineData() {
   163  				// If the data is not inlined, we may end up incorrectly
   164  				// inlining the data here, which leads to an inconsistent
   165  				// situation where objects that were not inlined before
   166  				// suddenly become inlined; make sure to `nil` the Data so
   167  				// that xl.meta is written as expected.
   168  				metaArr[index].Data = nil
   169  			}
   170  			metaArr[index].Metadata = srcInfo.UserDefined
   171  			// Preserve existing values
   172  			if inlineData {
   173  				metaArr[index].SetInlineData()
   174  			}
   175  			if freeVersionID != "" {
   176  				metaArr[index].SetTierFreeVersionID(freeVersionID)
   177  			}
   178  			if freeVersionMarker {
   179  				metaArr[index].SetTierFreeVersion()
   180  			}
   181  		}
   182  	}
   183  
   184  	// Write unique `xl.meta` for each disk.
   185  	if _, err = writeUniqueFileInfo(ctx, onlineDisks, "", srcBucket, srcObject, metaArr, writeQuorum); err != nil {
   186  		return oi, toObjectErr(err, srcBucket, srcObject)
   187  	}
   188  
   189  	return fi.ToObjectInfo(srcBucket, srcObject, srcOpts.Versioned || srcOpts.VersionSuspended), nil
   190  }
   191  
   192  // GetObjectNInfo - returns object info and an object
   193  // Read(Closer). When err != nil, the returned reader is always nil.
   194  func (er erasureObjects) GetObjectNInfo(ctx context.Context, bucket, object string, rs *HTTPRangeSpec, h http.Header, opts ObjectOptions) (gr *GetObjectReader, err error) {
   195  	if !opts.NoAuditLog {
   196  		auditObjectErasureSet(ctx, object, &er)
   197  	}
   198  
   199  	var unlockOnDefer bool
   200  	nsUnlocker := func() {}
   201  	defer func() {
   202  		if unlockOnDefer {
   203  			nsUnlocker()
   204  		}
   205  	}()
   206  
   207  	// Acquire lock
   208  	if !opts.NoLock {
   209  		lock := er.NewNSLock(bucket, object)
   210  		lkctx, err := lock.GetRLock(ctx, globalOperationTimeout)
   211  		if err != nil {
   212  			return nil, err
   213  		}
   214  		ctx = lkctx.Context()
   215  
   216  		// Release the lock once the metadata is verified and the reader
   217  		// is ready to be read.
   218  		//
   219  		// Going lock free here is possible because:
   220  		// - for inlined objects, xl.meta has already read the data
   221  		//   into memory, so any subsequent mutation of xl.meta is
   222  		//   inconsequential to the overall read operation.
   223  		// - xl.meta metadata is still verified for quorum under lock();
   224  		//   however, writing the response doesn't need to serialize
   225  		//   with concurrent writers.
   226  		unlockOnDefer = true
   227  		nsUnlocker = func() { lock.RUnlock(lkctx) }
   228  	}
   229  
   230  	fi, metaArr, onlineDisks, err := er.getObjectFileInfo(ctx, bucket, object, opts, true)
   231  	if err != nil {
   232  		return nil, toObjectErr(err, bucket, object)
   233  	}
   234  
   235  	objInfo := fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended)
   236  	if objInfo.DeleteMarker {
   237  		if opts.VersionID == "" {
   238  			return &GetObjectReader{
   239  				ObjInfo: objInfo,
   240  			}, toObjectErr(errFileNotFound, bucket, object)
   241  		}
   242  		// Make sure to return object info to provide extra information.
   243  		return &GetObjectReader{
   244  			ObjInfo: objInfo,
   245  		}, toObjectErr(errMethodNotAllowed, bucket, object)
   246  	}
   247  
   248  	// Set NoDecryption for SSE-C objects when this is a replication request.
   249  	if crypto.SSEC.IsEncrypted(objInfo.UserDefined) && opts.ReplicationRequest {
   250  		opts.NoDecryption = true
   251  	}
   252  
   253  	if objInfo.IsRemote() {
   254  		gr, err := getTransitionedObjectReader(ctx, bucket, object, rs, h, objInfo, opts)
   255  		if err != nil {
   256  			return nil, err
   257  		}
   258  		unlockOnDefer = false
   259  		return gr.WithCleanupFuncs(nsUnlocker), nil
   260  	}
   261  
   262  	if objInfo.Size == 0 {
   263  		// Zero byte objects don't even need to further initialize pipes etc.
   264  		return NewGetObjectReaderFromReader(bytes.NewReader(nil), objInfo, opts)
   265  	}
   266  
   267  	fn, off, length, err := NewGetObjectReader(rs, objInfo, opts)
   268  	if err != nil {
   269  		return nil, err
   270  	}
   271  
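        	// Keep holding the read lock only when the object data is inlined in
        	// xl.meta (already in memory); otherwise the unlocker is handed to the
        	// returned reader below as a cleanup function.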
   272  	if unlockOnDefer {
   273  		unlockOnDefer = fi.InlineData()
   274  	}
   275  
   276  	pr, pw := xioutil.WaitPipe()
   277  	go func() {
   278  		pw.CloseWithError(er.getObjectWithFileInfo(ctx, bucket, object, off, length, pw, fi, metaArr, onlineDisks))
   279  	}()
   280  
   281  	// Cleanup function to cause the goroutine above to exit, in
   282  	// case of an incomplete read.
   283  	pipeCloser := func() {
   284  		pr.CloseWithError(nil)
   285  	}
   286  
   287  	if !unlockOnDefer {
   288  		return fn(pr, h, pipeCloser, nsUnlocker)
   289  	}
   290  
   291  	return fn(pr, h, pipeCloser)
   292  }
   293  
   294  func (er erasureObjects) getObjectWithFileInfo(ctx context.Context, bucket, object string, startOffset int64, length int64, writer io.Writer, fi FileInfo, metaArr []FileInfo, onlineDisks []StorageAPI) error {
   295  	// Reorder online disks based on erasure distribution order.
   296  	// Reorder parts metadata based on erasure distribution order.
   297  	onlineDisks, metaArr = shuffleDisksAndPartsMetadataByIndex(onlineDisks, metaArr, fi)
   298  
   299  	// For negative length read everything.
   300  	if length < 0 {
   301  		length = fi.Size - startOffset
   302  	}
   303  
   304  	// Reply back invalid range if the input offset and length fall out of range.
   305  	if startOffset > fi.Size || startOffset+length > fi.Size {
   306  		return InvalidRange{startOffset, length, fi.Size}
   307  	}
   308  
   309  	// Get start part index and offset.
   310  	partIndex, partOffset, err := fi.ObjectToPartOffset(ctx, startOffset)
   311  	if err != nil {
   312  		return InvalidRange{startOffset, length, fi.Size}
   313  	}
   314  
   315  	// Calculate endOffset according to length
   316  	endOffset := startOffset
   317  	if length > 0 {
   318  		endOffset += length - 1
   319  	}
   320  
   321  	// Get last part index to read given length.
   322  	lastPartIndex, _, err := fi.ObjectToPartOffset(ctx, endOffset)
   323  	if err != nil {
   324  		return InvalidRange{startOffset, length, fi.Size}
   325  	}
   326  
   327  	var totalBytesRead int64
   328  	erasure, err := NewErasure(ctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize)
   329  	if err != nil {
   330  		return toObjectErr(err, bucket, object)
   331  	}
   332  
   333  	var healOnce sync.Once
   334  
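        	// Stream the requested byte range part by part: for each part, build
        	// bitrot-verifying readers over the erasure shards, decode the data and
        	// write the plain bytes directly to the client writer.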
   335  	for ; partIndex <= lastPartIndex; partIndex++ {
   336  		if length == totalBytesRead {
   337  			break
   338  		}
   339  
   340  		partNumber := fi.Parts[partIndex].Number
   341  
   342  		// Save the current part name and size.
   343  		partSize := fi.Parts[partIndex].Size
   344  
   345  		partLength := partSize - partOffset
   346  		// partLength should be adjusted so that we don't write more data than what was requested.
   347  		if partLength > (length - totalBytesRead) {
   348  			partLength = length - totalBytesRead
   349  		}
   350  
   351  		tillOffset := erasure.ShardFileOffset(partOffset, partLength, partSize)
   352  		// Get the checksums of the current part.
   353  		readers := make([]io.ReaderAt, len(onlineDisks))
   354  		prefer := make([]bool, len(onlineDisks))
   355  		for index, disk := range onlineDisks {
   356  			if disk == OfflineDisk {
   357  				continue
   358  			}
   359  			if !metaArr[index].IsValid() {
   360  				continue
   361  			}
   362  			if !metaArr[index].Erasure.Equal(fi.Erasure) {
   363  				continue
   364  			}
   365  			checksumInfo := metaArr[index].Erasure.GetChecksumInfo(partNumber)
   366  			partPath := pathJoin(object, metaArr[index].DataDir, fmt.Sprintf("part.%d", partNumber))
   367  			readers[index] = newBitrotReader(disk, metaArr[index].Data, bucket, partPath, tillOffset,
   368  				checksumInfo.Algorithm, checksumInfo.Hash, erasure.ShardSize())
   369  
   370  			// Prefer local disks
   371  			prefer[index] = disk.Hostname() == ""
   372  		}
   373  
   374  		written, err := erasure.Decode(ctx, writer, readers, partOffset, partLength, partSize, prefer)
   375  		// Note: we should not defer the following closeBitrotReaders() call as
   376  		// we are inside a for loop, i.e. if we used defer, we would accumulate a lot of open files by the time
   377  		// we return from this function.
   378  		closeBitrotReaders(readers)
   379  		if err != nil {
   380  			// If we have successfully written all the content that was asked
   381  			// for by the client but we still see an error, this means
   382  			// that some parts or data blocks are missing or corrupted;
   383  			// queue a heal so they are repaired for future calls.
   384  			if written == partLength {
   385  				var scan madmin.HealScanMode
   386  				switch {
   387  				case errors.Is(err, errFileNotFound):
   388  					scan = madmin.HealNormalScan
   389  				case errors.Is(err, errFileCorrupt):
   390  					scan = madmin.HealDeepScan
   391  				}
   392  				switch scan {
   393  				case madmin.HealNormalScan, madmin.HealDeepScan:
   394  					healOnce.Do(func() {
   395  						globalMRFState.addPartialOp(partialOperation{
   396  							bucket:    bucket,
   397  							object:    object,
   398  							versionID: fi.VersionID,
   399  							queued:    time.Now(),
   400  							setIndex:  er.setIndex,
   401  							poolIndex: er.poolIndex,
   402  							scanMode:  scan,
   403  						})
   404  					})
   405  					// Healing has been queued and we have
   406  					// successfully written the content of this
   407  					// part to the client, so `nil` this error
   408  					// and proceed instead of returning errors.
   409  					err = nil
   410  				}
   411  			}
   412  			if err != nil {
   413  				return toObjectErr(err, bucket, object)
   414  			}
   415  		}
   416  		for i, r := range readers {
   417  			if r == nil {
   418  				onlineDisks[i] = OfflineDisk
   419  			}
   420  		}
   421  		// Track total bytes read from disk and written to the client.
   422  		totalBytesRead += partLength
   423  		// partOffset will be valid only for the first part, hence reset it to 0 for
   424  		// the remaining parts.
   425  		partOffset = 0
   426  	} // End of read all parts loop.
   427  	// Return success.
   428  	return nil
   429  }
   430  
   431  // GetObjectInfo - reads object metadata and replies back ObjectInfo.
   432  func (er erasureObjects) GetObjectInfo(ctx context.Context, bucket, object string, opts ObjectOptions) (info ObjectInfo, err error) {
   433  	if !opts.NoAuditLog {
   434  		auditObjectErasureSet(ctx, object, &er)
   435  	}
   436  
   437  	if !opts.NoLock {
   438  		// Lock the object before reading.
   439  		lk := er.NewNSLock(bucket, object)
   440  		lkctx, err := lk.GetRLock(ctx, globalOperationTimeout)
   441  		if err != nil {
   442  			return ObjectInfo{}, err
   443  		}
   444  		ctx = lkctx.Context()
   445  		defer lk.RUnlock(lkctx)
   446  	}
   447  
   448  	return er.getObjectInfo(ctx, bucket, object, opts)
   449  }
   450  
   451  func auditDanglingObjectDeletion(ctx context.Context, bucket, object, versionID string, tags map[string]interface{}) {
   452  	if len(logger.AuditTargets()) == 0 {
   453  		return
   454  	}
   455  
   456  	opts := AuditLogOptions{
   457  		Event:     "DeleteDanglingObject",
   458  		Bucket:    bucket,
   459  		Object:    object,
   460  		VersionID: versionID,
   461  		Tags:      tags,
   462  	}
   463  
   464  	auditLogInternal(ctx, opts)
   465  }
   466  
   467  func joinErrs(errs []error) []string {
   468  	s := make([]string, len(errs))
   469  	for i := range s {
   470  		if errs[i] == nil {
   471  			s[i] = "<nil>"
   472  		} else {
   473  			s[i] = errs[i].Error()
   474  		}
   475  	}
   476  	return s
   477  }
   478  
   479  func (er erasureObjects) deleteIfDangling(ctx context.Context, bucket, object string, metaArr []FileInfo, errs []error, dataErrs []error, opts ObjectOptions) (FileInfo, error) {
   480  	var err error
   481  	m, ok := isObjectDangling(metaArr, errs, dataErrs)
   482  	if ok {
   483  		tags := make(map[string]interface{}, 4)
   484  		tags["set"] = er.setIndex
   485  		tags["pool"] = er.poolIndex
   486  		tags["merrs"] = joinErrs(errs)
   487  		tags["derrs"] = joinErrs(dataErrs)
   488  		if m.IsValid() {
   489  			tags["size"] = m.Size
   490  			tags["mtime"] = m.ModTime.Format(http.TimeFormat)
   491  			tags["data"] = m.Erasure.DataBlocks
   492  			tags["parity"] = m.Erasure.ParityBlocks
   493  		} else {
   494  			tags["invalid-meta"] = true
   495  			tags["data"] = er.setDriveCount - er.defaultParityCount
   496  			tags["parity"] = er.defaultParityCount
   497  		}
   498  
   499  		// count the number of offline disks
   500  		offline := 0
   501  		for i := 0; i < max(len(errs), len(dataErrs)); i++ {
   502  			if i < len(errs) && errors.Is(errs[i], errDiskNotFound) || i < len(dataErrs) && errors.Is(dataErrs[i], errDiskNotFound) {
   503  				offline++
   504  			}
   505  		}
   506  		if offline > 0 {
   507  			tags["offline"] = offline
   508  		}
   509  
   510  		_, file, line, cok := runtime.Caller(1)
   511  		if cok {
   512  			tags["caller"] = fmt.Sprintf("%s:%d", file, line)
   513  		}
   514  
   515  		defer auditDanglingObjectDeletion(ctx, bucket, object, m.VersionID, tags)
   516  
   517  		err = errFileNotFound
   518  		if opts.VersionID != "" {
   519  			err = errFileVersionNotFound
   520  		}
   521  
   522  		fi := FileInfo{
   523  			VersionID: m.VersionID,
   524  		}
   525  		if opts.VersionID != "" {
   526  			fi.VersionID = opts.VersionID
   527  		}
   528  		fi.SetTierFreeVersionID(mustGetUUID())
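        		// Delete the dangling version from every drive in parallel and record
        		// the per-drive outcome, which is attached to the audit log below.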
   529  		disks := er.getDisks()
   530  		g := errgroup.WithNErrs(len(disks))
   531  		for index := range disks {
   532  			index := index
   533  			g.Go(func() error {
   534  				if disks[index] == nil {
   535  					return errDiskNotFound
   536  				}
   537  				return disks[index].DeleteVersion(ctx, bucket, object, fi, false, DeleteOptions{})
   538  			}, index)
   539  		}
   540  
   541  		rmDisks := make(map[string]string, len(disks))
   542  		for index, err := range g.Wait() {
   543  			var errStr, diskName string
   544  			if err != nil {
   545  				errStr = err.Error()
   546  			} else {
   547  				errStr = "<nil>"
   548  			}
   549  			if disks[index] != nil {
   550  				diskName = disks[index].String()
   551  			} else {
   552  				diskName = fmt.Sprintf("disk-%d", index)
   553  			}
   554  			rmDisks[diskName] = errStr
   555  		}
   556  		tags["cleanupResult"] = rmDisks
   557  	}
   558  	return m, err
   559  }
   560  
   561  func fileInfoFromRaw(ri RawFileInfo, bucket, object string, readData, inclFreeVers, allParts bool) (FileInfo, error) {
   562  	var xl xlMetaV2
   563  	if err := xl.LoadOrConvert(ri.Buf); err != nil {
   564  		return FileInfo{}, err
   565  	}
   566  
   567  	fi, err := xl.ToFileInfo(bucket, object, "", inclFreeVers, allParts)
   568  	if err != nil {
   569  		return FileInfo{}, err
   570  	}
   571  
   572  	if !fi.IsValid() {
   573  		return FileInfo{}, errCorruptedFormat
   574  	}
   575  
   576  	versionID := fi.VersionID
   577  	if versionID == "" {
   578  		versionID = nullVersionID
   579  	}
   580  
   581  	fileInfo, err := xl.ToFileInfo(bucket, object, versionID, inclFreeVers, allParts)
   582  	if err != nil {
   583  		return FileInfo{}, err
   584  	}
   585  
   586  	if readData {
   587  		fileInfo.Data = xl.data.find(versionID)
   588  	}
   589  
   590  	return fileInfo, nil
   591  }
   592  
   593  func readAllRawFileInfo(ctx context.Context, disks []StorageAPI, bucket, object string, readData bool) ([]RawFileInfo, []error) {
   594  	rawFileInfos := make([]RawFileInfo, len(disks))
   595  	g := errgroup.WithNErrs(len(disks))
   596  	for index := range disks {
   597  		index := index
   598  		g.Go(func() (err error) {
   599  			if disks[index] == nil {
   600  				return errDiskNotFound
   601  			}
   602  			rf, err := disks[index].ReadXL(ctx, bucket, object, readData)
   603  			if err != nil {
   604  				return err
   605  			}
   606  			rawFileInfos[index] = rf
   607  			return nil
   608  		}, index)
   609  	}
   610  
   611  	return rawFileInfos, g.Wait()
   612  }
   613  
   614  func pickLatestQuorumFilesInfo(ctx context.Context, rawFileInfos []RawFileInfo, errs []error, bucket, object string, readData, inclFreeVers, allParts bool) ([]FileInfo, []error) {
   615  	metadataArray := make([]*xlMetaV2, len(rawFileInfos))
   616  	metaFileInfos := make([]FileInfo, len(rawFileInfos))
   617  	metadataShallowVersions := make([][]xlMetaV2ShallowVersion, len(rawFileInfos))
   618  	var v2bufs [][]byte
   619  	if !readData {
   620  		v2bufs = make([][]byte, len(rawFileInfos))
   621  	}
   622  
   623  	// Read `xl.meta` in parallel across disks.
   624  	for index := range rawFileInfos {
   625  		rf := rawFileInfos[index]
   626  		if rf.Buf == nil {
   627  			continue
   628  		}
   629  		if !readData {
   630  			// Save the buffer so we can reuse it.
   631  			v2bufs[index] = rf.Buf
   632  		}
   633  
   634  		var xl xlMetaV2
   635  		if err := xl.LoadOrConvert(rf.Buf); err != nil {
   636  			errs[index] = err
   637  			continue
   638  		}
   639  		metadataArray[index] = &xl
   640  		metaFileInfos[index] = FileInfo{}
   641  	}
   642  
   643  	for index := range metadataArray {
   644  		if metadataArray[index] != nil {
   645  			metadataShallowVersions[index] = metadataArray[index].versions
   646  		}
   647  	}
   648  
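        	// Require at least half of the drives to agree when merging the shallow
        	// versions collected from each drive's xl.meta.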
   649  	readQuorum := (len(rawFileInfos) + 1) / 2
   650  	meta := &xlMetaV2{versions: mergeXLV2Versions(readQuorum, false, 1, metadataShallowVersions...)}
   651  	lfi, err := meta.ToFileInfo(bucket, object, "", inclFreeVers, allParts)
   652  	if err != nil {
   653  		for i := range errs {
   654  			if errs[i] == nil {
   655  				errs[i] = err
   656  			}
   657  		}
   658  		return metaFileInfos, errs
   659  	}
   660  	if !lfi.IsValid() {
   661  		for i := range errs {
   662  			if errs[i] == nil {
   663  				errs[i] = errCorruptedFormat
   664  			}
   665  		}
   666  		return metaFileInfos, errs
   667  	}
   668  
   669  	versionID := lfi.VersionID
   670  	if versionID == "" {
   671  		versionID = nullVersionID
   672  	}
   673  
   674  	for index := range metadataArray {
   675  		if metadataArray[index] == nil {
   676  			continue
   677  		}
   678  
   679  		// make sure to preserve this for diskmtime based healing bugfix.
   680  		metaFileInfos[index], errs[index] = metadataArray[index].ToFileInfo(bucket, object, versionID, inclFreeVers, allParts)
   681  		if errs[index] != nil {
   682  			continue
   683  		}
   684  
   685  		if readData {
   686  			metaFileInfos[index].Data = metadataArray[index].data.find(versionID)
   687  		}
   688  	}
   689  	if !readData {
   690  		for i := range v2bufs {
   691  			metaDataPoolPut(v2bufs[i])
   692  		}
   693  	}
   694  
   695  	// Return all the metadata.
   696  	return metaFileInfos, errs
   697  }
   698  
   699  // Checking if an object is dangling costs some IOPS; hence this function
   700  // decides under which conditions it is useful to check if an object is dangling.
   701  //
   702  //	  errs: errors from reading xl.meta on all disks
   703  //	   err: reduced errs
   704  //	bucket: the bucket in question
   705  func shouldCheckForDangling(err error, errs []error, bucket string) bool {
   706  	// Avoid data in .minio.sys for now
   707  	if bucket == minioMetaBucket {
   708  		return false
   709  	}
   710  	switch {
   711  	// Check if we have a read quorum issue
   712  	case errors.Is(err, errErasureReadQuorum):
   713  		return true
   714  	// Check if the object is nonexistent on most disks, but not all of them
   715  	case errors.Is(err, errFileNotFound) || errors.Is(err, errFileVersionNotFound):
   716  		for i := range errs {
   717  			if errs[i] == nil {
   718  				return true
   719  			}
   720  		}
   721  	}
   722  	return false
   723  }
   724  
   725  func readAllXL(ctx context.Context, disks []StorageAPI, bucket, object string, readData, inclFreeVers, allParts bool) ([]FileInfo, []error) {
   726  	rawFileInfos, errs := readAllRawFileInfo(ctx, disks, bucket, object, readData)
   727  	return pickLatestQuorumFilesInfo(ctx, rawFileInfos, errs, bucket, object, readData, inclFreeVers, allParts)
   728  }
   729  
   730  func (er erasureObjects) getObjectFileInfo(ctx context.Context, bucket, object string, opts ObjectOptions, readData bool) (FileInfo, []FileInfo, []StorageAPI, error) {
   731  	rawArr := make([]RawFileInfo, er.setDriveCount)
   732  	metaArr := make([]FileInfo, er.setDriveCount)
   733  	errs := make([]error, er.setDriveCount)
   734  	for i := range errs {
   735  		errs[i] = errDiskOngoingReq
   736  	}
   737  
   738  	done := make(chan bool, er.setDriveCount)
   739  	disks := er.getDisks()
   740  
   741  	ropts := ReadOptions{
   742  		ReadData: readData,
   743  		Healing:  false,
   744  	}
   745  
   746  	mrfCheck := make(chan FileInfo)
   747  	defer xioutil.SafeClose(mrfCheck)
   748  
   749  	var rw sync.Mutex
   750  
   751  	// Ask all disks in parallel first; results are reported back on the done channel.
   752  	go func() {
   753  		ctx, cancel := context.WithCancel(ctx)
   754  		defer cancel()
   755  
   756  		wg := sync.WaitGroup{}
   757  		for i, disk := range disks {
   758  			if disk == nil {
   759  				done <- false
   760  				continue
   761  			}
   762  			if !disk.IsOnline() {
   763  				done <- false
   764  				continue
   765  			}
   766  			wg.Add(1)
   767  			go func(i int, disk StorageAPI) {
   768  				defer wg.Done()
   769  
   770  				var (
   771  					fi  FileInfo
   772  					rfi RawFileInfo
   773  					err error
   774  				)
   775  
   776  				if opts.VersionID != "" {
   777  					// Read a specific version ID
   778  					fi, err = disk.ReadVersion(ctx, "", bucket, object, opts.VersionID, ropts)
   779  				} else {
   780  					// Read the latest version
   781  					rfi, err = disk.ReadXL(ctx, bucket, object, readData)
   782  					if err == nil {
   783  						fi, err = fileInfoFromRaw(rfi, bucket, object, readData, opts.InclFreeVersions, true)
   784  					}
   785  				}
   786  
   787  				rw.Lock()
   788  				rawArr[i] = rfi
   789  				metaArr[i], errs[i] = fi, err
   790  				rw.Unlock()
   791  
   792  				done <- err == nil
   793  			}(i, disk)
   794  		}
   795  
   796  		wg.Wait()
   797  		xioutil.SafeClose(done)
   798  
   799  		fi, ok := <-mrfCheck
   800  		if !ok {
   801  			return
   802  		}
   803  
   804  		if fi.Deleted {
   805  			return
   806  		}
   807  
   808  		// if one of the disks is offline, return right here; no need
   809  		// to attempt a heal on the object.
   810  		if countErrs(errs, errDiskNotFound) > 0 {
   811  			return
   812  		}
   813  
   814  		var missingBlocks int
   815  		for i := range errs {
   816  			if IsErr(errs[i],
   817  				errFileNotFound,
   818  				errFileVersionNotFound,
   819  				errFileCorrupt,
   820  			) {
   821  				missingBlocks++
   822  			}
   823  		}
   824  
   825  		// if missing metadata can be reconstructed, attempt to reconstruct.
   826  		// additionally do not heal delete markers inline; let them be
   827  		// healed by the regular heal process.
   828  		if missingBlocks > 0 && missingBlocks < fi.Erasure.DataBlocks {
   829  			globalMRFState.addPartialOp(partialOperation{
   830  				bucket:    fi.Volume,
   831  				object:    fi.Name,
   832  				versionID: fi.VersionID,
   833  				queued:    time.Now(),
   834  				setIndex:  er.setIndex,
   835  				poolIndex: er.poolIndex,
   836  			})
   837  		}
   838  
   839  		return
   840  	}()
   841  
   842  	validResp := 0
   843  	totalResp := 0
   844  
   845  	// minDisks value is only to reduce the number of calls
   846  	// to the disks; this value is not accurate because we do
   847  	// not know the storage class of the object yet
   848  	minDisks := 0
   849  	if p := globalStorageClass.GetParityForSC(""); p > -1 {
   850  		minDisks = er.setDriveCount - p
   851  	} else {
   852  		minDisks = er.setDriveCount - er.defaultParityCount
   853  	}
   854  
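        	// calcQuorum validates read quorum over the metadata collected so far and
        	// picks the latest valid FileInfo along with the disks and metadata that back it.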
   855  	calcQuorum := func(metaArr []FileInfo, errs []error) (FileInfo, []FileInfo, []StorageAPI, time.Time, string, error) {
   856  		readQuorum, _, err := objectQuorumFromMeta(ctx, metaArr, errs, er.defaultParityCount)
   857  		if err != nil {
   858  			return FileInfo{}, nil, nil, time.Time{}, "", err
   859  		}
   860  		if err := reduceReadQuorumErrs(ctx, errs, objectOpIgnoredErrs, readQuorum); err != nil {
   861  			return FileInfo{}, nil, nil, time.Time{}, "", err
   862  		}
   863  		onlineDisks, modTime, etag := listOnlineDisks(disks, metaArr, errs, readQuorum)
   864  		fi, err := pickValidFileInfo(ctx, metaArr, modTime, etag, readQuorum)
   865  		if err != nil {
   866  			return FileInfo{}, nil, nil, time.Time{}, "", err
   867  		}
   868  
   869  		onlineMeta := make([]FileInfo, len(metaArr))
   870  		for i, disk := range onlineDisks {
   871  			if disk != nil {
   872  				onlineMeta[i] = metaArr[i]
   873  			}
   874  		}
   875  
   876  		return fi, onlineMeta, onlineDisks, modTime, etag, nil
   877  	}
   878  
   879  	var (
   880  		modTime     time.Time
   881  		etag        string
   882  		fi          FileInfo
   883  		onlineMeta  []FileInfo
   884  		onlineDisks []StorageAPI
   885  		err         error
   886  	)
   887  
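        	// Drain disk responses as they arrive; quorum is attempted once all drives
        	// have responded, or earlier when FastGetObjInfo is set and at least minDisks
        	// drives have replied successfully.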
   888  	for success := range done {
   889  		totalResp++
   890  		if success {
   891  			validResp++
   892  		}
   893  		if totalResp < er.setDriveCount {
   894  			if !opts.FastGetObjInfo {
   895  				continue
   896  			}
   897  			if validResp < minDisks {
   898  				continue
   899  			}
   900  		}
   901  
   902  		rw.Lock()
   903  		if opts.VersionID == "" && totalResp == er.setDriveCount {
   904  			fi, onlineMeta, onlineDisks, modTime, etag, err = calcQuorum(pickLatestQuorumFilesInfo(ctx,
   905  				rawArr, errs, bucket, object, readData, opts.InclFreeVersions, true))
   906  		} else {
   907  			fi, onlineMeta, onlineDisks, modTime, etag, err = calcQuorum(metaArr, errs)
   908  		}
   909  		rw.Unlock()
   910  		if err == nil && fi.InlineData() {
   911  			break
   912  		}
   913  	}
   914  
   915  	if err != nil {
   916  		// We can only look for dangling objects if we received all the responses; if we
   917  		// did not, we simply skip the check, since we can't tell for sure if it's a dangling object.
   918  		if totalResp == er.setDriveCount && shouldCheckForDangling(err, errs, bucket) {
   919  			_, derr := er.deleteIfDangling(context.Background(), bucket, object, metaArr, errs, nil, opts)
   920  			if derr != nil {
   921  				err = derr
   922  			}
   923  		}
   924  		return fi, nil, nil, toObjectErr(err, bucket, object)
   925  	}
   926  
   927  	if !fi.Deleted && len(fi.Erasure.Distribution) != len(onlineDisks) {
   928  		err := fmt.Errorf("unexpected file distribution (%v) from online disks (%v), looks like backend disks have been manually modified refusing to heal %s/%s(%s)",
   929  			fi.Erasure.Distribution, onlineDisks, bucket, object, opts.VersionID)
   930  		logger.LogOnceIf(ctx, err, "get-object-file-info-manually-modified")
   931  		return fi, nil, nil, toObjectErr(err, bucket, object, opts.VersionID)
   932  	}
   933  
   934  	filterOnlineDisksInplace(fi, onlineMeta, onlineDisks)
   935  	for i := range onlineMeta {
   936  		// verify the metadata is valid: it has matching erasure info
   937  		// and a common modtime; if a modtime comparison is not possible,
   938  		// verify that it at least has a common "etag".
   939  		if onlineMeta[i].IsValid() && onlineMeta[i].Erasure.Equal(fi.Erasure) {
   940  			ok := onlineMeta[i].ModTime.Equal(modTime)
   941  			if modTime.IsZero() || modTime.Equal(timeSentinel) {
   942  				ok = etag != "" && etag == fi.Metadata["etag"]
   943  			}
   944  			if ok {
   945  				continue
   946  			}
   947  		} // in all other cases metadata is corrupt, do not read from it.
   948  
   949  		onlineMeta[i] = FileInfo{}
   950  		onlineDisks[i] = nil
   951  	}
   952  
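        	// Hand a copy of the selected FileInfo to the background goroutine, which
        	// decides whether to queue an MRF heal for missing or corrupted copies.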
   953  	select {
   954  	case mrfCheck <- fi.ShallowCopy():
   955  	case <-ctx.Done():
   956  		return fi, onlineMeta, onlineDisks, toObjectErr(ctx.Err(), bucket, object)
   957  	}
   958  
   959  	return fi, onlineMeta, onlineDisks, nil
   960  }
   961  
   962  // getObjectInfo - wrapper for reading object metadata and constructs ObjectInfo.
   963  func (er erasureObjects) getObjectInfo(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) {
   964  	fi, _, _, err := er.getObjectFileInfo(ctx, bucket, object, opts, false)
   965  	if err != nil {
   966  		return objInfo, toObjectErr(err, bucket, object)
   967  	}
   968  	objInfo = fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended)
   969  	if fi.Deleted {
   970  		if opts.VersionID == "" || opts.DeleteMarker {
   971  			return objInfo, toObjectErr(errFileNotFound, bucket, object)
   972  		}
   973  		// Make sure to return object info to provide extra information.
   974  		return objInfo, toObjectErr(errMethodNotAllowed, bucket, object)
   975  	}
   976  
   977  	return objInfo, nil
   978  }
   979  
   980  // getObjectInfoAndQuorum - wrapper for reading object metadata and constructs ObjectInfo, additionally returns write quorum for the object.
   981  func (er erasureObjects) getObjectInfoAndQuorum(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, wquorum int, err error) {
   982  	fi, _, _, err := er.getObjectFileInfo(ctx, bucket, object, opts, false)
   983  	if err != nil {
   984  		return objInfo, er.defaultWQuorum(), toObjectErr(err, bucket, object)
   985  	}
   986  
   987  	wquorum = fi.WriteQuorum(er.defaultWQuorum())
   988  
   989  	objInfo = fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended)
   990  	if !fi.VersionPurgeStatus().Empty() && opts.VersionID != "" {
   991  		// Make sure to return object info to provide extra information.
   992  		return objInfo, wquorum, toObjectErr(errMethodNotAllowed, bucket, object)
   993  	}
   994  
   995  	if fi.Deleted {
   996  		if opts.VersionID == "" || opts.DeleteMarker {
   997  			return objInfo, wquorum, toObjectErr(errFileNotFound, bucket, object)
   998  		}
   999  		// Make sure to return object info to provide extra information.
  1000  		return objInfo, wquorum, toObjectErr(errMethodNotAllowed, bucket, object)
  1001  	}
  1002  
  1003  	return objInfo, wquorum, nil
  1004  }
  1005  
  1006  // Similar to rename but renames data from srcEntry to dstEntry at dataDir
  1007  func renameData(ctx context.Context, disks []StorageAPI, srcBucket, srcEntry string, metadata []FileInfo, dstBucket, dstEntry string, writeQuorum int) ([]StorageAPI, bool, error) {
  1008  	g := errgroup.WithNErrs(len(disks))
  1009  
  1010  	fvID := mustGetUUID()
  1011  	for index := range disks {
  1012  		metadata[index].SetTierFreeVersionID(fvID)
  1013  	}
  1014  
  1015  	diskVersions := make([]uint64, len(disks))
  1016  	// Rename file on all underlying storage disks.
  1017  	for index := range disks {
  1018  		index := index
  1019  		g.Go(func() error {
  1020  			if disks[index] == nil {
  1021  				return errDiskNotFound
  1022  			}
  1023  
  1024  			// Pick one FileInfo for a disk at index.
  1025  			fi := metadata[index]
  1026  			// Assign the erasure index if it is not already initialized.
  1027  			if fi.Erasure.Index == 0 {
  1028  				fi.Erasure.Index = index + 1
  1029  			}
  1030  
  1031  			if !fi.IsValid() {
  1032  				return errFileCorrupt
  1033  			}
  1034  			sign, err := disks[index].RenameData(ctx, srcBucket, srcEntry, fi, dstBucket, dstEntry, RenameOptions{})
  1035  			if err != nil {
  1036  				return err
  1037  			}
  1038  			diskVersions[index] = sign
  1039  			return nil
  1040  		}, index)
  1041  	}
  1042  
  1043  	// Wait for all renames to finish.
  1044  	errs := g.Wait()
  1045  
  1046  	var versionsDisparity bool
  1047  
  1048  	err := reduceWriteQuorumErrs(ctx, errs, objectOpIgnoredErrs, writeQuorum)
  1049  	if err != nil {
  1050  		dg := errgroup.WithNErrs(len(disks))
  1051  		for index, nerr := range errs {
  1052  			if nerr != nil {
  1053  				continue
  1054  			}
  1055  			index := index
  1056  			// Since we are going to return an error, attempt to undo the
  1057  			// successful writes on some of the drives; if we cannot, there is
  1058  			// no need to notify the caller, this dangling object will be
  1059  			// scheduled for removal via active healing.
  1060  			dg.Go(func() error {
  1061  				return disks[index].DeleteVersion(context.Background(), dstBucket, dstEntry, metadata[index], false, DeleteOptions{UndoWrite: true})
  1062  			}, index)
  1063  		}
  1064  		dg.Wait()
  1065  	}
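        	// On success, compare the per-drive version signatures against the quorum
        	// value; any mismatch marks a versions disparity so that the whole object
        	// is healed later.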
  1066  	if err == nil {
  1067  		versions := reduceCommonVersions(diskVersions, writeQuorum)
  1068  		for index, dversions := range diskVersions {
  1069  			if errs[index] != nil {
  1070  				continue
  1071  			}
  1072  			if versions != dversions {
  1073  				versionsDisparity = true
  1074  				break
  1075  			}
  1076  		}
  1077  	}
  1078  
  1079  	// We can safely tolerate RenameData errors on up to len(er.getDisks()) - writeQuorum drives;
  1080  	// otherwise return failure.
  1081  	return evalDisks(disks, errs), versionsDisparity, err
  1082  }
  1083  
  1084  func (er erasureObjects) putMetacacheObject(ctx context.Context, key string, r *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) {
  1085  	data := r.Reader
  1086  
  1087  	// No metadata is set, allocate a new one.
  1088  	if opts.UserDefined == nil {
  1089  		opts.UserDefined = make(map[string]string)
  1090  	}
  1091  
  1092  	storageDisks := er.getDisks()
  1093  	// Get parity and data drive count based on storage class metadata
  1094  	parityDrives := globalStorageClass.GetParityForSC(opts.UserDefined[xhttp.AmzStorageClass])
  1095  	if parityDrives < 0 {
  1096  		parityDrives = er.defaultParityCount
  1097  	}
  1098  	dataDrives := len(storageDisks) - parityDrives
  1099  
  1100  	// we now know the number of blocks this object needs for data and parity.
  1101  	// writeQuorum is dataBlocks, plus one when data and parity counts are equal.
  1102  	writeQuorum := dataDrives
  1103  	if dataDrives == parityDrives {
  1104  		writeQuorum++
  1105  	}
  1106  
  1107  	// Validate the input data size; it can never be less than -1.
  1108  	if data.Size() < -1 {
  1109  		logger.LogIf(ctx, errInvalidArgument, logger.ErrorKind)
  1110  		return ObjectInfo{}, toObjectErr(errInvalidArgument)
  1111  	}
  1112  
  1113  	// Initialize parts metadata
  1114  	partsMetadata := make([]FileInfo, len(storageDisks))
  1115  
  1116  	fi := newFileInfo(pathJoin(minioMetaBucket, key), dataDrives, parityDrives)
  1117  	fi.DataDir = mustGetUUID()
  1118  
  1119  	// Initialize erasure metadata.
  1120  	for index := range partsMetadata {
  1121  		partsMetadata[index] = fi
  1122  	}
  1123  
  1124  	// Order disks according to erasure distribution
  1125  	var onlineDisks []StorageAPI
  1126  	onlineDisks, partsMetadata = shuffleDisksAndPartsMetadata(storageDisks, partsMetadata, fi)
  1127  
  1128  	erasure, err := NewErasure(ctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize)
  1129  	if err != nil {
  1130  		return ObjectInfo{}, toObjectErr(err, minioMetaBucket, key)
  1131  	}
  1132  
  1133  	// Fetch a buffer for I/O; reuse one from the pool when possible, otherwise allocate a new one.
  1134  	var buffer []byte
  1135  	switch size := data.Size(); {
  1136  	case size == 0:
  1137  		buffer = make([]byte, 1) // Allocate at least a byte to reach EOF
  1138  	case size >= fi.Erasure.BlockSize:
  1139  		buffer = globalBytePoolCap.Get()
  1140  		defer globalBytePoolCap.Put(buffer)
  1141  	case size < fi.Erasure.BlockSize:
  1142  		// No need to allocate a full blockSizeV1 buffer if the incoming data is smaller.
  1143  		buffer = make([]byte, size, 2*size+int64(fi.Erasure.ParityBlocks+fi.Erasure.DataBlocks-1))
  1144  	}
  1145  
  1146  	if len(buffer) > int(fi.Erasure.BlockSize) {
  1147  		buffer = buffer[:fi.Erasure.BlockSize]
  1148  	}
  1149  
  1150  	shardFileSize := erasure.ShardFileSize(data.Size())
  1151  	writers := make([]io.Writer, len(onlineDisks))
  1152  	inlineBuffers := make([]*bytes.Buffer, len(onlineDisks))
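        	// Metacache objects are always written inline into xl.meta; each online
        	// drive gets an in-memory buffer rather than an on-disk part file.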
  1153  	for i, disk := range onlineDisks {
  1154  		if disk == nil {
  1155  			continue
  1156  		}
  1157  		if disk.IsOnline() {
  1158  			inlineBuffers[i] = bytes.NewBuffer(make([]byte, 0, shardFileSize))
  1159  			writers[i] = newStreamingBitrotWriterBuffer(inlineBuffers[i], DefaultBitrotAlgorithm, erasure.ShardSize())
  1160  		}
  1161  	}
  1162  
  1163  	n, erasureErr := erasure.Encode(ctx, data, writers, buffer, writeQuorum)
  1164  	closeBitrotWriters(writers)
  1165  	if erasureErr != nil {
  1166  		return ObjectInfo{}, toObjectErr(erasureErr, minioMetaBucket, key)
  1167  	}
  1168  
  1169  	// Should return IncompleteBody{} error when reader has fewer bytes
  1170  	// than specified in request header.
  1171  	if n < data.Size() {
  1172  		return ObjectInfo{}, IncompleteBody{Bucket: minioMetaBucket, Object: key}
  1173  	}
  1174  	var index []byte
  1175  	if opts.IndexCB != nil {
  1176  		index = opts.IndexCB()
  1177  	}
  1178  
  1179  	modTime := UTCNow()
  1180  
  1181  	for i, w := range writers {
  1182  		if w == nil {
  1183  			// Make sure to avoid writing to disks which we couldn't complete in erasure.Encode()
  1184  			onlineDisks[i] = nil
  1185  			continue
  1186  		}
  1187  		partsMetadata[i].Data = inlineBuffers[i].Bytes()
  1188  		partsMetadata[i].AddObjectPart(1, "", n, data.ActualSize(), modTime, index, nil)
  1189  	}
  1190  
  1191  	// Fill all the necessary metadata.
  1192  	// Update `xl.meta` content on each disk.
  1193  	for index := range partsMetadata {
  1194  		partsMetadata[index].Size = n
  1195  		partsMetadata[index].Fresh = true
  1196  		partsMetadata[index].ModTime = modTime
  1197  		partsMetadata[index].Metadata = opts.UserDefined
  1198  	}
  1199  
  1200  	// Set an additional header when data is inlined.
  1201  	for index := range partsMetadata {
  1202  		partsMetadata[index].SetInlineData()
  1203  	}
  1204  
  1205  	for i := 0; i < len(onlineDisks); i++ {
  1206  		if onlineDisks[i] != nil && onlineDisks[i].IsOnline() {
  1207  			// Object info is the same in all disks, so we can pick
  1208  			// the first meta from online disk
  1209  			fi = partsMetadata[i]
  1210  			break
  1211  		}
  1212  	}
  1213  
  1214  	if _, err = writeUniqueFileInfo(ctx, onlineDisks, "", minioMetaBucket, key, partsMetadata, writeQuorum); err != nil {
  1215  		return ObjectInfo{}, toObjectErr(err, minioMetaBucket, key)
  1216  	}
  1217  
  1218  	return fi.ToObjectInfo(minioMetaBucket, key, opts.Versioned || opts.VersionSuspended), nil
  1219  }
  1220  
  1221  // PutObject - creates an object upon reading from the input stream
  1222  // until EOF, erasure codes the data across all disk and additionally
  1223  // writes `xl.meta` which carries the necessary metadata for future
  1224  // object operations.
  1225  func (er erasureObjects) PutObject(ctx context.Context, bucket string, object string, data *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) {
  1226  	return er.putObject(ctx, bucket, object, data, opts)
  1227  }
  1228  
  1229  // Heal up to two versions of one object when there is disparity between disks
  1230  func healObjectVersionsDisparity(bucket string, entry metaCacheEntry, scanMode madmin.HealScanMode) error {
  1231  	if entry.isDir() {
  1232  		return nil
  1233  	}
  1234  	// We might land at .metacache, .trash, .multipart;
  1235  	// no need to heal them, skip them, but only when the
  1236  	// bucket is '.minio.sys'
  1237  	if bucket == minioMetaBucket {
  1238  		if wildcard.Match("buckets/*/.metacache/*", entry.name) {
  1239  			return nil
  1240  		}
  1241  		if wildcard.Match("tmp/*", entry.name) {
  1242  			return nil
  1243  		}
  1244  		if wildcard.Match("multipart/*", entry.name) {
  1245  			return nil
  1246  		}
  1247  		if wildcard.Match("tmp-old/*", entry.name) {
  1248  			return nil
  1249  		}
  1250  	}
  1251  
  1252  	fivs, err := entry.fileInfoVersions(bucket)
  1253  	if err != nil {
  1254  		healObject(bucket, entry.name, "", madmin.HealDeepScan)
  1255  		return err
  1256  	}
  1257  
  1258  	if len(fivs.Versions) <= 2 {
  1259  		for _, version := range fivs.Versions {
  1260  			healObject(bucket, entry.name, version.VersionID, scanMode)
  1261  		}
  1262  	}
  1263  
  1264  	return nil
  1265  }
  1266  
  1267  // putObject wrapper for erasureObjects PutObject
  1268  func (er erasureObjects) putObject(ctx context.Context, bucket string, object string, r *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) {
  1269  	if !opts.NoAuditLog {
  1270  		auditObjectErasureSet(ctx, object, &er)
  1271  	}
  1272  
  1273  	data := r.Reader
  1274  
  1275  	if opts.CheckPrecondFn != nil {
  1276  		if !opts.NoLock {
  1277  			ns := er.NewNSLock(bucket, object)
  1278  			lkctx, err := ns.GetLock(ctx, globalOperationTimeout)
  1279  			if err != nil {
  1280  				return ObjectInfo{}, err
  1281  			}
  1282  			ctx = lkctx.Context()
  1283  			defer ns.Unlock(lkctx)
  1284  			opts.NoLock = true
  1285  		}
  1286  
  1287  		obj, err := er.getObjectInfo(ctx, bucket, object, opts)
  1288  		if err == nil && opts.CheckPrecondFn(obj) {
  1289  			return objInfo, PreConditionFailed{}
  1290  		}
  1291  		if err != nil && !isErrVersionNotFound(err) && !isErrObjectNotFound(err) && !isErrReadQuorum(err) {
  1292  			return objInfo, err
  1293  		}
  1294  	}
  1295  
  1296  	// Validate the input data size; it can never be less than -1.
  1297  	if data.Size() < -1 {
  1298  		logger.LogIf(ctx, errInvalidArgument, logger.ErrorKind)
  1299  		return ObjectInfo{}, toObjectErr(errInvalidArgument)
  1300  	}
  1301  
  1302  	userDefined := cloneMSS(opts.UserDefined)
  1303  
  1304  	storageDisks := er.getDisks()
  1305  
  1306  	// Get parity and data drive count based on storage class metadata
  1307  	parityDrives := globalStorageClass.GetParityForSC(userDefined[xhttp.AmzStorageClass])
  1308  	if parityDrives < 0 {
  1309  		parityDrives = er.defaultParityCount
  1310  	}
  1311  	if opts.MaxParity {
  1312  		parityDrives = len(storageDisks) / 2
  1313  	}
  1314  	if !opts.MaxParity && globalStorageClass.AvailabilityOptimized() {
  1315  		// If we have offline disks, upgrade the parity count for this object.
  1316  		parityOrig := parityDrives
  1317  
  1318  		var offlineDrives int
  1319  		for _, disk := range storageDisks {
  1320  			if disk == nil || !disk.IsOnline() {
  1321  				parityDrives++
  1322  				offlineDrives++
  1323  				continue
  1324  			}
  1325  		}
  1326  
  1327  		if offlineDrives >= (len(storageDisks)+1)/2 {
  1328  			// if half or more of the drives are offline
  1329  			// we have no quorum; we shouldn't proceed, just
  1330  			// fail at that point.
  1331  			return ObjectInfo{}, toObjectErr(errErasureWriteQuorum, bucket, object)
  1332  		}
  1333  
  1334  		if parityDrives >= len(storageDisks)/2 {
  1335  			parityDrives = len(storageDisks) / 2
  1336  		}
  1337  
  1338  		if parityOrig != parityDrives {
  1339  			userDefined[minIOErasureUpgraded] = strconv.Itoa(parityOrig) + "->" + strconv.Itoa(parityDrives)
  1340  		}
  1341  	}
  1342  	dataDrives := len(storageDisks) - parityDrives
  1343  
  1344  	// we now know the number of blocks this object needs for data and parity.
  1345  	// writeQuorum is dataBlocks, plus one when data and parity counts are equal.
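        	// For example, with 12 drives and 4 parity drives this gives dataDrives=8 and
        	// writeQuorum=8; with 6 parity drives (data == parity) writeQuorum becomes 7.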
  1346  	writeQuorum := dataDrives
  1347  	if dataDrives == parityDrives {
  1348  		writeQuorum++
  1349  	}
  1350  
  1351  	// Initialize parts metadata
  1352  	partsMetadata := make([]FileInfo, len(storageDisks))
  1353  
  1354  	fi := newFileInfo(pathJoin(bucket, object), dataDrives, parityDrives)
  1355  	fi.VersionID = opts.VersionID
  1356  	if opts.Versioned && fi.VersionID == "" {
  1357  		fi.VersionID = mustGetUUID()
  1358  	}
  1359  
  1360  	fi.DataDir = mustGetUUID()
  1361  	fi.Checksum = opts.WantChecksum.AppendTo(nil, nil)
  1362  	if opts.EncryptFn != nil {
  1363  		fi.Checksum = opts.EncryptFn("object-checksum", fi.Checksum)
  1364  	}
  1365  	uniqueID := mustGetUUID()
  1366  	tempObj := uniqueID
  1367  
  1368  	// Initialize erasure metadata.
  1369  	for index := range partsMetadata {
  1370  		partsMetadata[index] = fi
  1371  	}
  1372  
  1373  	// Order disks according to erasure distribution
  1374  	var onlineDisks []StorageAPI
  1375  	onlineDisks, partsMetadata = shuffleDisksAndPartsMetadata(storageDisks, partsMetadata, fi)
  1376  
  1377  	erasure, err := NewErasure(ctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize)
  1378  	if err != nil {
  1379  		return ObjectInfo{}, toObjectErr(err, bucket, object)
  1380  	}
  1381  
  1382  	// Fetch a buffer for I/O; reuse one from the pool when possible, otherwise allocate a new one.
  1383  	var buffer []byte
  1384  	switch size := data.Size(); {
  1385  	case size == 0:
  1386  		buffer = make([]byte, 1) // Allocate at least a byte to reach EOF
  1387  	case size >= fi.Erasure.BlockSize || size == -1:
  1388  		buffer = globalBytePoolCap.Get()
  1389  		defer globalBytePoolCap.Put(buffer)
  1390  	case size < fi.Erasure.BlockSize:
  1391  		// No need to allocate a full blockSizeV1 buffer if the incoming data is smaller.
  1392  		buffer = make([]byte, size, 2*size+int64(fi.Erasure.ParityBlocks+fi.Erasure.DataBlocks-1))
  1393  	}
  1394  
  1395  	if len(buffer) > int(fi.Erasure.BlockSize) {
  1396  		buffer = buffer[:fi.Erasure.BlockSize]
  1397  	}
  1398  
  1399  	partName := "part.1"
  1400  	tempErasureObj := pathJoin(uniqueID, fi.DataDir, partName)
  1401  
  1402  	defer er.deleteAll(context.Background(), minioMetaTmpBucket, tempObj)
  1403  
  1404  	shardFileSize := erasure.ShardFileSize(data.Size())
  1405  	inlineBlock := globalStorageClass.InlineBlock()
  1406  	if inlineBlock <= 0 {
  1407  		inlineBlock = 128 * humanize.KiByte
  1408  	}
  1409  
  1410  	writers := make([]io.Writer, len(onlineDisks))
  1411  	var inlineBuffers []*bytes.Buffer
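        	// Inline the object data inside xl.meta only when the per-drive shard is
        	// small: below the configured inline block for unversioned objects, or
        	// below 1/8th of it otherwise.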
  1412  	if shardFileSize >= 0 {
  1413  		if !opts.Versioned && shardFileSize < inlineBlock {
  1414  			inlineBuffers = make([]*bytes.Buffer, len(onlineDisks))
  1415  		} else if shardFileSize < inlineBlock/8 {
  1416  			inlineBuffers = make([]*bytes.Buffer, len(onlineDisks))
  1417  		}
  1418  	} else {
  1419  		// If compressed, use actual size to determine.
  1420  		if sz := erasure.ShardFileSize(data.ActualSize()); sz > 0 {
  1421  			if !opts.Versioned && sz < inlineBlock {
  1422  				inlineBuffers = make([]*bytes.Buffer, len(onlineDisks))
  1423  			} else if sz < inlineBlock/8 {
  1424  				inlineBuffers = make([]*bytes.Buffer, len(onlineDisks))
  1425  			}
  1426  		}
  1427  	}
  1428  	for i, disk := range onlineDisks {
  1429  		if disk == nil {
  1430  			continue
  1431  		}
  1432  
  1433  		if !disk.IsOnline() {
  1434  			continue
  1435  		}
  1436  
  1437  		if len(inlineBuffers) > 0 {
  1438  			sz := shardFileSize
  1439  			if sz < 0 {
  1440  				sz = data.ActualSize()
  1441  			}
  1442  			inlineBuffers[i] = bytes.NewBuffer(make([]byte, 0, sz))
  1443  			writers[i] = newStreamingBitrotWriterBuffer(inlineBuffers[i], DefaultBitrotAlgorithm, erasure.ShardSize())
  1444  			continue
  1445  		}
  1446  
  1447  		writers[i] = newBitrotWriter(disk, bucket, minioMetaTmpBucket, tempErasureObj, shardFileSize, DefaultBitrotAlgorithm, erasure.ShardSize())
  1448  	}
  1449  
  1450  	toEncode := io.Reader(data)
  1451  	if data.Size() > bigFileThreshold {
  1452  		// We use 2 buffers, so we always have a full buffer of input.
  1453  		bufA := globalBytePoolCap.Get()
  1454  		bufB := globalBytePoolCap.Get()
  1455  		defer globalBytePoolCap.Put(bufA)
  1456  		defer globalBytePoolCap.Put(bufB)
  1457  		ra, err := readahead.NewReaderBuffer(data, [][]byte{bufA[:fi.Erasure.BlockSize], bufB[:fi.Erasure.BlockSize]})
  1458  		if err == nil {
  1459  			toEncode = ra
  1460  			defer ra.Close()
  1461  		}
  1462  		logger.LogIf(ctx, err)
  1463  	}
  1464  	n, erasureErr := erasure.Encode(ctx, toEncode, writers, buffer, writeQuorum)
  1465  	closeBitrotWriters(writers)
  1466  	if erasureErr != nil {
  1467  		return ObjectInfo{}, toObjectErr(erasureErr, bucket, object)
  1468  	}
  1469  
  1470  	// Should return IncompleteBody{} error when reader has fewer bytes
  1471  	// than specified in request header.
  1472  	if n < data.Size() {
  1473  		return ObjectInfo{}, IncompleteBody{Bucket: bucket, Object: object}
  1474  	}
  1475  
  1476  	var compIndex []byte
  1477  	if opts.IndexCB != nil {
  1478  		compIndex = opts.IndexCB()
  1479  	}
  1480  	if !opts.NoLock {
  1481  		lk := er.NewNSLock(bucket, object)
  1482  		lkctx, err := lk.GetLock(ctx, globalOperationTimeout)
  1483  		if err != nil {
  1484  			return ObjectInfo{}, err
  1485  		}
  1486  		ctx = lkctx.Context()
  1487  		defer lk.Unlock(lkctx)
  1488  	}
  1489  
  1490  	modTime := opts.MTime
  1491  	if opts.MTime.IsZero() {
  1492  		modTime = UTCNow()
  1493  	}
  1494  
  1495  	for i, w := range writers {
  1496  		if w == nil {
  1497  			onlineDisks[i] = nil
  1498  			continue
  1499  		}
  1500  		if len(inlineBuffers) > 0 && inlineBuffers[i] != nil {
  1501  			partsMetadata[i].Data = inlineBuffers[i].Bytes()
  1502  		} else {
  1503  			partsMetadata[i].Data = nil
  1504  		}
  1505  		// No need to add checksum to part. We already have it on the object.
  1506  		partsMetadata[i].AddObjectPart(1, "", n, data.ActualSize(), modTime, compIndex, nil)
  1507  		partsMetadata[i].Versioned = opts.Versioned || opts.VersionSuspended
  1508  	}
  1509  
  1510  	userDefined["etag"] = r.MD5CurrentHexString()
  1511  	kind, _ := crypto.IsEncrypted(userDefined)
  1512  	if opts.PreserveETag != "" {
  1513  		if !opts.ReplicationRequest {
  1514  			userDefined["etag"] = opts.PreserveETag
  1515  		} else if kind != crypto.S3 {
  1516  			// If this is a replication request and SSE-S3
  1517  			// is specified, do not preserve the incoming
  1518  			// etag.
  1519  			userDefined["etag"] = opts.PreserveETag
  1520  		}
  1521  	}
  1522  
  1523  	// Guess content-type from the extension if possible.
  1524  	if userDefined["content-type"] == "" {
  1525  		userDefined["content-type"] = mimedb.TypeByExtension(path.Ext(object))
  1526  	}
  1527  
  1528  	// If the storage class is STANDARD, there is no need to save it as part of the metadata.
  1529  	if userDefined[xhttp.AmzStorageClass] == storageclass.STANDARD {
  1530  		delete(userDefined, xhttp.AmzStorageClass)
  1531  	}
  1532  
  1533  	// Fill all the necessary metadata.
  1534  	// Update `xl.meta` content on each disk.
  1535  	for index := range partsMetadata {
  1536  		partsMetadata[index].Metadata = userDefined
  1537  		partsMetadata[index].Size = n
  1538  		partsMetadata[index].ModTime = modTime
  1539  		if len(inlineBuffers) > 0 {
  1540  			partsMetadata[index].SetInlineData()
  1541  		}
  1542  		if opts.DataMovement {
  1543  			partsMetadata[index].SetDataMov()
  1544  		}
  1545  	}
  1546  
  1547  	// Rename the successfully written temporary object to final location.
  1548  	onlineDisks, versionsDisparity, err := renameData(ctx, onlineDisks, minioMetaTmpBucket, tempObj, partsMetadata, bucket, object, writeQuorum)
  1549  	if err != nil {
  1550  		if errors.Is(err, errFileNotFound) {
  1551  			return ObjectInfo{}, toObjectErr(errErasureWriteQuorum, bucket, object)
  1552  		}
  1553  		return ObjectInfo{}, toObjectErr(err, bucket, object)
  1554  	}
  1555  
  1556  	for i := 0; i < len(onlineDisks); i++ {
  1557  		if onlineDisks[i] != nil && onlineDisks[i].IsOnline() {
  1558  			// Object info is the same on all disks, so we can pick
  1559  			// the first metadata from an online disk.
  1560  			fi = partsMetadata[i]
  1561  			break
  1562  		}
  1563  	}
  1564  
  1565  	// For speedtest objects do not attempt to heal them.
  1566  	if !opts.Speedtest {
  1567  		// When there is a versions disparity we heal the content
  1568  		// implicitly for all versions, so we can avoid triggering
  1569  		// another MRF heal for offline drives.
  1570  		if !versionsDisparity {
  1571  			// Whether a disk was initially or becomes offline
  1572  			// during this upload, send it to the MRF list.
  1573  			for i := 0; i < len(onlineDisks); i++ {
  1574  				if onlineDisks[i] != nil && onlineDisks[i].IsOnline() {
  1575  					continue
  1576  				}
  1577  
  1578  				er.addPartial(bucket, object, fi.VersionID)
  1579  				break
  1580  			}
  1581  		} else {
  1582  			globalMRFState.addPartialOp(partialOperation{
  1583  				bucket:      bucket,
  1584  				object:      object,
  1585  				queued:      time.Now(),
  1586  				allVersions: true,
  1587  				setIndex:    er.setIndex,
  1588  				poolIndex:   er.poolIndex,
  1589  			})
  1590  		}
  1591  	}
  1592  
  1593  	fi.ReplicationState = opts.PutReplicationState()
  1594  
  1595  	// we are adding a new version to this object under the namespace lock, so this is the latest version.
  1596  	fi.IsLatest = true
  1597  
  1598  	return fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended), nil
  1599  }
  1600  
  1601  func (er erasureObjects) deleteObjectVersion(ctx context.Context, bucket, object string, fi FileInfo, forceDelMarker bool) error {
  1602  	disks := er.getDisks()
  1603  	// Assume (N/2 + 1) quorum for Delete().
  1604  	// This is a theoretical assumption such that
  1605  	// for deletes we do not need to honor the storage
  1606  	// class of objects that have reduced quorum
  1607  	// due to storage class - this only needs to be honored
  1608  	// for Read() requests, which we already do.
  1609  	writeQuorum := len(disks)/2 + 1
  1610  
  1611  	g := errgroup.WithNErrs(len(disks))
  1612  	for index := range disks {
  1613  		index := index
  1614  		g.Go(func() error {
  1615  			if disks[index] == nil {
  1616  				return errDiskNotFound
  1617  			}
  1618  			return disks[index].DeleteVersion(ctx, bucket, object, fi, forceDelMarker, DeleteOptions{})
  1619  		}, index)
  1620  	}
  1621  	// return errors if any during deletion
  1622  	return reduceWriteQuorumErrs(ctx, g.Wait(), objectOpIgnoredErrs, writeQuorum)
  1623  }
  1624  
  1625  // DeleteObjects deletes objects/versions in bulk. This function will still automatically split the objects list
  1626  // into smaller batches if some object names are found to be duplicated in the delete list; splitting
  1627  // into smaller batches avoids holding the write lock on the duplicated object names twice.
  1628  func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objects []ObjectToDelete, opts ObjectOptions) ([]DeletedObject, []error) {
  1629  	if !opts.NoAuditLog {
  1630  		for _, obj := range objects {
  1631  			auditObjectErasureSet(ctx, obj.ObjectV.ObjectName, &er)
  1632  		}
  1633  	}
  1634  
  1635  	errs := make([]error, len(objects))
  1636  	dobjects := make([]DeletedObject, len(objects))
  1637  	writeQuorums := make([]int, len(objects))
  1638  
  1639  	storageDisks := er.getDisks()
  1640  
  1641  	for i := range objects {
  1642  		// Assume (N/2 + 1) quorums for all objects.
  1643  		// This is a theoretical assumption such that
  1644  		// for deletes we do not need to honor the storage
  1645  		// class of objects which have reduced quorum;
  1646  		// the storage class only needs to be honored for
  1647  		// Read() requests, which we already do.
  1648  		writeQuorums[i] = len(storageDisks)/2 + 1
  1649  	}
  1650  
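        	// Collect the versions to delete, grouped by object name, so that
        	// duplicate names in the request are handled with a single
        	// DeleteVersions call per disk.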
  1651  	versionsMap := make(map[string]FileInfoVersions, len(objects))
  1652  	for i := range objects {
  1653  		// Construct the FileInfo data that needs to be preserved on the disk.
  1654  		vr := FileInfo{
  1655  			Name:             objects[i].ObjectName,
  1656  			VersionID:        objects[i].VersionID,
  1657  			ReplicationState: objects[i].ReplicationState(),
  1658  			// save the index to set correct error at this index.
  1659  			Idx: i,
  1660  		}
  1661  		vr.SetTierFreeVersionID(mustGetUUID())
  1662  		// If VersionID is not set, the delete is not specific to
  1663  		// any version; check whether the bucket is versioned or not.
  1664  		if objects[i].VersionID == "" {
  1665  			// MinIO extension to bucket version configuration
  1666  			suspended := opts.VersionSuspended
  1667  			versioned := opts.Versioned
  1668  			if opts.PrefixEnabledFn != nil {
  1669  				versioned = opts.PrefixEnabledFn(objects[i].ObjectName)
  1670  			}
  1671  			if versioned || suspended {
  1672  				// Bucket is versioned and no version was explicitly
  1673  				// mentioned for deletes, create a delete marker instead.
  1674  				vr.ModTime = UTCNow()
  1675  				vr.Deleted = true
  1676  				// Versioning suspended means that we add a `null` version
  1677  				// delete marker; otherwise add a new version for this
  1678  				// delete marker.
  1679  				if versioned {
  1680  					vr.VersionID = mustGetUUID()
  1681  				}
  1682  			}
  1683  		}
  1684  		// De-dup the same object name to collect multiple versions of the same object.
  1685  		v, ok := versionsMap[objects[i].ObjectName]
  1686  		if ok {
  1687  			v.Versions = append(v.Versions, vr)
  1688  		} else {
  1689  			v = FileInfoVersions{
  1690  				Name:     vr.Name,
  1691  				Versions: []FileInfo{vr},
  1692  			}
  1693  		}
  1694  		if vr.Deleted {
  1695  			dobjects[i] = DeletedObject{
  1696  				DeleteMarker:          vr.Deleted,
  1697  				DeleteMarkerVersionID: vr.VersionID,
  1698  				DeleteMarkerMTime:     DeleteMarkerMTime{vr.ModTime},
  1699  				ObjectName:            vr.Name,
  1700  				ReplicationState:      vr.ReplicationState,
  1701  			}
  1702  		} else {
  1703  			dobjects[i] = DeletedObject{
  1704  				ObjectName:       vr.Name,
  1705  				VersionID:        vr.VersionID,
  1706  				ReplicationState: vr.ReplicationState,
  1707  			}
  1708  		}
  1709  		versionsMap[objects[i].ObjectName] = v
  1710  	}
  1711  
  1712  	dedupVersions := make([]FileInfoVersions, 0, len(versionsMap))
  1713  	for _, version := range versionsMap {
  1714  		dedupVersions = append(dedupVersions, version)
  1715  	}
  1716  
  1717  	// Initialize list of errors.
  1718  	delObjErrs := make([][]error, len(storageDisks))
  1719  
  1720  	var wg sync.WaitGroup
  1721  	// Remove versions in bulk for each disk
  1722  	for index, disk := range storageDisks {
  1723  		wg.Add(1)
  1724  		go func(index int, disk StorageAPI) {
  1725  			defer wg.Done()
  1726  			delObjErrs[index] = make([]error, len(objects))
  1727  			if disk == nil {
  1728  				for i := range objects {
  1729  					delObjErrs[index][i] = errDiskNotFound
  1730  				}
  1731  				return
  1732  			}
  1733  			errs := disk.DeleteVersions(ctx, bucket, dedupVersions, DeleteOptions{})
  1734  			for i, err := range errs {
  1735  				if err == nil {
  1736  					continue
  1737  				}
  1738  				for _, v := range dedupVersions[i].Versions {
  1739  					if err == errFileNotFound || err == errFileVersionNotFound {
  1740  						if !dobjects[v.Idx].DeleteMarker {
  1741  							// Not a delete marker; if it is not found, that is OK.
  1742  							continue
  1743  						}
  1744  					}
  1745  					delObjErrs[index][v.Idx] = err
  1746  				}
  1747  			}
  1748  		}(index, disk)
  1749  	}
  1750  	wg.Wait()
  1751  
  1752  	// Reduce errors for each object
  1753  	for objIndex := range objects {
  1754  		diskErrs := make([]error, len(storageDisks))
  1755  		// Iterate over disks to fetch the error
  1756  		// from deleting the current object.
  1757  		for i := range delObjErrs {
  1758  			// delObjErrs[i] is not nil when disks[i] is also not nil
  1759  			if delObjErrs[i] != nil {
  1760  				diskErrs[i] = delObjErrs[i][objIndex]
  1761  			}
  1762  		}
  1763  		err := reduceWriteQuorumErrs(ctx, diskErrs, objectOpIgnoredErrs, writeQuorums[objIndex])
  1764  		if objects[objIndex].VersionID != "" {
  1765  			errs[objIndex] = toObjectErr(err, bucket, objects[objIndex].ObjectName, objects[objIndex].VersionID)
  1766  		} else {
  1767  			errs[objIndex] = toObjectErr(err, bucket, objects[objIndex].ObjectName)
  1768  		}
  1769  	}
  1770  
  1771  	// Check failed deletes across multiple objects
  1772  	for i, dobj := range dobjects {
  1773  		// This object errored, we should attempt a heal just in case.
  1774  		if errs[i] != nil && !isErrVersionNotFound(errs[i]) && !isErrObjectNotFound(errs[i]) {
  1775  			// For all other direct versionId references we should
  1776  			// ensure that no dangling file is left over.
  1777  			er.addPartial(bucket, dobj.ObjectName, dobj.VersionID)
  1778  			continue
  1779  		}
  1780  
  1781  		// Check if there is any offline disk and add it to the MRF list
  1782  		for _, disk := range storageDisks {
  1783  			if disk != nil && disk.IsOnline() {
  1784  				// Skip attempted heal on online disks.
  1785  				continue
  1786  			}
  1787  
  1788  			// A disk is offline; queue the object for MRF healing so
  1789  			// that no dangling file is left over.
  1790  			er.addPartial(bucket, dobj.ObjectName, dobj.VersionID)
  1791  			break
  1792  		}
  1793  	}
  1794  
  1795  	return dobjects, errs
  1796  }
  1797  
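        // deletePrefix removes the given prefix recursively and immediately on
        // every disk; missing disks are skipped and the first error from any
        // disk is returned.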
  1798  func (er erasureObjects) deletePrefix(ctx context.Context, bucket, prefix string) error {
  1799  	disks := er.getDisks()
  1800  	g := errgroup.WithNErrs(len(disks))
  1801  	for index := range disks {
  1802  		index := index
  1803  		g.Go(func() error {
  1804  			if disks[index] == nil {
  1805  				return nil
  1806  			}
  1807  			return disks[index].Delete(ctx, bucket, prefix, DeleteOptions{
  1808  				Recursive: true,
  1809  				Immediate: true,
  1810  			})
  1811  		}, index)
  1812  	}
  1813  	for _, err := range g.Wait() {
  1814  		if err != nil {
  1815  			return err
  1816  		}
  1817  	}
  1818  	return nil
  1819  }
  1820  
  1821  // DeleteObject - deletes an object. This call doesn't necessarily return
  1822  // an error, as it is not necessary for the handler to send a response
  1823  // back to the client request.
  1824  func (er erasureObjects) DeleteObject(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) {
  1825  	if !opts.NoAuditLog {
  1826  		auditObjectErasureSet(ctx, object, &er)
  1827  	}
  1828  
  1829  	var lc *lifecycle.Lifecycle
  1830  	var rcfg lock.Retention
  1831  	var replcfg *replication.Config
  1832  	if opts.Expiration.Expire {
  1833  		// Check if the current bucket has a configured lifecycle policy
  1834  		lc, _ = globalLifecycleSys.Get(bucket)
  1835  		rcfg, _ = globalBucketObjectLockSys.Get(bucket)
  1836  		replcfg, _ = getReplicationConfig(ctx, bucket)
  1837  	}
  1838  
  1839  	// expiration attempted on a bucket with no lifecycle
  1840  	// rules shall be rejected.
  1841  	if lc == nil && opts.Expiration.Expire {
  1842  		if opts.VersionID != "" {
  1843  			return objInfo, VersionNotFound{
  1844  				Bucket:    bucket,
  1845  				Object:    object,
  1846  				VersionID: opts.VersionID,
  1847  			}
  1848  		}
  1849  		return objInfo, ObjectNotFound{
  1850  			Bucket: bucket,
  1851  			Object: object,
  1852  		}
  1853  	}
  1854  
  1855  	if opts.DeletePrefix {
  1856  		if opts.Expiration.Expire {
  1857  			// Expire-all-versions expiration must still verify the state on disk
  1858  			// via a getObjectInfo() call as follows; on any read quorum issues we
  1859  			// must not proceed further for safety reasons. Attempt an MRF heal
  1860  			// when we see such quorum errors.
  1861  			goi, _, gerr := er.getObjectInfoAndQuorum(ctx, bucket, object, opts)
  1862  			if gerr != nil && goi.Name == "" {
  1863  				if _, ok := gerr.(InsufficientReadQuorum); ok {
  1864  					// Add an MRF heal for next time.
  1865  					er.addPartial(bucket, object, opts.VersionID)
  1866  
  1867  					return objInfo, InsufficientWriteQuorum{}
  1868  				}
  1869  				return objInfo, gerr
  1870  			}
  1871  
  1872  			// Add protection and re-verify the ILM rules for qualification
  1873  			// based on the latest objectInfo and see if the object still
  1874  			// qualifies for deletion.
  1875  			if gerr == nil {
  1876  				evt := evalActionFromLifecycle(ctx, *lc, rcfg, replcfg, goi)
  1877  				var isErr bool
  1878  				switch evt.Action {
  1879  				case lifecycle.NoneAction:
  1880  					isErr = true
  1881  				case lifecycle.TransitionAction, lifecycle.TransitionVersionAction:
  1882  					isErr = true
  1883  				}
  1884  				if isErr {
  1885  					if goi.VersionID != "" {
  1886  						return goi, VersionNotFound{
  1887  							Bucket:    bucket,
  1888  							Object:    object,
  1889  							VersionID: goi.VersionID,
  1890  						}
  1891  					}
  1892  					return goi, ObjectNotFound{
  1893  						Bucket: bucket,
  1894  						Object: object,
  1895  					}
  1896  				}
  1897  			}
  1898  		} // The delete marker and any latest version that qualifies shall be expired permanently.
  1899  
  1900  		return ObjectInfo{}, toObjectErr(er.deletePrefix(ctx, bucket, object), bucket, object)
  1901  	}
  1902  
  1903  	storageDisks := er.getDisks()
  1904  	versionFound := true
  1905  	objInfo = ObjectInfo{VersionID: opts.VersionID} // version id needed in Delete API response.
  1906  	goi, _, gerr := er.getObjectInfoAndQuorum(ctx, bucket, object, opts)
  1907  	if gerr != nil && goi.Name == "" {
  1908  		if _, ok := gerr.(InsufficientReadQuorum); ok {
  1909  			// Add an MRF heal for next time.
  1910  			er.addPartial(bucket, object, opts.VersionID)
  1911  
  1912  			return objInfo, InsufficientWriteQuorum{}
  1913  		}
  1914  		// For delete marker replication, versionID being replicated will not exist on disk
  1915  		if opts.DeleteMarker {
  1916  			versionFound = false
  1917  		} else {
  1918  			return objInfo, gerr
  1919  		}
  1920  	}
  1921  
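        	// If the caller supplied a metadata evaluation callback, let it
        	// inspect the current object info and decide whether replication
        	// state should be attached to this delete.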
  1922  	if opts.EvalMetadataFn != nil {
  1923  		dsc, err := opts.EvalMetadataFn(&goi, err)
  1924  		if err != nil {
  1925  			return ObjectInfo{}, err
  1926  		}
  1927  		if dsc.ReplicateAny() {
  1928  			opts.SetDeleteReplicationState(dsc, opts.VersionID)
  1929  			goi.replicationDecision = opts.DeleteReplication.ReplicateDecisionStr
  1930  		}
  1931  	}
  1932  
  1933  	if opts.EvalRetentionBypassFn != nil {
  1934  		if err := opts.EvalRetentionBypassFn(goi, gerr); err != nil {
  1935  			return ObjectInfo{}, err
  1936  		}
  1937  	}
  1938  
  1939  	if opts.Expiration.Expire {
  1940  		if gerr == nil {
  1941  			evt := evalActionFromLifecycle(ctx, *lc, rcfg, replcfg, goi)
  1942  			var isErr bool
  1943  			switch evt.Action {
  1944  			case lifecycle.NoneAction:
  1945  				isErr = true
  1946  			case lifecycle.TransitionAction, lifecycle.TransitionVersionAction:
  1947  				isErr = true
  1948  			}
  1949  			if isErr {
  1950  				if goi.VersionID != "" {
  1951  					return goi, VersionNotFound{
  1952  						Bucket:    bucket,
  1953  						Object:    object,
  1954  						VersionID: goi.VersionID,
  1955  					}
  1956  				}
  1957  				return goi, ObjectNotFound{
  1958  					Bucket: bucket,
  1959  					Object: object,
  1960  				}
  1961  			}
  1962  		}
  1963  	}
  1964  
  1965  	// Determine whether to mark the object as deleted for replication
  1966  	markDelete := goi.VersionID != ""
  1967  
  1968  	// Default deleteMarker to true if object is under versioning
  1969  	deleteMarker := opts.Versioned
  1970  
  1971  	if opts.VersionID != "" {
  1972  		// case where replica version needs to be deleted on target cluster
  1973  		if versionFound && opts.DeleteMarkerReplicationStatus() == replication.Replica {
  1974  			markDelete = false
  1975  		}
  1976  		if opts.VersionPurgeStatus().Empty() && opts.DeleteMarkerReplicationStatus().Empty() {
  1977  			markDelete = false
  1978  		}
  1979  		if opts.VersionPurgeStatus() == Complete {
  1980  			markDelete = false
  1981  		}
  1982  		// Now, since VersionPurgeStatus() is already set, we can let the
  1983  		// lower layers decide this. This fixes a regression that was introduced
  1984  		// in PR #14555, where !VersionPurgeStatus.Empty() is automatically
  1985  		// considered a delete marker (true) to avoid listing such objects in
  1986  		// regular ListObjects() calls. However, for delete replication this
  1987  		// ends up being a problem because, upon a successful delete, it
  1988  		// ends up creating a new delete marker that is spurious and unnecessary.
  1989  		//
  1990  		// The regression introduced by #14555 was reintroduced in #15564.
  1991  		if versionFound {
  1992  			if !goi.VersionPurgeStatus.Empty() {
  1993  				deleteMarker = false
  1994  			} else if !goi.DeleteMarker { // implies a versioned delete of object
  1995  				deleteMarker = false
  1996  			}
  1997  		}
  1998  	}
  1999  
  2000  	modTime := opts.MTime
  2001  	if opts.MTime.IsZero() {
  2002  		modTime = UTCNow()
  2003  	}
  2004  	fvID := mustGetUUID()
  2005  
  2006  	defer func() {
  2007  		// Attempt a heal before returning if there are offline disks,
  2008  		// for both delete marker and permanent delete situations.
  2009  		for _, disk := range storageDisks {
  2010  			if disk != nil && disk.IsOnline() {
  2011  				continue
  2012  			}
  2013  			er.addPartial(bucket, object, opts.VersionID)
  2014  			break
  2015  		}
  2016  	}()
  2017  
  2018  	if markDelete && (opts.Versioned || opts.VersionSuspended) {
  2019  		if !deleteMarker {
  2020  			// Versioning suspended means we add a `null` version as
  2021  			// the delete marker, if it's not decided already.
  2022  			deleteMarker = opts.VersionSuspended && opts.VersionID == ""
  2023  		}
  2024  		fi := FileInfo{
  2025  			Name:             object,
  2026  			Deleted:          deleteMarker,
  2027  			MarkDeleted:      markDelete,
  2028  			ModTime:          modTime,
  2029  			ReplicationState: opts.DeleteReplication,
  2030  			TransitionStatus: opts.Transition.Status,
  2031  			ExpireRestored:   opts.Transition.ExpireRestored,
  2032  		}
  2033  		fi.SetTierFreeVersionID(fvID)
  2034  		if opts.SkipFreeVersion {
  2035  			fi.SetSkipTierFreeVersion()
  2036  		}
  2037  		if opts.VersionID != "" {
  2038  			fi.VersionID = opts.VersionID
  2039  		} else if opts.Versioned {
  2040  			fi.VersionID = mustGetUUID()
  2041  		}
  2042  		// Versioning suspended means we add a `null` version as
  2043  		// the delete marker. Add the delete marker since we don't
  2044  		// have any version specified explicitly, or if a particular
  2045  		// version id needs to be replicated.
  2046  		if err = er.deleteObjectVersion(ctx, bucket, object, fi, opts.DeleteMarker); err != nil {
  2047  			return objInfo, toObjectErr(err, bucket, object)
  2048  		}
  2049  		oi := fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended)
  2050  		oi.replicationDecision = goi.replicationDecision
  2051  		return oi, nil
  2052  	}
  2053  
  2054  	// Delete the object version on all disks.
  2055  	dfi := FileInfo{
  2056  		Name:             object,
  2057  		VersionID:        opts.VersionID,
  2058  		MarkDeleted:      markDelete,
  2059  		Deleted:          deleteMarker,
  2060  		ModTime:          modTime,
  2061  		ReplicationState: opts.DeleteReplication,
  2062  		TransitionStatus: opts.Transition.Status,
  2063  		ExpireRestored:   opts.Transition.ExpireRestored,
  2064  	}
  2065  	dfi.SetTierFreeVersionID(fvID)
  2066  	if opts.SkipFreeVersion {
  2067  		dfi.SetSkipTierFreeVersion()
  2068  	}
  2069  	if err = er.deleteObjectVersion(ctx, bucket, object, dfi, opts.DeleteMarker); err != nil {
  2070  		return objInfo, toObjectErr(err, bucket, object)
  2071  	}
  2072  
  2073  	return dfi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended), nil
  2074  }
  2075  
  2076  // Send the successful but partial upload/delete to the MRF queue; however,
  2077  // ignore it if the channel is blocked by other items.
  2078  func (er erasureObjects) addPartial(bucket, object, versionID string) {
  2079  	globalMRFState.addPartialOp(partialOperation{
  2080  		bucket:    bucket,
  2081  		object:    object,
  2082  		versionID: versionID,
  2083  		queued:    time.Now(),
  2084  	})
  2085  }
  2086  
  2087  func (er erasureObjects) PutObjectMetadata(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) {
  2088  	if !opts.NoLock {
  2089  		// Lock the object before updating metadata.
  2090  		lk := er.NewNSLock(bucket, object)
  2091  		lkctx, err := lk.GetLock(ctx, globalOperationTimeout)
  2092  		if err != nil {
  2093  			return ObjectInfo{}, err
  2094  		}
  2095  		ctx = lkctx.Context()
  2096  		defer lk.Unlock(lkctx)
  2097  	}
  2098  
  2099  	disks := er.getDisks()
  2100  
  2101  	var metaArr []FileInfo
  2102  	var errs []error
  2103  
  2104  	// Read metadata associated with the object from all disks.
  2105  	if opts.VersionID != "" {
  2106  		metaArr, errs = readAllFileInfo(ctx, disks, "", bucket, object, opts.VersionID, false, false)
  2107  	} else {
  2108  		metaArr, errs = readAllXL(ctx, disks, bucket, object, false, false, true)
  2109  	}
  2110  
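        	// Compute the read quorum from the collected metadata; if quorum
        	// cannot be established, attempt to clean up a possibly dangling
        	// object before returning the error.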
  2111  	readQuorum, _, err := objectQuorumFromMeta(ctx, metaArr, errs, er.defaultParityCount)
  2112  	if err != nil {
  2113  		if errors.Is(err, errErasureReadQuorum) && !strings.HasPrefix(bucket, minioMetaBucket) {
  2114  			_, derr := er.deleteIfDangling(context.Background(), bucket, object, metaArr, errs, nil, opts)
  2115  			if derr != nil {
  2116  				err = derr
  2117  			}
  2118  		}
  2119  		return ObjectInfo{}, toObjectErr(err, bucket, object)
  2120  	}
  2121  
  2122  	// List all online disks.
  2123  	onlineDisks, modTime, etag := listOnlineDisks(disks, metaArr, errs, readQuorum)
  2124  
  2125  	// Pick latest valid metadata.
  2126  	fi, err := pickValidFileInfo(ctx, metaArr, modTime, etag, readQuorum)
  2127  	if err != nil {
  2128  		return ObjectInfo{}, toObjectErr(err, bucket, object)
  2129  	}
  2130  
  2131  	if fi.Deleted {
  2132  		return ObjectInfo{}, toObjectErr(errMethodNotAllowed, bucket, object)
  2133  	}
  2134  
  2135  	filterOnlineDisksInplace(fi, metaArr, onlineDisks)
  2136  
  2137  	// If version-id is not specified, retention is supposed to be set on the latest object.
  2138  	if opts.VersionID == "" {
  2139  		opts.VersionID = fi.VersionID
  2140  	}
  2141  
  2142  	objInfo := fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended)
  2143  	if opts.EvalMetadataFn != nil {
  2144  		if _, err := opts.EvalMetadataFn(&objInfo, err); err != nil {
  2145  			return ObjectInfo{}, err
  2146  		}
  2147  	}
  2148  	for k, v := range objInfo.UserDefined {
  2149  		fi.Metadata[k] = v
  2150  	}
  2151  	fi.ModTime = opts.MTime
  2152  	fi.VersionID = opts.VersionID
  2153  
  2154  	if err = er.updateObjectMeta(ctx, bucket, object, fi, onlineDisks); err != nil {
  2155  		return ObjectInfo{}, toObjectErr(err, bucket, object)
  2156  	}
  2157  
  2158  	return fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended), nil
  2159  }
  2160  
  2161  // PutObjectTags - replace or add tags to an existing object
  2162  func (er erasureObjects) PutObjectTags(ctx context.Context, bucket, object string, tags string, opts ObjectOptions) (ObjectInfo, error) {
  2163  	// Lock the object before updating tags.
  2164  	lk := er.NewNSLock(bucket, object)
  2165  	lkctx, err := lk.GetLock(ctx, globalOperationTimeout)
  2166  	if err != nil {
  2167  		return ObjectInfo{}, err
  2168  	}
  2169  	ctx = lkctx.Context()
  2170  	defer lk.Unlock(lkctx)
  2171  
  2172  	disks := er.getDisks()
  2173  
  2174  	var metaArr []FileInfo
  2175  	var errs []error
  2176  
  2177  	// Read metadata associated with the object from all disks.
  2178  	if opts.VersionID != "" {
  2179  		metaArr, errs = readAllFileInfo(ctx, disks, "", bucket, object, opts.VersionID, false, false)
  2180  	} else {
  2181  		metaArr, errs = readAllXL(ctx, disks, bucket, object, false, false, true)
  2182  	}
  2183  
  2184  	readQuorum, _, err := objectQuorumFromMeta(ctx, metaArr, errs, er.defaultParityCount)
  2185  	if err != nil {
  2186  		if errors.Is(err, errErasureReadQuorum) && !strings.HasPrefix(bucket, minioMetaBucket) {
  2187  			_, derr := er.deleteIfDangling(context.Background(), bucket, object, metaArr, errs, nil, opts)
  2188  			if derr != nil {
  2189  				err = derr
  2190  			}
  2191  		}
  2192  		return ObjectInfo{}, toObjectErr(err, bucket, object)
  2193  	}
  2194  
  2195  	// List all online disks.
  2196  	onlineDisks, modTime, etag := listOnlineDisks(disks, metaArr, errs, readQuorum)
  2197  
  2198  	// Pick latest valid metadata.
  2199  	fi, err := pickValidFileInfo(ctx, metaArr, modTime, etag, readQuorum)
  2200  	if err != nil {
  2201  		return ObjectInfo{}, toObjectErr(err, bucket, object)
  2202  	}
  2203  	if fi.Deleted {
  2204  		if opts.VersionID == "" {
  2205  			return ObjectInfo{}, toObjectErr(errFileNotFound, bucket, object)
  2206  		}
  2207  		return ObjectInfo{}, toObjectErr(errMethodNotAllowed, bucket, object)
  2208  	}
  2209  
  2210  	filterOnlineDisksInplace(fi, metaArr, onlineDisks)
  2211  
  2212  	fi.Metadata[xhttp.AmzObjectTagging] = tags
  2213  	fi.ReplicationState = opts.PutReplicationState()
  2214  	for k, v := range opts.UserDefined {
  2215  		fi.Metadata[k] = v
  2216  	}
  2217  
  2218  	if err = er.updateObjectMeta(ctx, bucket, object, fi, onlineDisks); err != nil {
  2219  		return ObjectInfo{}, toObjectErr(err, bucket, object)
  2220  	}
  2221  
  2222  	return fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended), nil
  2223  }
  2224  
  2225  func (er erasureObjects) updateObjectMetaWithOpts(ctx context.Context, bucket, object string, fi FileInfo, onlineDisks []StorageAPI, opts UpdateMetadataOpts) error {
  2226  	if len(fi.Metadata) == 0 {
  2227  		return nil
  2228  	}
  2229  
  2230  	g := errgroup.WithNErrs(len(onlineDisks))
  2231  
  2232  	// Start writing `xl.meta` to all disks in parallel.
  2233  	for index := range onlineDisks {
  2234  		index := index
  2235  		g.Go(func() error {
  2236  			if onlineDisks[index] == nil {
  2237  				return errDiskNotFound
  2238  			}
  2239  			return onlineDisks[index].UpdateMetadata(ctx, bucket, object, fi, opts)
  2240  		}, index)
  2241  	}
  2242  
  2243  	// Wait for all the routines.
  2244  	mErrs := g.Wait()
  2245  
  2246  	return reduceWriteQuorumErrs(ctx, mErrs, objectOpIgnoredErrs, fi.WriteQuorum(er.defaultWQuorum()))
  2247  }
  2248  
  2249  // updateObjectMeta will update the metadata of a file.
  2250  func (er erasureObjects) updateObjectMeta(ctx context.Context, bucket, object string, fi FileInfo, onlineDisks []StorageAPI) error {
  2251  	return er.updateObjectMetaWithOpts(ctx, bucket, object, fi, onlineDisks, UpdateMetadataOpts{})
  2252  }
  2253  
  2254  // DeleteObjectTags - delete object tags from an existing object
  2255  func (er erasureObjects) DeleteObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) {
  2256  	return er.PutObjectTags(ctx, bucket, object, "", opts)
  2257  }
  2258  
  2259  // GetObjectTags - get object tags from an existing object
  2260  func (er erasureObjects) GetObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (*tags.Tags, error) {
  2261  	// GetObjectInfo will return tag value as well
  2262  	oi, err := er.GetObjectInfo(ctx, bucket, object, opts)
  2263  	if err != nil {
  2264  		return nil, err
  2265  	}
  2266  
  2267  	return tags.ParseObjectTags(oi.UserTags)
  2268  }
  2269  
  2270  // TransitionObject - transition object content to target tier.
  2271  func (er erasureObjects) TransitionObject(ctx context.Context, bucket, object string, opts ObjectOptions) error {
  2272  	tgtClient, err := globalTierConfigMgr.getDriver(opts.Transition.Tier)
  2273  	if err != nil {
  2274  		return err
  2275  	}
  2276  
  2277  	// Acquire write lock before starting to transition the object.
  2278  	lk := er.NewNSLock(bucket, object)
  2279  	lkctx, err := lk.GetLock(ctx, globalDeleteOperationTimeout)
  2280  	if err != nil {
  2281  		return err
  2282  	}
  2283  	ctx = lkctx.Context()
  2284  	defer lk.Unlock(lkctx)
  2285  
  2286  	fi, metaArr, onlineDisks, err := er.getObjectFileInfo(ctx, bucket, object, opts, true)
  2287  	if err != nil {
  2288  		return toObjectErr(err, bucket, object)
  2289  	}
  2290  	if fi.Deleted {
  2291  		if opts.VersionID == "" {
  2292  			return toObjectErr(errFileNotFound, bucket, object)
  2293  		}
  2294  		// Make sure to return object info to provide extra information.
  2295  		return toObjectErr(errMethodNotAllowed, bucket, object)
  2296  	}
  2297  	// verify that the object queued for transition is identical to that on disk.
  2298  	if !opts.MTime.Equal(fi.ModTime) || !strings.EqualFold(opts.Transition.ETag, extractETag(fi.Metadata)) {
  2299  		return toObjectErr(errFileNotFound, bucket, object)
  2300  	}
  2301  	// if object already transitioned, return
  2302  	if fi.TransitionStatus == lifecycle.TransitionComplete {
  2303  		return nil
  2304  	}
  2305  
  2306  	if fi.XLV1 {
  2307  		if _, err = er.HealObject(ctx, bucket, object, "", madmin.HealOpts{NoLock: true}); err != nil {
  2308  			return err
  2309  		}
  2310  		// Fetch FileInfo again. HealObject migrates the object to the latest
  2311  		// format. Among other things this changes fi.DataDir and
  2312  		// possibly fi.Data (if data is inlined).
  2313  		fi, metaArr, onlineDisks, err = er.getObjectFileInfo(ctx, bucket, object, opts, true)
  2314  		if err != nil {
  2315  			return toObjectErr(err, bucket, object)
  2316  		}
  2317  	}
  2318  	traceFn := globalLifecycleSys.trace(fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended))
  2319  
  2320  	destObj, err := genTransitionObjName(bucket)
  2321  	if err != nil {
  2322  		return err
  2323  	}
  2324  
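        	// Stream the object from the local drives into the tier client via a
        	// pipe: a goroutine writes the erasure-decoded content while Put()
        	// reads it on the other end.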
  2325  	pr, pw := xioutil.WaitPipe()
  2326  	go func() {
  2327  		err := er.getObjectWithFileInfo(ctx, bucket, object, 0, fi.Size, pw, fi, metaArr, onlineDisks)
  2328  		pw.CloseWithError(err)
  2329  	}()
  2330  
  2331  	var rv remoteVersionID
  2332  	rv, err = tgtClient.Put(ctx, destObj, pr, fi.Size)
  2333  	pr.CloseWithError(err)
  2334  	if err != nil {
  2335  		return err
  2336  	}
  2337  	fi.TransitionStatus = lifecycle.TransitionComplete
  2338  	fi.TransitionedObjName = destObj
  2339  	fi.TransitionTier = opts.Transition.Tier
  2340  	fi.TransitionVersionID = string(rv)
  2341  	eventName := event.ObjectTransitionComplete
  2342  
  2343  	storageDisks := er.getDisks()
  2344  
  2345  	if err = er.deleteObjectVersion(ctx, bucket, object, fi, false); err != nil {
  2346  		eventName = event.ObjectTransitionFailed
  2347  	}
  2348  
  2349  	for _, disk := range storageDisks {
  2350  		if disk != nil && disk.IsOnline() {
  2351  			continue
  2352  		}
  2353  		er.addPartial(bucket, object, opts.VersionID)
  2354  		break
  2355  	}
  2356  
  2357  	objInfo := fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended)
  2358  	sendEvent(eventArgs{
  2359  		EventName:  eventName,
  2360  		BucketName: bucket,
  2361  		Object:     objInfo,
  2362  		UserAgent:  "Internal: [ILM-Transition]",
  2363  		Host:       globalLocalNodeName,
  2364  	})
  2365  	tags := opts.LifecycleAuditEvent.Tags()
  2366  	auditLogLifecycle(ctx, objInfo, ILMTransition, tags, traceFn)
  2367  	return err
  2368  }
  2369  
  2370  // RestoreTransitionedObject - restore transitioned object content locally on this cluster.
  2371  // This is similar to PostObjectRestore from AWS GLACIER
  2372  // storage class. When PostObjectRestore API is called, a temporary copy of the object
  2373  // is restored locally to the bucket on the source cluster until the restore expiry date.
  2374  // The copy that was transitioned continues to reside in the transitioned tier.
  2375  func (er erasureObjects) RestoreTransitionedObject(ctx context.Context, bucket, object string, opts ObjectOptions) error {
  2376  	return er.restoreTransitionedObject(ctx, bucket, object, opts)
  2377  }
  2378  
  2379  // update restore status header in the metadata
  2380  func (er erasureObjects) updateRestoreMetadata(ctx context.Context, bucket, object string, objInfo ObjectInfo, opts ObjectOptions) error {
  2381  	oi := objInfo.Clone()
  2382  	oi.metadataOnly = true // Perform only metadata updates.
  2383  
  2384  	// allow retry in the case of failure to restore
  2385  	delete(oi.UserDefined, xhttp.AmzRestore)
  2386  
  2387  	if _, err := er.CopyObject(ctx, bucket, object, bucket, object, oi, ObjectOptions{
  2388  		VersionID: oi.VersionID,
  2389  	}, ObjectOptions{
  2390  		VersionID: oi.VersionID,
  2391  	}); err != nil {
  2392  		logger.LogIf(ctx, fmt.Errorf("Unable to update transition restore metadata for %s/%s(%s): %s", bucket, object, oi.VersionID, err))
  2393  		return err
  2394  	}
  2395  	return nil
  2396  }
  2397  
  2398  // restoreTransitionedObject, for a multipart object, chunks the file stream from the remote tier into the same number of parts
  2399  // as in the xl.meta for this version and rehydrates each part.n into the fi.DataDir for this version, as recorded in the xl.meta.
  2400  func (er erasureObjects) restoreTransitionedObject(ctx context.Context, bucket string, object string, opts ObjectOptions) error {
  2401  	setRestoreHeaderFn := func(oi ObjectInfo, rerr error) error {
  2402  		if rerr == nil {
  2403  			return nil // nothing to do; restore object was successful
  2404  		}
  2405  		er.updateRestoreMetadata(ctx, bucket, object, oi, opts)
  2406  		return rerr
  2407  	}
  2408  	var oi ObjectInfo
  2409  	// get the file info on disk for transitioned object
  2410  	actualfi, _, _, err := er.getObjectFileInfo(ctx, bucket, object, opts, false)
  2411  	if err != nil {
  2412  		return setRestoreHeaderFn(oi, toObjectErr(err, bucket, object))
  2413  	}
  2414  
  2415  	oi = actualfi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended)
  2416  	ropts := putRestoreOpts(bucket, object, opts.Transition.RestoreRequest, oi)
  2417  	if len(oi.Parts) == 1 {
  2418  		var rs *HTTPRangeSpec
  2419  		gr, err := getTransitionedObjectReader(ctx, bucket, object, rs, http.Header{}, oi, opts)
  2420  		if err != nil {
  2421  			return setRestoreHeaderFn(oi, toObjectErr(err, bucket, object))
  2422  		}
  2423  		defer gr.Close()
  2424  		hashReader, err := hash.NewReader(ctx, gr, gr.ObjInfo.Size, "", "", gr.ObjInfo.Size)
  2425  		if err != nil {
  2426  			return setRestoreHeaderFn(oi, toObjectErr(err, bucket, object))
  2427  		}
  2428  		pReader := NewPutObjReader(hashReader)
  2429  		_, err = er.PutObject(ctx, bucket, object, pReader, ropts)
  2430  		return setRestoreHeaderFn(oi, toObjectErr(err, bucket, object))
  2431  	}
  2432  
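        	// Objects with more than one part are restored through a fresh
        	// multipart upload that mirrors the original part layout recorded
        	// in xl.meta.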
  2433  	res, err := er.NewMultipartUpload(ctx, bucket, object, ropts)
  2434  	if err != nil {
  2435  		return setRestoreHeaderFn(oi, err)
  2436  	}
  2437  
  2438  	var uploadedParts []CompletePart
  2439  	var rs *HTTPRangeSpec
  2440  	// get reader from the warm backend - note that even in the case of encrypted objects, this stream is still encrypted.
  2441  	gr, err := getTransitionedObjectReader(ctx, bucket, object, rs, http.Header{}, oi, opts)
  2442  	if err != nil {
  2443  		return setRestoreHeaderFn(oi, err)
  2444  	}
  2445  	defer gr.Close()
  2446  
  2447  	// rehydrate the parts back on disk as per the original xl.meta prior to transition
  2448  	for _, partInfo := range oi.Parts {
  2449  		hr, err := hash.NewReader(ctx, io.LimitReader(gr, partInfo.Size), partInfo.Size, "", "", partInfo.Size)
  2450  		if err != nil {
  2451  			return setRestoreHeaderFn(oi, err)
  2452  		}
  2453  		pInfo, err := er.PutObjectPart(ctx, bucket, object, res.UploadID, partInfo.Number, NewPutObjReader(hr), ObjectOptions{})
  2454  		if err != nil {
  2455  			return setRestoreHeaderFn(oi, err)
  2456  		}
  2457  		if pInfo.Size != partInfo.Size {
  2458  			return setRestoreHeaderFn(oi, InvalidObjectState{Bucket: bucket, Object: object})
  2459  		}
  2460  		uploadedParts = append(uploadedParts, CompletePart{
  2461  			PartNumber: pInfo.PartNumber,
  2462  			ETag:       pInfo.ETag,
  2463  		})
  2464  	}
  2465  	_, err = er.CompleteMultipartUpload(ctx, bucket, object, res.UploadID, uploadedParts, ObjectOptions{
  2466  		MTime: oi.ModTime,
  2467  	})
  2468  	return setRestoreHeaderFn(oi, err)
  2469  }
  2470  
  2471  // DecomTieredObject - moves tiered object to another pool during decommissioning.
  2472  func (er erasureObjects) DecomTieredObject(ctx context.Context, bucket, object string, fi FileInfo, opts ObjectOptions) error {
  2473  	if opts.UserDefined == nil {
  2474  		opts.UserDefined = make(map[string]string)
  2475  	}
  2476  	// overlay Erasure info for this set of disks
  2477  	storageDisks := er.getDisks()
  2478  	// Get parity and data drive count based on storage class metadata
  2479  	parityDrives := globalStorageClass.GetParityForSC(opts.UserDefined[xhttp.AmzStorageClass])
  2480  	if parityDrives < 0 {
  2481  		parityDrives = er.defaultParityCount
  2482  	}
  2483  	dataDrives := len(storageDisks) - parityDrives
  2484  
  2485  	// We now know the number of blocks this object needs for data and parity.
  2486  	// writeQuorum is dataBlocks, plus one when the data and parity block counts are equal.
  2487  	writeQuorum := dataDrives
  2488  	if dataDrives == parityDrives {
  2489  		writeQuorum++
  2490  	}
  2491  
  2492  	// Initialize parts metadata
  2493  	partsMetadata := make([]FileInfo, len(storageDisks))
  2494  
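        	// Build a FileInfo with this set's erasure layout and overlay that
        	// layout on the incoming FileInfo before writing it out.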
  2495  	fi2 := newFileInfo(pathJoin(bucket, object), dataDrives, parityDrives)
  2496  	fi.Erasure = fi2.Erasure
  2497  	// Initialize erasure metadata.
  2498  	for index := range partsMetadata {
  2499  		partsMetadata[index] = fi
  2500  		partsMetadata[index].Erasure.Index = index + 1
  2501  	}
  2502  
  2503  	// Order disks according to erasure distribution
  2504  	var onlineDisks []StorageAPI
  2505  	onlineDisks, partsMetadata = shuffleDisksAndPartsMetadata(storageDisks, partsMetadata, fi)
  2506  
  2507  	if _, err := writeUniqueFileInfo(ctx, onlineDisks, "", bucket, object, partsMetadata, writeQuorum); err != nil {
  2508  		return toObjectErr(err, bucket, object)
  2509  	}
  2510  
  2511  	return nil
  2512  }