github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure-healing.go (about)

     1  // Copyright (c) 2015-2021 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package cmd
    19  
    20  import (
    21  	"bytes"
    22  	"context"
    23  	"errors"
    24  	"fmt"
    25  	"io"
    26  	"strconv"
    27  	"strings"
    28  	"sync"
    29  	"time"
    30  
    31  	"github.com/minio/madmin-go/v3"
    32  	"github.com/minio/minio/internal/logger"
    33  	"github.com/minio/pkg/v2/sync/errgroup"
    34  )
    35  
//go:generate stringer -type=healingMetric -trimprefix=healingMetric $GOFILE

// healingMetric identifies which healing operation is being reported to
// trace subscribers via healTrace.
type healingMetric uint8

const (
	// healingMetricBucket is emitted for bucket healing operations.
	healingMetricBucket healingMetric = iota
	// healingMetricObject is emitted for object healing (see healObject).
	healingMetricObject
	// healingMetricCheckAbandonedParts is emitted while checking for
	// abandoned parts (see checkAbandonedParts).
	healingMetricCheckAbandonedParts
)
    45  
    46  func (er erasureObjects) listAndHeal(bucket, prefix string, scanMode madmin.HealScanMode, healEntry func(string, metaCacheEntry, madmin.HealScanMode) error) error {
    47  	ctx, cancel := context.WithCancel(context.Background())
    48  	defer cancel()
    49  
    50  	disks, _ := er.getOnlineDisksWithHealing(false)
    51  	if len(disks) == 0 {
    52  		return errors.New("listAndHeal: No non-healing drives found")
    53  	}
    54  
    55  	// How to resolve partial results.
    56  	resolver := metadataResolutionParams{
    57  		dirQuorum: 1,
    58  		objQuorum: 1,
    59  		bucket:    bucket,
    60  		strict:    false, // Allow less strict matching.
    61  	}
    62  
    63  	path := baseDirFromPrefix(prefix)
    64  	filterPrefix := strings.Trim(strings.TrimPrefix(prefix, path), slashSeparator)
    65  	if path == prefix {
    66  		filterPrefix = ""
    67  	}
    68  
    69  	lopts := listPathRawOptions{
    70  		disks:          disks,
    71  		bucket:         bucket,
    72  		path:           path,
    73  		filterPrefix:   filterPrefix,
    74  		recursive:      true,
    75  		forwardTo:      "",
    76  		minDisks:       1,
    77  		reportNotFound: false,
    78  		agreed: func(entry metaCacheEntry) {
    79  			if err := healEntry(bucket, entry, scanMode); err != nil {
    80  				cancel()
    81  			}
    82  		},
    83  		partial: func(entries metaCacheEntries, _ []error) {
    84  			entry, ok := entries.resolve(&resolver)
    85  			if !ok {
    86  				// check if we can get one entry at least
    87  				// proceed to heal nonetheless.
    88  				entry, _ = entries.firstFound()
    89  			}
    90  
    91  			if err := healEntry(bucket, *entry, scanMode); err != nil {
    92  				cancel()
    93  				return
    94  			}
    95  		},
    96  		finished: nil,
    97  	}
    98  
    99  	if err := listPathRaw(ctx, lopts); err != nil {
   100  		return fmt.Errorf("listPathRaw returned %w: opts(%#v)", err, lopts)
   101  	}
   102  
   103  	return nil
   104  }
   105  
   106  // listAllBuckets lists all buckets from all disks. It also
   107  // returns the occurrence of each buckets in all disks
   108  func listAllBuckets(ctx context.Context, storageDisks []StorageAPI, healBuckets map[string]VolInfo, readQuorum int) error {
   109  	g := errgroup.WithNErrs(len(storageDisks))
   110  	var mu sync.Mutex
   111  	for index := range storageDisks {
   112  		index := index
   113  		g.Go(func() error {
   114  			if storageDisks[index] == nil {
   115  				// we ignore disk not found errors
   116  				return nil
   117  			}
   118  			if storageDisks[index].Healing() != nil {
   119  				// we ignore disks under healing
   120  				return nil
   121  			}
   122  			volsInfo, err := storageDisks[index].ListVols(ctx)
   123  			if err != nil {
   124  				return err
   125  			}
   126  			for _, volInfo := range volsInfo {
   127  				// StorageAPI can send volume names which are
   128  				// incompatible with buckets - these are
   129  				// skipped, like the meta-bucket.
   130  				if isReservedOrInvalidBucket(volInfo.Name, false) {
   131  					continue
   132  				}
   133  				mu.Lock()
   134  				if _, ok := healBuckets[volInfo.Name]; !ok {
   135  					healBuckets[volInfo.Name] = volInfo
   136  				}
   137  				mu.Unlock()
   138  			}
   139  			return nil
   140  		}, index)
   141  	}
   142  	return reduceReadQuorumErrs(ctx, g.Wait(), bucketMetadataOpIgnoredErrs, readQuorum)
   143  }
   144  
   145  // Only heal on disks where we are sure that healing is needed. We can expand
   146  // this list as and when we figure out more errors can be added to this list safely.
   147  func shouldHealObjectOnDisk(erErr, dataErr error, meta FileInfo, latestMeta FileInfo) bool {
   148  	switch {
   149  	case errors.Is(erErr, errFileNotFound) || errors.Is(erErr, errFileVersionNotFound):
   150  		return true
   151  	case errors.Is(erErr, errFileCorrupt):
   152  		return true
   153  	}
   154  	if erErr == nil {
   155  		if meta.XLV1 {
   156  			// Legacy means heal always
   157  			// always check first.
   158  			return true
   159  		}
   160  		if !meta.Deleted && !meta.IsRemote() {
   161  			// If xl.meta was read fine but there may be problem with the part.N files.
   162  			if IsErr(dataErr, []error{
   163  				errFileNotFound,
   164  				errFileVersionNotFound,
   165  				errFileCorrupt,
   166  			}...) {
   167  				return true
   168  			}
   169  		}
   170  		if !latestMeta.Equals(meta) {
   171  			return true
   172  		}
   173  	}
   174  	return false
   175  }
   176  
const (
	// xMinIOHealing is the reserved metadata key set on a FileInfo while
	// its object version is being healed (see SetHealing / Healing).
	xMinIOHealing = ReservedMetadataPrefix + "healing"
	// xMinIODataMov is the reserved metadata key set on a FileInfo while
	// its object version is being moved, e.g. during decommissioning or
	// rebalance (see SetDataMov / DataMov).
	xMinIODataMov = ReservedMetadataPrefix + "data-mov"
)
   181  
   182  // SetHealing marks object (version) as being healed.
   183  // Note: this is to be used only from healObject
   184  func (fi *FileInfo) SetHealing() {
   185  	if fi.Metadata == nil {
   186  		fi.Metadata = make(map[string]string)
   187  	}
   188  	fi.Metadata[xMinIOHealing] = "true"
   189  }
   190  
   191  // Healing returns true if object is being healed (i.e fi is being passed down
   192  // from healObject)
   193  func (fi FileInfo) Healing() bool {
   194  	_, ok := fi.Metadata[xMinIOHealing]
   195  	return ok
   196  }
   197  
   198  // SetDataMov marks object (version) as being currently
   199  // in movement, such as decommissioning or rebalance.
   200  func (fi *FileInfo) SetDataMov() {
   201  	if fi.Metadata == nil {
   202  		fi.Metadata = make(map[string]string)
   203  	}
   204  	fi.Metadata[xMinIODataMov] = "true"
   205  }
   206  
   207  // DataMov returns true if object is being in movement
   208  func (fi FileInfo) DataMov() bool {
   209  	_, ok := fi.Metadata[xMinIODataMov]
   210  	return ok
   211  }
   212  
// Heals an object by re-writing corrupt/missing erasure blocks.
//
// Overall flow:
//  1. Optionally take the namespace lock and re-read xl.meta from every drive.
//  2. Derive the read quorum and pick the latest valid FileInfo as reference;
//     objects that cannot reach quorum are treated as dangling and deleted.
//  3. Classify each drive (ok/offline/missing/corrupt) and decide which
//     drives need healing via shouldHealObjectOnDisk.
//  4. Unless dry-run, erasure-reconstruct each part from the drives holding
//     valid data into a temporary location, then rename the healed data into
//     its final location on the outdated drives.
func (er *erasureObjects) healObject(ctx context.Context, bucket string, object string, versionID string, opts madmin.HealOpts) (result madmin.HealResultItem, err error) {
	dryRun := opts.DryRun
	scanMode := opts.ScanMode

	storageDisks := er.getDisks()
	storageEndpoints := er.getEndpoints()

	// Emit a healing trace entry on return if anyone subscribed.
	// The deferred closure captures the named results err/result.
	if globalTrace.NumSubscribers(madmin.TraceHealing) > 0 {
		startTime := time.Now()
		defer func() {
			healTrace(healingMetricObject, startTime, bucket, object, &opts, err, &result)
		}()
	}
	// Initialize heal result object
	result = madmin.HealResultItem{
		Type:      madmin.HealItemObject,
		Bucket:    bucket,
		Object:    object,
		VersionID: versionID,
		DiskCount: len(storageDisks),
	}

	if !opts.NoLock {
		lk := er.NewNSLock(bucket, object)
		lkctx, err := lk.GetLock(ctx, globalOperationTimeout)
		if err != nil {
			return result, err
		}
		ctx = lkctx.Context()
		defer lk.Unlock(lkctx)
	}

	// Re-read when we have lock...
	partsMetadata, errs := readAllFileInfo(ctx, storageDisks, "", bucket, object, versionID, true, true)
	if isAllNotFound(errs) {
		err := errFileNotFound
		if versionID != "" {
			err = errFileVersionNotFound
		}
		// Nothing to do, file is already gone.
		return er.defaultHealResult(FileInfo{}, storageDisks, storageEndpoints,
			errs, bucket, object, versionID), err
	}

	readQuorum, _, err := objectQuorumFromMeta(ctx, partsMetadata, errs, er.defaultParityCount)
	if err != nil {
		// Quorum cannot be established: the object is dangling, attempt
		// to purge it and report a not-found style heal result.
		m, err := er.deleteIfDangling(ctx, bucket, object, partsMetadata, errs, nil, ObjectOptions{
			VersionID: versionID,
		})
		errs = make([]error, len(errs))
		for i := range errs {
			errs[i] = err
		}
		if err == nil {
			// Dangling object successfully purged, size is '0'
			m.Size = 0
		}
		// Generate file/version not found with default heal result
		err = errFileNotFound
		if versionID != "" {
			err = errFileVersionNotFound
		}
		return er.defaultHealResult(m, storageDisks, storageEndpoints,
			errs, bucket, object, versionID), err
	}

	result.ParityBlocks = result.DiskCount - readQuorum
	result.DataBlocks = readQuorum

	// List of disks having latest version of the object xl.meta
	// (by modtime).
	onlineDisks, modTime, etag := listOnlineDisks(storageDisks, partsMetadata, errs, readQuorum)

	// Latest FileInfo for reference. If a valid metadata is not
	// present, it is as good as object not found.
	latestMeta, err := pickValidFileInfo(ctx, partsMetadata, modTime, etag, readQuorum)
	if err != nil {
		return result, err
	}

	// List of disks having all parts as per latest metadata.
	// NOTE: do not pass in latestDisks to diskWithAllParts since
	// the diskWithAllParts needs to reach the drive to ensure
	// validity of the metadata content, we should make sure that
	// we pass in disks as is for it to be verified. Once verified
	// the disksWithAllParts() returns the actual disks that can be
	// used here for reconstruction. This is done to ensure that
	// we do not skip drives that have inconsistent metadata to be
	// skipped from purging when they are stale.
	availableDisks, dataErrs, _ := disksWithAllParts(ctx, onlineDisks, partsMetadata,
		errs, latestMeta, bucket, object, scanMode)

	var erasure Erasure
	if !latestMeta.Deleted && !latestMeta.IsRemote() {
		// Initialize erasure coding
		erasure, err = NewErasure(ctx, latestMeta.Erasure.DataBlocks,
			latestMeta.Erasure.ParityBlocks, latestMeta.Erasure.BlockSize)
		if err != nil {
			return result, err
		}
	}

	result.ObjectSize, err = latestMeta.ToObjectInfo(bucket, object, true).GetActualSize()
	if err != nil {
		return result, err
	}

	// Loop to find number of disks with valid data, per-drive
	// data state and a list of outdated disks on which data needs
	// to be healed.
	outDatedDisks := make([]StorageAPI, len(storageDisks))
	disksToHealCount := 0
	for i, v := range availableDisks {
		driveState := ""
		switch {
		case v != nil:
			driveState = madmin.DriveStateOk
		case errs[i] == errDiskNotFound, dataErrs[i] == errDiskNotFound:
			driveState = madmin.DriveStateOffline
		case errs[i] == errFileNotFound, errs[i] == errFileVersionNotFound, errs[i] == errVolumeNotFound:
			fallthrough
		case dataErrs[i] == errFileNotFound, dataErrs[i] == errFileVersionNotFound, dataErrs[i] == errVolumeNotFound:
			driveState = madmin.DriveStateMissing
		default:
			// all remaining cases imply corrupt data/metadata
			driveState = madmin.DriveStateCorrupt
		}

		if shouldHealObjectOnDisk(errs[i], dataErrs[i], partsMetadata[i], latestMeta) {
			outDatedDisks[i] = storageDisks[i]
			disksToHealCount++
			result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
				UUID:     "",
				Endpoint: storageEndpoints[i].String(),
				State:    driveState,
			})
			result.After.Drives = append(result.After.Drives, madmin.HealDriveInfo{
				UUID:     "",
				Endpoint: storageEndpoints[i].String(),
				State:    driveState,
			})
			continue
		}
		result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
			UUID:     "",
			Endpoint: storageEndpoints[i].String(),
			State:    driveState,
		})
		result.After.Drives = append(result.After.Drives, madmin.HealDriveInfo{
			UUID:     "",
			Endpoint: storageEndpoints[i].String(),
			State:    driveState,
		})
	}

	if isAllNotFound(errs) {
		// File is fully gone, fileInfo is empty.
		err := errFileNotFound
		if versionID != "" {
			err = errFileVersionNotFound
		}
		return er.defaultHealResult(FileInfo{}, storageDisks, storageEndpoints, errs,
			bucket, object, versionID), err
	}

	if disksToHealCount == 0 {
		// Nothing to heal!
		return result, nil
	}

	// After this point, only have to repair data on disk - so
	// return if it is a dry-run
	if dryRun {
		return result, nil
	}

	if !latestMeta.XLV1 && !latestMeta.Deleted && disksToHealCount > latestMeta.Erasure.ParityBlocks {
		// More drives need healing than parity can tolerate - the
		// version is unreconstructable, so treat it as dangling.
		// Allow for dangling deletes, on versions that have DataDir missing etc.
		// this would end up restoring the correct readable versions.
		m, err := er.deleteIfDangling(ctx, bucket, object, partsMetadata, errs, dataErrs, ObjectOptions{
			VersionID: versionID,
		})
		errs = make([]error, len(errs))
		for i := range errs {
			errs[i] = err
		}
		if err == nil {
			// Dangling object successfully purged, size is '0'
			m.Size = 0
		}
		// Generate file/version not found with default heal result
		err = errFileNotFound
		if versionID != "" {
			err = errFileVersionNotFound
		}
		return er.defaultHealResult(m, storageDisks, storageEndpoints,
			errs, bucket, object, versionID), err
	}

	cleanFileInfo := func(fi FileInfo) FileInfo {
		// Returns a copy of the 'fi' with erasure index, checksums and inline data niled.
		nfi := fi
		if !nfi.IsRemote() {
			nfi.Data = nil
			nfi.Erasure.Index = 0
			nfi.Erasure.Checksums = nil
		}
		return nfi
	}

	// We write at temporary location and then rename to final location.
	tmpID := mustGetUUID()
	migrateDataDir := mustGetUUID()

	// Reorder so that we have data disks first and parity disks next.
	//
	// The three sanity checks below guard against backend drives that
	// were manually modified: the erasure distribution recorded in the
	// latest metadata must match the layout we computed from the drives.
	if !latestMeta.Deleted && len(latestMeta.Erasure.Distribution) != len(availableDisks) {
		err := fmt.Errorf("unexpected file distribution (%v) from available disks (%v), looks like backend disks have been manually modified refusing to heal %s/%s(%s)",
			latestMeta.Erasure.Distribution, availableDisks, bucket, object, versionID)
		logger.LogOnceIf(ctx, err, "heal-object-available-disks")
		return er.defaultHealResult(latestMeta, storageDisks, storageEndpoints, errs,
			bucket, object, versionID), err
	}

	latestDisks := shuffleDisks(availableDisks, latestMeta.Erasure.Distribution)

	if !latestMeta.Deleted && len(latestMeta.Erasure.Distribution) != len(outDatedDisks) {
		err := fmt.Errorf("unexpected file distribution (%v) from outdated disks (%v), looks like backend disks have been manually modified refusing to heal %s/%s(%s)",
			latestMeta.Erasure.Distribution, outDatedDisks, bucket, object, versionID)
		logger.LogOnceIf(ctx, err, "heal-object-outdated-disks")
		return er.defaultHealResult(latestMeta, storageDisks, storageEndpoints, errs,
			bucket, object, versionID), err
	}

	outDatedDisks = shuffleDisks(outDatedDisks, latestMeta.Erasure.Distribution)

	if !latestMeta.Deleted && len(latestMeta.Erasure.Distribution) != len(partsMetadata) {
		err := fmt.Errorf("unexpected file distribution (%v) from metadata entries (%v), looks like backend disks have been manually modified refusing to heal %s/%s(%s)",
			latestMeta.Erasure.Distribution, len(partsMetadata), bucket, object, versionID)
		logger.LogOnceIf(ctx, err, "heal-object-metadata-entries")
		return er.defaultHealResult(latestMeta, storageDisks, storageEndpoints, errs,
			bucket, object, versionID), err
	}

	partsMetadata = shufflePartsMetadata(partsMetadata, latestMeta.Erasure.Distribution)

	// Keep the original (pre-clean) metadata for the drives we will read
	// from, since it carries the per-part checksums needed by the
	// bitrot readers below.
	copyPartsMetadata := make([]FileInfo, len(partsMetadata))
	for i := range latestDisks {
		if latestDisks[i] == nil {
			continue
		}
		copyPartsMetadata[i] = partsMetadata[i]
	}

	for i := range outDatedDisks {
		if outDatedDisks[i] == nil {
			continue
		}
		// Make sure to write the FileInfo information
		// that is expected to be in quorum.
		partsMetadata[i] = cleanFileInfo(latestMeta)
	}

	// source data dir shall be empty in case of XLV1
	// differentiate it with dstDataDir for readability
	// srcDataDir is the one used with newBitrotReader()
	// to read existing content.
	srcDataDir := latestMeta.DataDir
	dstDataDir := latestMeta.DataDir
	if latestMeta.XLV1 {
		dstDataDir = migrateDataDir
	}

	var inlineBuffers []*bytes.Buffer
	if !latestMeta.Deleted && !latestMeta.IsRemote() {
		if latestMeta.InlineData() {
			// Inline objects are healed into in-memory buffers and
			// embedded into xl.meta instead of written as part files.
			inlineBuffers = make([]*bytes.Buffer, len(outDatedDisks))
		}

		erasureInfo := latestMeta.Erasure
		// Heal the object part by part.
		for partIndex := 0; partIndex < len(latestMeta.Parts); partIndex++ {
			partSize := latestMeta.Parts[partIndex].Size
			partActualSize := latestMeta.Parts[partIndex].ActualSize
			partModTime := latestMeta.Parts[partIndex].ModTime
			partNumber := latestMeta.Parts[partIndex].Number
			partIdx := latestMeta.Parts[partIndex].Index
			partChecksums := latestMeta.Parts[partIndex].Checksums
			tillOffset := erasure.ShardFileOffset(0, partSize, partSize)
			readers := make([]io.ReaderAt, len(latestDisks))
			prefer := make([]bool, len(latestDisks))
			checksumAlgo := erasureInfo.GetChecksumInfo(partNumber).Algorithm
			// Set up bitrot-verified readers on the drives holding
			// valid data for this part.
			for i, disk := range latestDisks {
				if disk == OfflineDisk {
					continue
				}
				checksumInfo := copyPartsMetadata[i].Erasure.GetChecksumInfo(partNumber)
				partPath := pathJoin(object, srcDataDir, fmt.Sprintf("part.%d", partNumber))
				readers[i] = newBitrotReader(disk, copyPartsMetadata[i].Data, bucket, partPath, tillOffset, checksumAlgo,
					checksumInfo.Hash, erasure.ShardSize())
				// Prefer local drives (empty hostname) as read sources.
				prefer[i] = disk.Hostname() == ""

			}
			// Set up bitrot writers on the outdated drives — either
			// in-memory (inline data) or under the tmp bucket.
			writers := make([]io.Writer, len(outDatedDisks))
			for i, disk := range outDatedDisks {
				if disk == OfflineDisk {
					continue
				}
				partPath := pathJoin(tmpID, dstDataDir, fmt.Sprintf("part.%d", partNumber))
				if len(inlineBuffers) > 0 {
					inlineBuffers[i] = bytes.NewBuffer(make([]byte, 0, erasure.ShardFileSize(latestMeta.Size)+32))
					writers[i] = newStreamingBitrotWriterBuffer(inlineBuffers[i], DefaultBitrotAlgorithm, erasure.ShardSize())
				} else {
					writers[i] = newBitrotWriter(disk, bucket, minioMetaTmpBucket, partPath,
						tillOffset, DefaultBitrotAlgorithm, erasure.ShardSize())
				}
			}

			// Heal each part. erasure.Heal() will write the healed
			// part to .minio/tmp/uuid/ which needs to be renamed
			// later to the final location.
			err = erasure.Heal(ctx, writers, readers, partSize, prefer)
			closeBitrotReaders(readers)
			closeBitrotWriters(writers)
			if err != nil {
				return result, err
			}

			// outDatedDisks that had write errors should not be
			// written to for remaining parts, so we nil it out.
			for i, disk := range outDatedDisks {
				if disk == OfflineDisk {
					continue
				}

				// A non-nil stale disk which did not receive
				// a healed part checksum had a write error.
				if writers[i] == nil {
					outDatedDisks[i] = nil
					disksToHealCount--
					continue
				}

				partsMetadata[i].DataDir = dstDataDir
				partsMetadata[i].AddObjectPart(partNumber, "", partSize, partActualSize, partModTime, partIdx, partChecksums)
				if len(inlineBuffers) > 0 && inlineBuffers[i] != nil {
					partsMetadata[i].Data = inlineBuffers[i].Bytes()
					partsMetadata[i].SetInlineData()
				} else {
					partsMetadata[i].Data = nil
				}
			}

			// If all disks are having errors, we give up.
			if disksToHealCount == 0 {
				return result, fmt.Errorf("all drives had write errors, unable to heal %s/%s", bucket, object)
			}

		}

	}

	// Clean up the temporary staging area regardless of outcome.
	defer er.deleteAll(context.Background(), minioMetaTmpBucket, tmpID)

	// Rename from tmp location to the actual location.
	for i, disk := range outDatedDisks {
		if disk == OfflineDisk {
			continue
		}

		// record the index of the updated disks
		partsMetadata[i].Erasure.Index = i + 1

		// Attempt a rename now from healed data to final location.
		partsMetadata[i].SetHealing()

		if _, err = disk.RenameData(ctx, minioMetaTmpBucket, tmpID, partsMetadata[i], bucket, object, RenameOptions{}); err != nil {
			return result, err
		}

		// - Remove any remaining parts from outdated disks from before transition.
		if partsMetadata[i].IsRemote() {
			rmDataDir := partsMetadata[i].DataDir
			disk.Delete(ctx, bucket, pathJoin(encodeDirObject(object), rmDataDir), DeleteOptions{
				Immediate: true,
				Recursive: true,
			})
		}

		// Flip the drive state to Ok in the After section for the
		// drive that was just healed (matched by endpoint).
		for i, v := range result.Before.Drives {
			if v.Endpoint == disk.String() {
				result.After.Drives[i].State = madmin.DriveStateOk
			}
		}
	}

	return result, nil
}
   610  
   611  // checkAbandonedParts will check if an object has abandoned parts,
   612  // meaning data-dirs or inlined data that are no longer referenced by the xl.meta
   613  // Errors are generally ignored by this function.
   614  func (er *erasureObjects) checkAbandonedParts(ctx context.Context, bucket string, object string, opts madmin.HealOpts) (err error) {
   615  	if !opts.Remove || opts.DryRun {
   616  		return nil
   617  	}
   618  	if globalTrace.NumSubscribers(madmin.TraceHealing) > 0 {
   619  		startTime := time.Now()
   620  		defer func() {
   621  			healTrace(healingMetricCheckAbandonedParts, startTime, bucket, object, nil, err, nil)
   622  		}()
   623  	}
   624  	if !opts.NoLock {
   625  		lk := er.NewNSLock(bucket, object)
   626  		lkctx, err := lk.GetLock(ctx, globalOperationTimeout)
   627  		if err != nil {
   628  			return err
   629  		}
   630  		ctx = lkctx.Context()
   631  		defer lk.Unlock(lkctx)
   632  	}
   633  	var wg sync.WaitGroup
   634  	for _, disk := range er.getDisks() {
   635  		if disk != nil {
   636  			wg.Add(1)
   637  			go func(disk StorageAPI) {
   638  				defer wg.Done()
   639  				_ = disk.CleanAbandonedData(ctx, bucket, object)
   640  			}(disk)
   641  		}
   642  	}
   643  	wg.Wait()
   644  	return nil
   645  }
   646  
// healObjectDir - heals object directory specifically, this special call
// is needed since we do not have a special backend format for directories.
//
// The flow is: stat the directory on every drive, detect whether it is
// dangling (and optionally purge it), record per-drive before/after states,
// then re-create the directory on drives where it was missing.
func (er *erasureObjects) healObjectDir(ctx context.Context, bucket, object string, dryRun bool, remove bool) (hr madmin.HealResultItem, err error) {
	storageDisks := er.getDisks()
	storageEndpoints := er.getEndpoints()

	// Initialize heal result object
	hr = madmin.HealResultItem{
		Type:         madmin.HealItemObject,
		Bucket:       bucket,
		Object:       object,
		DiskCount:    len(storageDisks),
		ParityBlocks: er.defaultParityCount,
		DataBlocks:   len(storageDisks) - er.defaultParityCount,
		ObjectSize:   0,
	}

	hr.Before.Drives = make([]madmin.HealDriveInfo, len(storageDisks))
	hr.After.Drives = make([]madmin.HealDriveInfo, len(storageDisks))

	// Stat the directory on every drive; errs is indexed by drive.
	errs := statAllDirs(ctx, storageDisks, bucket, object)
	danglingObject := isObjectDirDangling(errs)
	if danglingObject {
		if !dryRun && remove {
			var wg sync.WaitGroup
			// Remove versions in bulk for each disk
			for index, disk := range storageDisks {
				if disk == nil {
					continue
				}
				wg.Add(1)
				go func(index int, disk StorageAPI) {
					defer wg.Done()
					// Best-effort delete; per-drive errors ignored.
					_ = disk.Delete(ctx, bucket, object, DeleteOptions{
						Recursive: false,
						Immediate: false,
					})
				}(index, disk)
			}
			wg.Wait()
		}
	}

	// Prepare object creation in all disks
	// Record the observed per-drive state in both Before and After.
	for i, err := range errs {
		drive := storageEndpoints[i].String()
		switch err {
		case nil:
			hr.Before.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateOk}
			hr.After.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateOk}
		case errDiskNotFound:
			hr.Before.Drives[i] = madmin.HealDriveInfo{State: madmin.DriveStateOffline}
			hr.After.Drives[i] = madmin.HealDriveInfo{State: madmin.DriveStateOffline}
		case errVolumeNotFound, errFileNotFound:
			// Bucket or prefix/directory not found
			hr.Before.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateMissing}
			hr.After.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateMissing}
		default:
			hr.Before.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateCorrupt}
			hr.After.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateCorrupt}
		}
	}
	if danglingObject || isAllNotFound(errs) {
		// Nothing to do, file is already gone.
		return hr, errFileNotFound
	}

	if dryRun {
		// Quit without try to heal the object dir
		return hr, nil
	}

	// Re-create the directory on drives where it was missing and update
	// the After state according to the outcome.
	for i, err := range errs {
		if err == errVolumeNotFound || err == errFileNotFound {
			// Bucket or prefix/directory not found
			merr := storageDisks[i].MakeVol(ctx, pathJoin(bucket, object))
			switch merr {
			case nil, errVolumeExists:
				hr.After.Drives[i].State = madmin.DriveStateOk
			case errDiskNotFound:
				hr.After.Drives[i].State = madmin.DriveStateOffline
			default:
				hr.After.Drives[i].State = madmin.DriveStateCorrupt
			}
		}
	}
	return hr, nil
}
   735  
   736  // Populates default heal result item entries with possible values when we are returning prematurely.
   737  // This is to ensure that in any circumstance we are not returning empty arrays with wrong values.
   738  func (er *erasureObjects) defaultHealResult(lfi FileInfo, storageDisks []StorageAPI, storageEndpoints []Endpoint, errs []error, bucket, object, versionID string) madmin.HealResultItem {
   739  	// Initialize heal result object
   740  	result := madmin.HealResultItem{
   741  		Type:       madmin.HealItemObject,
   742  		Bucket:     bucket,
   743  		Object:     object,
   744  		ObjectSize: lfi.Size,
   745  		VersionID:  versionID,
   746  		DiskCount:  len(storageDisks),
   747  	}
   748  
   749  	if lfi.IsValid() {
   750  		result.ParityBlocks = lfi.Erasure.ParityBlocks
   751  	} else {
   752  		// Default to most common configuration for erasure blocks.
   753  		result.ParityBlocks = er.defaultParityCount
   754  	}
   755  	result.DataBlocks = len(storageDisks) - result.ParityBlocks
   756  
   757  	for index, disk := range storageDisks {
   758  		if disk == nil {
   759  			result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
   760  				UUID:     "",
   761  				Endpoint: storageEndpoints[index].String(),
   762  				State:    madmin.DriveStateOffline,
   763  			})
   764  			result.After.Drives = append(result.After.Drives, madmin.HealDriveInfo{
   765  				UUID:     "",
   766  				Endpoint: storageEndpoints[index].String(),
   767  				State:    madmin.DriveStateOffline,
   768  			})
   769  			continue
   770  		}
   771  		driveState := madmin.DriveStateCorrupt
   772  		switch errs[index] {
   773  		case errFileNotFound, errVolumeNotFound:
   774  			driveState = madmin.DriveStateMissing
   775  		case nil:
   776  			driveState = madmin.DriveStateOk
   777  		}
   778  		result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
   779  			UUID:     "",
   780  			Endpoint: storageEndpoints[index].String(),
   781  			State:    driveState,
   782  		})
   783  		result.After.Drives = append(result.After.Drives, madmin.HealDriveInfo{
   784  			UUID:     "",
   785  			Endpoint: storageEndpoints[index].String(),
   786  			State:    driveState,
   787  		})
   788  	}
   789  
   790  	return result
   791  }
   792  
   793  // Stat all directories.
   794  func statAllDirs(ctx context.Context, storageDisks []StorageAPI, bucket, prefix string) []error {
   795  	g := errgroup.WithNErrs(len(storageDisks))
   796  	for index, disk := range storageDisks {
   797  		if disk == nil {
   798  			continue
   799  		}
   800  		index := index
   801  		g.Go(func() error {
   802  			entries, err := storageDisks[index].ListDir(ctx, "", bucket, prefix, 1)
   803  			if err != nil {
   804  				return err
   805  			}
   806  			if len(entries) > 0 {
   807  				return errVolumeNotEmpty
   808  			}
   809  			return nil
   810  		}, index)
   811  	}
   812  
   813  	return g.Wait()
   814  }
   815  
   816  func isAllVolumeNotFound(errs []error) bool {
   817  	return countErrs(errs, errVolumeNotFound) == len(errs)
   818  }
   819  
   820  // isAllNotFound will return if any element of the error slice is not
   821  // errFileNotFound, errFileVersionNotFound or errVolumeNotFound.
   822  // A 0 length slice will always return false.
   823  func isAllNotFound(errs []error) bool {
   824  	for _, err := range errs {
   825  		if err != nil {
   826  			switch err.Error() {
   827  			case errFileNotFound.Error():
   828  				fallthrough
   829  			case errVolumeNotFound.Error():
   830  				fallthrough
   831  			case errFileVersionNotFound.Error():
   832  				continue
   833  			}
   834  		}
   835  		return false
   836  	}
   837  	return len(errs) > 0
   838  }
   839  
   840  // isAllBucketsNotFound will return true if all the errors are either errFileNotFound
   841  // or errFileCorrupt
   842  // A 0 length slice will always return false.
   843  func isAllBucketsNotFound(errs []error) bool {
   844  	if len(errs) == 0 {
   845  		return false
   846  	}
   847  	notFoundCount := 0
   848  	for _, err := range errs {
   849  		if err != nil {
   850  			if errors.Is(err, errVolumeNotFound) {
   851  				notFoundCount++
   852  			} else if isErrBucketNotFound(err) {
   853  				notFoundCount++
   854  			}
   855  		}
   856  	}
   857  	return len(errs) == notFoundCount
   858  }
   859  
   860  // ObjectDir is considered dangling/corrupted if any only
   861  // if total disks - a combination of corrupted and missing
   862  // files is lesser than N/2+1 number of disks.
   863  // If no files were found false will be returned.
   864  func isObjectDirDangling(errs []error) (ok bool) {
   865  	var found int
   866  	var notFound int
   867  	var foundNotEmpty int
   868  	var otherFound int
   869  	for _, readErr := range errs {
   870  		switch {
   871  		case readErr == nil:
   872  			found++
   873  		case readErr == errFileNotFound || readErr == errVolumeNotFound:
   874  			notFound++
   875  		case readErr == errVolumeNotEmpty:
   876  			foundNotEmpty++
   877  		default:
   878  			otherFound++
   879  		}
   880  	}
   881  	found = found + foundNotEmpty + otherFound
   882  	return found < notFound && found > 0
   883  }
   884  
// Object is considered dangling/corrupted if and only
// if total disks - a combination of corrupted and missing
// files is lesser than number of data blocks.
//
// metaArr/errs are the per-disk xl.meta read results; dataErrs are the
// per-disk data-dir/part check results. Returns the first valid
// FileInfo found (zero value if none) and whether the object should be
// treated as dangling. The thresholds deliberately err on the side of
// NOT declaring an object dangling unless enough drives agree.
func isObjectDangling(metaArr []FileInfo, errs []error, dataErrs []error) (validMeta FileInfo, ok bool) {
	// We can consider an object data not reliable
	// when xl.meta is not found in read quorum disks.
	// or when xl.meta is not readable in read quorum disks.
	// danglingErrsCount splits a per-disk error slice into
	// "definitively missing" vs. "anything else non-nil".
	danglingErrsCount := func(cerrs []error) (int, int) {
		var (
			notFoundCount      int
			nonActionableCount int
		)
		for _, readErr := range cerrs {
			if readErr == nil {
				continue
			}
			switch {
			case errors.Is(readErr, errFileNotFound) || errors.Is(readErr, errFileVersionNotFound):
				notFoundCount++
			default:
				// All other errors are non-actionable
				nonActionableCount++
			}
		}
		return notFoundCount, nonActionableCount
	}

	notFoundMetaErrs, nonActionableMetaErrs := danglingErrsCount(errs)
	notFoundPartsErrs, nonActionablePartsErrs := danglingErrsCount(dataErrs)

	// Pick the first readable xl.meta as the reference metadata.
	for _, m := range metaArr {
		if m.IsValid() {
			validMeta = m
			break
		}
	}

	if !validMeta.IsValid() {
		// validMeta is invalid because all xl.meta is missing apparently
		// we should figure out if dataDirs are also missing > dataBlocks.
		// With no FileInfo we cannot know the real EC config, so assume
		// the conservative N/2 split for data blocks.
		dataBlocks := (len(dataErrs) + 1) / 2
		if notFoundPartsErrs > dataBlocks {
			// Not using parity to ensure that we do not delete
			// any valid content, if any is recoverable. But if
			// notFoundDataDirs are already greater than the data
			// blocks all bets are off and it is safe to purge.
			//
			// This is purely a defensive code, ideally parityBlocks
			// is sufficient, however we can't know that since we
			// do have the FileInfo{}.
			return validMeta, true
		}

		// We have no idea what this file is, leave it as is.
		return validMeta, false
	}

	// Any ambiguous (non-not-found) error means we cannot safely judge
	// the object — bail out rather than risk purging live data.
	if nonActionableMetaErrs > 0 || nonActionablePartsErrs > 0 {
		return validMeta, false
	}

	if validMeta.Deleted {
		// notFoundPartsErrs is ignored since
		// - delete marker does not have any parts
		dataBlocks := (len(errs) + 1) / 2
		return validMeta, notFoundMetaErrs > dataBlocks
	}

	// TODO: It is possible to replay the object via just single
	// xl.meta file, considering quorum number of data-dirs are still
	// present on other drives.
	//
	// However this requires a bit of a rewrite, leave this up for
	// future work.
	if notFoundMetaErrs > 0 && notFoundMetaErrs > validMeta.Erasure.ParityBlocks {
		// All xl.meta is beyond data blocks missing, this is dangling
		return validMeta, true
	}

	// Remote (transitioned) objects hold no local parts, so part
	// errors are only meaningful for local objects.
	if !validMeta.IsRemote() && notFoundPartsErrs > 0 && notFoundPartsErrs > validMeta.Erasure.ParityBlocks {
		// All data-dir is beyond data blocks missing, this is dangling
		return validMeta, true
	}

	return validMeta, false
}
   971  
   972  // HealObject - heal the given object, automatically deletes the object if stale/corrupted if `remove` is true.
   973  func (er erasureObjects) HealObject(ctx context.Context, bucket, object, versionID string, opts madmin.HealOpts) (hr madmin.HealResultItem, err error) {
   974  	// Create context that also contains information about the object and bucket.
   975  	// The top level handler might not have this information.
   976  	reqInfo := logger.GetReqInfo(ctx)
   977  	var newReqInfo *logger.ReqInfo
   978  	if reqInfo != nil {
   979  		newReqInfo = logger.NewReqInfo(reqInfo.RemoteHost, reqInfo.UserAgent, reqInfo.DeploymentID, reqInfo.RequestID, reqInfo.API, bucket, object)
   980  	} else {
   981  		newReqInfo = logger.NewReqInfo("", "", globalDeploymentID(), "", "Heal", bucket, object)
   982  	}
   983  	healCtx := logger.SetReqInfo(GlobalContext, newReqInfo)
   984  
   985  	// Healing directories handle it separately.
   986  	if HasSuffix(object, SlashSeparator) {
   987  		hr, err := er.healObjectDir(healCtx, bucket, object, opts.DryRun, opts.Remove)
   988  		return hr, toObjectErr(err, bucket, object)
   989  	}
   990  
   991  	storageDisks := er.getDisks()
   992  	storageEndpoints := er.getEndpoints()
   993  
   994  	// When versionID is empty, we read directly from the `null` versionID for healing.
   995  	if versionID == "" {
   996  		versionID = nullVersionID
   997  	}
   998  
   999  	// Perform quick read without lock.
  1000  	// This allows to quickly check if all is ok or all are missing.
  1001  	_, errs := readAllFileInfo(healCtx, storageDisks, "", bucket, object, versionID, false, false)
  1002  	if isAllNotFound(errs) {
  1003  		err := errFileNotFound
  1004  		if versionID != "" {
  1005  			err = errFileVersionNotFound
  1006  		}
  1007  		// Nothing to do, file is already gone.
  1008  		return er.defaultHealResult(FileInfo{}, storageDisks, storageEndpoints,
  1009  			errs, bucket, object, versionID), toObjectErr(err, bucket, object, versionID)
  1010  	}
  1011  
  1012  	// Heal the object.
  1013  	hr, err = er.healObject(healCtx, bucket, object, versionID, opts)
  1014  	if errors.Is(err, errFileCorrupt) && opts.ScanMode != madmin.HealDeepScan {
  1015  		// Instead of returning an error when a bitrot error is detected
  1016  		// during a normal heal scan, heal again with bitrot flag enabled.
  1017  		opts.ScanMode = madmin.HealDeepScan
  1018  		hr, err = er.healObject(healCtx, bucket, object, versionID, opts)
  1019  	}
  1020  	return hr, toObjectErr(err, bucket, object, versionID)
  1021  }
  1022  
  1023  // healTrace sends healing results to trace output.
  1024  func healTrace(funcName healingMetric, startTime time.Time, bucket, object string, opts *madmin.HealOpts, err error, result *madmin.HealResultItem) {
  1025  	tr := madmin.TraceInfo{
  1026  		TraceType: madmin.TraceHealing,
  1027  		Time:      startTime,
  1028  		NodeName:  globalLocalNodeName,
  1029  		FuncName:  "heal." + funcName.String(),
  1030  		Duration:  time.Since(startTime),
  1031  		Path:      pathJoin(bucket, decodeDirObject(object)),
  1032  	}
  1033  	if opts != nil {
  1034  		tr.Custom = map[string]string{
  1035  			"dry":    fmt.Sprint(opts.DryRun),
  1036  			"remove": fmt.Sprint(opts.Remove),
  1037  			"mode":   fmt.Sprint(opts.ScanMode),
  1038  		}
  1039  		if result != nil {
  1040  			tr.Custom["version-id"] = result.VersionID
  1041  			tr.Custom["disks"] = strconv.Itoa(result.DiskCount)
  1042  		}
  1043  	}
  1044  	if err != nil {
  1045  		tr.Error = err.Error()
  1046  	} else {
  1047  		tr.HealResult = result
  1048  	}
  1049  	globalTrace.Publish(tr)
  1050  }