storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/cmd/erasure-healing.go

/*
 * MinIO Cloud Storage, (C) 2016-2020 MinIO, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cmd

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"sync"
	"time"

	"storj.io/minio/cmd/logger"
	"storj.io/minio/pkg/bucket/lifecycle"
	"storj.io/minio/pkg/madmin"
	"storj.io/minio/pkg/sync/errgroup"
)

// HealBucket heals a bucket if it doesn't exist on one of the disks, and
// additionally heals any missing entries for the bucket metadata files
// `policy.json, notification.xml, listeners.json`.
func (er erasureObjects) HealBucket(ctx context.Context, bucket string, opts madmin.HealOpts) (
	result madmin.HealResultItem, err error) {
	if !opts.DryRun {
		defer ObjectPathUpdated(bucket)
	}

	storageDisks := er.getDisks()
	storageEndpoints := er.getEndpoints()

	// Get write quorum for an object.
	writeQuorum := len(storageDisks) - er.defaultParityCount
	if writeQuorum == er.defaultParityCount {
		writeQuorum++
	}

	// Heal bucket.
	return healBucket(ctx, storageDisks, storageEndpoints, bucket, writeQuorum, opts)
}

// healBucket - creates the bucket on disks where it does not exist.
func healBucket(ctx context.Context, storageDisks []StorageAPI, storageEndpoints []string, bucket string, writeQuorum int,
	opts madmin.HealOpts) (res madmin.HealResultItem, err error) {

	// Initialize sync waitgroup.
	g := errgroup.WithNErrs(len(storageDisks))

	// Disk state slices
	beforeState := make([]string, len(storageDisks))
	afterState := make([]string, len(storageDisks))

	// Make a volume entry on all underlying storage disks.
	for index := range storageDisks {
		index := index
		g.Go(func() error {
			if storageDisks[index] == nil {
				beforeState[index] = madmin.DriveStateOffline
				afterState[index] = madmin.DriveStateOffline
				return errDiskNotFound
			}
			if _, serr := storageDisks[index].StatVol(ctx, bucket); serr != nil {
				if serr == errDiskNotFound {
					beforeState[index] = madmin.DriveStateOffline
					afterState[index] = madmin.DriveStateOffline
					return serr
				}
				if serr != errVolumeNotFound {
					beforeState[index] = madmin.DriveStateCorrupt
					afterState[index] = madmin.DriveStateCorrupt
					return serr
				}

				beforeState[index] = madmin.DriveStateMissing
				afterState[index] = madmin.DriveStateMissing

				// mutate only if not a dry-run
				if opts.DryRun {
					return nil
				}

				return serr
			}
			beforeState[index] = madmin.DriveStateOk
			afterState[index] = madmin.DriveStateOk
			return nil
		}, index)
	}

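	// Wait for all disks to report their state.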
	errs := g.Wait()

	// Initialize heal result info
	res = madmin.HealResultItem{
		Type:         madmin.HealItemBucket,
		Bucket:       bucket,
		DiskCount:    len(storageDisks),
		ParityBlocks: len(storageDisks) / 2,
		DataBlocks:   len(storageDisks) / 2,
	}

	for i := range beforeState {
		res.Before.Drives = append(res.Before.Drives, madmin.HealDriveInfo{
			UUID:     "",
			Endpoint: storageEndpoints[i],
			State:    beforeState[i],
		})
	}

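	// When a quorum of disks report the bucket as missing and the caller did
	// not ask to recreate it, return early without creating anything.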
	reducedErr := reduceWriteQuorumErrs(ctx, errs, bucketOpIgnoredErrs, writeQuorum-1)
	if errors.Is(reducedErr, errVolumeNotFound) && !opts.Recreate {
		for i := range beforeState {
			res.After.Drives = append(res.After.Drives, madmin.HealDriveInfo{
				UUID:     "",
				Endpoint: storageEndpoints[i],
				State:    madmin.DriveStateOk,
			})
		}
		return res, nil
	}

	// Initialize sync waitgroup.
	g = errgroup.WithNErrs(len(storageDisks))

	// Make a volume entry on all underlying storage disks.
	for index := range storageDisks {
		index := index
		g.Go(func() error {
			if beforeState[index] == madmin.DriveStateMissing {
				makeErr := storageDisks[index].MakeVol(ctx, bucket)
				if makeErr == nil {
					afterState[index] = madmin.DriveStateOk
				}
				return makeErr
			}
			return errs[index]
		}, index)
	}

	errs = g.Wait()

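	// Check whether the bucket now exists on at least a write quorum of disks.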
	reducedErr = reduceWriteQuorumErrs(ctx, errs, bucketOpIgnoredErrs, writeQuorum)
	if reducedErr != nil {
		return res, reducedErr
	}

	for i := range afterState {
		res.After.Drives = append(res.After.Drives, madmin.HealDriveInfo{
			UUID:     "",
			Endpoint: storageEndpoints[i],
			State:    afterState[i],
		})
	}
	return res, nil
}

// listAllBuckets lists all buckets from all disks and records
// each bucket it finds into the healBuckets map.
func listAllBuckets(ctx context.Context, storageDisks []StorageAPI, healBuckets map[string]VolInfo) error {
	g := errgroup.WithNErrs(len(storageDisks))
	var mu sync.Mutex
	for index := range storageDisks {
		index := index
		g.Go(func() error {
			if storageDisks[index] == nil {
				// we ignore disk not found errors
				return nil
			}
			volsInfo, err := storageDisks[index].ListVols(ctx)
			if err != nil {
				return err
			}
			for _, volInfo := range volsInfo {
				// StorageAPI can send volume names which are
				// incompatible with buckets - these are
				// skipped, like the meta-bucket.
				if isReservedOrInvalidBucket(volInfo.Name, false) {
					continue
				}
				mu.Lock()
				if _, ok := healBuckets[volInfo.Name]; !ok {
					healBuckets[volInfo.Name] = volInfo
				}
				mu.Unlock()
			}
			return nil
		}, index)
	}
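	// Half of the disks responding without error is sufficient for a consistent listing.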
	return reduceReadQuorumErrs(ctx, g.Wait(), bucketMetadataOpIgnoredErrs, len(storageDisks)/2)
}

// shouldHealObjectOnDisk returns true only for disks where we are sure that healing is needed.
// The set of errors handled here can be expanded as and when more errors are known to be safe to heal.
func shouldHealObjectOnDisk(erErr, dataErr error, meta FileInfo, quorumModTime time.Time) bool {
	switch {
	case errors.Is(erErr, errFileNotFound) || errors.Is(erErr, errFileVersionNotFound):
		return true
	case errors.Is(erErr, errCorruptedFormat):
		return true
	}
	if erErr == nil {
		// xl.meta was read fine, but there may still be a problem with the part.N files.
		if IsErr(dataErr, []error{
			errFileNotFound,
			errFileVersionNotFound,
			errFileCorrupt,
		}...) {
			return true
		}
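		// A modtime different from the quorum modtime means this disk holds a stale copy.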
		if !quorumModTime.Equal(meta.ModTime) {
			return true
		}
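		// Objects still in the legacy XL v1 format are healed so they get migrated to the current format.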
		if meta.XLV1 {
			return true
		}
	}
	return false
}

// healObject heals an object by re-writing corrupt/missing erasure blocks.
func (er erasureObjects) healObject(ctx context.Context, bucket string, object string, versionID string, opts madmin.HealOpts) (result madmin.HealResultItem, err error) {

	dryRun := opts.DryRun
	scanMode := opts.ScanMode

	storageDisks := er.getDisks()
	storageEndpoints := er.getEndpoints()

	// Initialize heal result object
	result = madmin.HealResultItem{
		Type:         madmin.HealItemObject,
		Bucket:       bucket,
		Object:       object,
		DiskCount:    len(storageDisks),
		ParityBlocks: er.defaultParityCount,
		DataBlocks:   len(storageDisks) - er.defaultParityCount,
	}

	lk := er.NewNSLock(bucket, object)
	if ctx, err = lk.GetLock(ctx, globalOperationTimeout); err != nil {
		return result, err
	}
	defer lk.Unlock()

	// Re-read when we have lock...
	partsMetadata, errs := readAllFileInfo(ctx, storageDisks, bucket, object, versionID, true)

	// List of disks having latest version of the object er.meta
	// (by modtime).
	latestDisks, modTime, dataDir := listOnlineDisks(storageDisks, partsMetadata, errs)

	// List of disks having all parts as per latest er.meta.
	availableDisks, dataErrs := disksWithAllParts(ctx, latestDisks, partsMetadata, errs, bucket, object, scanMode)

	// Loop to find number of disks with valid data, per-drive
	// data state and a list of outdated disks on which data needs
	// to be healed.
	outDatedDisks := make([]StorageAPI, len(storageDisks))
	numAvailableDisks := 0
	disksToHealCount := 0
	for i, v := range availableDisks {
		driveState := ""
		switch {
		case v != nil:
			driveState = madmin.DriveStateOk
			numAvailableDisks++
			// If data is sane on any one disk, we can
			// extract the correct object size.
			result.ObjectSize = partsMetadata[i].Size
			if partsMetadata[i].Erasure.ParityBlocks > 0 && partsMetadata[i].Erasure.DataBlocks > 0 {
				result.ParityBlocks = partsMetadata[i].Erasure.ParityBlocks
				result.DataBlocks = partsMetadata[i].Erasure.DataBlocks
			}
		case errs[i] == errDiskNotFound, dataErrs[i] == errDiskNotFound:
			driveState = madmin.DriveStateOffline
		case errs[i] == errFileNotFound, errs[i] == errFileVersionNotFound, errs[i] == errVolumeNotFound:
			fallthrough
		case dataErrs[i] == errFileNotFound, dataErrs[i] == errFileVersionNotFound, dataErrs[i] == errVolumeNotFound:
			driveState = madmin.DriveStateMissing
		default:
			// all remaining cases imply corrupt data/metadata
			driveState = madmin.DriveStateCorrupt
		}

		if shouldHealObjectOnDisk(errs[i], dataErrs[i], partsMetadata[i], modTime) {
			outDatedDisks[i] = storageDisks[i]
			disksToHealCount++
			result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
				UUID:     "",
				Endpoint: storageEndpoints[i],
				State:    driveState,
			})
			result.After.Drives = append(result.After.Drives, madmin.HealDriveInfo{
				UUID:     "",
				Endpoint: storageEndpoints[i],
				State:    driveState,
			})
			continue
		}
		result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
			UUID:     "",
			Endpoint: storageEndpoints[i],
			State:    driveState,
		})
		result.After.Drives = append(result.After.Drives, madmin.HealDriveInfo{
			UUID:     "",
			Endpoint: storageEndpoints[i],
			State:    driveState,
		})
	}

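	// The object/version was not found on any disk; there is nothing left to heal.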
	if isAllNotFound(errs) {
		err = toObjectErr(errFileNotFound, bucket, object)
		if versionID != "" {
			err = toObjectErr(errFileVersionNotFound, bucket, object, versionID)
		}
		// File is fully gone, fileInfo is empty.
		return defaultHealResult(FileInfo{}, storageDisks, storageEndpoints, errs, bucket, object, versionID, er.defaultParityCount), err
	}

	// If fewer than a read quorum of disks have all the parts
	// of the data, we can't reconstruct the erasure-coded data.
	if numAvailableDisks < result.DataBlocks {
		return er.purgeObjectDangling(ctx, bucket, object, versionID, partsMetadata, errs, dataErrs, opts)
	}

	if disksToHealCount == 0 {
		// Nothing to heal!
		return result, nil
	}

	// After this point we only have to repair data on disk, so
	// return if it is a dry-run
	if dryRun {
		return result, nil
	}

	// Latest FileInfo for reference. If valid metadata is not
	// present, it is as good as the object not being found.
	latestMeta, err := pickValidFileInfo(ctx, partsMetadata, modTime, dataDir, result.DataBlocks)
	if err != nil {
		return result, toObjectErr(err, bucket, object, versionID)
	}
	defer ObjectPathUpdated(pathJoin(bucket, object))

	cleanFileInfo := func(fi FileInfo) FileInfo {
		// Returns a copy of the 'fi' with checksums and parts nil'ed.
		nfi := fi
		nfi.Erasure.Index = 0
		nfi.Erasure.Checksums = nil
		nfi.Parts = nil
		return nfi
	}

	// We write at temporary location and then rename to final location.
	tmpID := mustGetUUID()
	migrateDataDir := mustGetUUID()

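	// For each outdated disk, keep a copy of its current metadata and reset its
	// working metadata to a cleaned copy of the latest version.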
	copyPartsMetadata := make([]FileInfo, len(partsMetadata))
	for i := range outDatedDisks {
		if outDatedDisks[i] == nil {
			continue
		}
		copyPartsMetadata[i] = partsMetadata[i]
		partsMetadata[i] = cleanFileInfo(latestMeta)
	}

	// Source data dir shall be empty in case of XLV1;
	// differentiate it from dstDataDir for readability.
	// srcDataDir is the one used with newBitrotReader()
	// to read existing content.
	srcDataDir := latestMeta.DataDir
	dstDataDir := latestMeta.DataDir
	if latestMeta.XLV1 {
		dstDataDir = migrateDataDir
	}

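	// Small single-part objects below the small-file threshold are healed into
	// in-memory buffers and stored inline with the metadata instead of as separate part files.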
	var inlineBuffers []*bytes.Buffer
	if len(latestMeta.Parts) <= 1 && latestMeta.Size < smallFileThreshold {
		inlineBuffers = make([]*bytes.Buffer, len(outDatedDisks))
	}

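	// Re-encode part data unless this version is a delete marker whose lifecycle
	// transition has already completed; such a version carries no parts to heal.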
	if !latestMeta.Deleted || latestMeta.TransitionStatus != lifecycle.TransitionComplete {
		result.DataBlocks = latestMeta.Erasure.DataBlocks
		result.ParityBlocks = latestMeta.Erasure.ParityBlocks

		// Reorder so that we have data disks first and parity disks next.
		latestDisks = shuffleDisks(availableDisks, latestMeta.Erasure.Distribution)
		outDatedDisks = shuffleDisks(outDatedDisks, latestMeta.Erasure.Distribution)
		partsMetadata = shufflePartsMetadata(partsMetadata, latestMeta.Erasure.Distribution)
		copyPartsMetadata = shufflePartsMetadata(copyPartsMetadata, latestMeta.Erasure.Distribution)

		// Heal each part. erasure.Heal() will write the healed
		// part to .minio/tmp/uuid/ which needs to be renamed later to
		// the final location.
		erasure, err := NewErasure(ctx, latestMeta.Erasure.DataBlocks,
			latestMeta.Erasure.ParityBlocks, latestMeta.Erasure.BlockSize)
		if err != nil {
			return result, toObjectErr(err, bucket, object)
		}

		erasureInfo := latestMeta.Erasure

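		// Heal one part at a time: read the surviving shards from the online disks
		// and reconstruct the missing shards onto the outdated disks.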
		for partIndex := 0; partIndex < len(latestMeta.Parts); partIndex++ {
			partSize := latestMeta.Parts[partIndex].Size
			partActualSize := latestMeta.Parts[partIndex].ActualSize
			partNumber := latestMeta.Parts[partIndex].Number
			tillOffset := erasure.ShardFileOffset(0, partSize, partSize)
			readers := make([]io.ReaderAt, len(latestDisks))
			checksumAlgo := erasureInfo.GetChecksumInfo(partNumber).Algorithm
			for i, disk := range latestDisks {
				if disk == OfflineDisk {
					continue
				}
				checksumInfo := copyPartsMetadata[i].Erasure.GetChecksumInfo(partNumber)
				partPath := pathJoin(object, srcDataDir, fmt.Sprintf("part.%d", partNumber))
				readers[i] = newBitrotReader(disk, partsMetadata[i].Data, bucket, partPath, tillOffset, checksumAlgo, checksumInfo.Hash, erasure.ShardSize())
			}
			writers := make([]io.Writer, len(outDatedDisks))
			for i, disk := range outDatedDisks {
				if disk == OfflineDisk {
					continue
				}
				partPath := pathJoin(tmpID, dstDataDir, fmt.Sprintf("part.%d", partNumber))
				if len(inlineBuffers) > 0 {
					inlineBuffers[i] = bytes.NewBuffer(make([]byte, 0, erasure.ShardFileSize(latestMeta.Size)))
					writers[i] = newStreamingBitrotWriterBuffer(inlineBuffers[i], DefaultBitrotAlgorithm, erasure.ShardSize())
				} else {
					writers[i] = newBitrotWriter(disk, minioMetaTmpBucket, partPath,
						tillOffset, DefaultBitrotAlgorithm, erasure.ShardSize(), true)
				}
			}
			err = erasure.Heal(ctx, readers, writers, partSize)
			closeBitrotReaders(readers)
			closeBitrotWriters(writers)
			if err != nil {
				return result, toObjectErr(err, bucket, object)
			}
			// outDatedDisks that had write errors should not be
			// written to for remaining parts, so we nil them out.
			for i, disk := range outDatedDisks {
				if disk == OfflineDisk {
					continue
				}

				// A non-nil stale disk which did not receive
				// a healed part checksum had a write error.
				if writers[i] == nil {
					outDatedDisks[i] = nil
					disksToHealCount--
					continue
				}

				partsMetadata[i].DataDir = dstDataDir
				partsMetadata[i].AddObjectPart(partNumber, "", partSize, partActualSize)
				partsMetadata[i].Erasure.AddChecksumInfo(ChecksumInfo{
					PartNumber: partNumber,
					Algorithm:  checksumAlgo,
					Hash:       bitrotWriterSum(writers[i]),
				})
				if len(inlineBuffers) > 0 && inlineBuffers[i] != nil {
					partsMetadata[i].Data = inlineBuffers[i].Bytes()
				} else {
					partsMetadata[i].Data = nil
				}
			}

			// If all disks have errors, we give up.
			if disksToHealCount == 0 {
				return result, fmt.Errorf("all disks had write errors, unable to heal")
			}
		}
	}

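	// Clean up the temporary healing workspace once the renames below have completed.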
	defer er.deleteObject(context.Background(), minioMetaTmpBucket, tmpID, len(storageDisks)/2+1)

	// Rename from tmp location to the actual location.
	for i, disk := range outDatedDisks {
		if disk == OfflineDisk {
			continue
		}

		// record the index of the updated disks
		partsMetadata[i].Erasure.Index = i + 1

		// Attempt a rename now from healed data to final location.
		if err = disk.RenameData(ctx, minioMetaTmpBucket, tmpID, partsMetadata[i], bucket, object); err != nil {
			logger.LogIf(ctx, err)
			return result, toObjectErr(err, bucket, object)
		}

		for i, v := range result.Before.Drives {
			if v.Endpoint == disk.String() {
				result.After.Drives[i].State = madmin.DriveStateOk
			}
		}
	}

	// Set the size of the object in the heal result
	result.ObjectSize = latestMeta.Size

	return result, nil
}

// healObjectDir - heals an object directory specifically; this special call
// is needed since we do not have a special backend format for directories.
func (er erasureObjects) healObjectDir(ctx context.Context, bucket, object string, dryRun bool, remove bool) (hr madmin.HealResultItem, err error) {
	storageDisks := er.getDisks()
	storageEndpoints := er.getEndpoints()

	// Initialize heal result object
	hr = madmin.HealResultItem{
		Type:         madmin.HealItemObject,
		Bucket:       bucket,
		Object:       object,
		DiskCount:    len(storageDisks),
		ParityBlocks: er.defaultParityCount,
		DataBlocks:   len(storageDisks) - er.defaultParityCount,
		ObjectSize:   0,
	}

	hr.Before.Drives = make([]madmin.HealDriveInfo, len(storageDisks))
	hr.After.Drives = make([]madmin.HealDriveInfo, len(storageDisks))

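	// Probe the object directory on every disk; the per-disk errors drive both
	// the dangling check and the drive states reported below.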
	errs := statAllDirs(ctx, storageDisks, bucket, object)
	danglingObject := isObjectDirDangling(errs)
	if danglingObject {
		if !dryRun && remove {
			var wg sync.WaitGroup
			// Remove versions in bulk for each disk
			for index, disk := range storageDisks {
				if disk == nil {
					continue
				}
				wg.Add(1)
				go func(index int, disk StorageAPI) {
					defer wg.Done()
					_ = disk.Delete(ctx, bucket, object, false)
				}(index, disk)
			}
			wg.Wait()
			ObjectPathUpdated(pathJoin(bucket, object))
		}
	}

	// Prepare object creation in all disks
	for i, err := range errs {
		drive := storageEndpoints[i]
		switch err {
		case nil:
			hr.Before.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateOk}
			hr.After.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateOk}
		case errDiskNotFound:
			hr.Before.Drives[i] = madmin.HealDriveInfo{State: madmin.DriveStateOffline}
			hr.After.Drives[i] = madmin.HealDriveInfo{State: madmin.DriveStateOffline}
		case errVolumeNotFound, errFileNotFound:
			// Bucket or prefix/directory not found
			hr.Before.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateMissing}
			hr.After.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateMissing}
		default:
			hr.Before.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateCorrupt}
			hr.After.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateCorrupt}
		}
	}
	if dryRun || danglingObject || isAllNotFound(errs) {
		// Nothing to do, file is already gone.
		return hr, toObjectErr(errFileNotFound, bucket, object)
	}
	for i, err := range errs {
		if err == errVolumeNotFound || err == errFileNotFound {
			// Bucket or prefix/directory not found
			merr := storageDisks[i].MakeVol(ctx, pathJoin(bucket, object))
			switch merr {
			case nil, errVolumeExists:
				hr.After.Drives[i].State = madmin.DriveStateOk
			case errDiskNotFound:
				hr.After.Drives[i].State = madmin.DriveStateOffline
			default:
				logger.LogIf(ctx, merr)
				hr.After.Drives[i].State = madmin.DriveStateCorrupt
			}
		}
	}
	return hr, nil
}

// defaultHealResult populates the heal result item entries with sensible default values when we are returning prematurely.
// This is to ensure that in any circumstance we are not returning empty drive arrays with wrong values.
func defaultHealResult(lfi FileInfo, storageDisks []StorageAPI, storageEndpoints []string, errs []error, bucket, object, versionID string, defaultParityCount int) madmin.HealResultItem {
	// Initialize heal result object
	result := madmin.HealResultItem{
		Type:      madmin.HealItemObject,
		Bucket:    bucket,
		Object:    object,
		VersionID: versionID,
		DiskCount: len(storageDisks),
	}
	if lfi.IsValid() {
		result.ObjectSize = lfi.Size
	}

	for index, disk := range storageDisks {
		if disk == nil {
			result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
				UUID:     "",
				Endpoint: storageEndpoints[index],
				State:    madmin.DriveStateOffline,
			})
			result.After.Drives = append(result.After.Drives, madmin.HealDriveInfo{
				UUID:     "",
				Endpoint: storageEndpoints[index],
				State:    madmin.DriveStateOffline,
			})
			continue
		}
		driveState := madmin.DriveStateCorrupt
		switch errs[index] {
		case errFileNotFound, errVolumeNotFound:
			driveState = madmin.DriveStateMissing
		}
		result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
			UUID:     "",
			Endpoint: storageEndpoints[index],
			State:    driveState,
		})
		result.After.Drives = append(result.After.Drives, madmin.HealDriveInfo{
			UUID:     "",
			Endpoint: storageEndpoints[index],
			State:    driveState,
		})
	}

	if !lfi.IsValid() {
		// Default to most common configuration for erasure blocks.
		result.ParityBlocks = defaultParityCount
		result.DataBlocks = len(storageDisks) - defaultParityCount
	} else {
		result.ParityBlocks = lfi.Erasure.ParityBlocks
		result.DataBlocks = lfi.Erasure.DataBlocks
	}

	return result
}

// statAllDirs checks the given prefix on all disks; a disk reports errVolumeNotEmpty when the directory has entries.
func statAllDirs(ctx context.Context, storageDisks []StorageAPI, bucket, prefix string) []error {
	g := errgroup.WithNErrs(len(storageDisks))
	for index, disk := range storageDisks {
		if disk == nil {
			continue
		}
		index := index
		g.Go(func() error {
			entries, err := storageDisks[index].ListDir(ctx, bucket, prefix, 1)
			if err != nil {
				return err
			}
			if len(entries) > 0 {
				return errVolumeNotEmpty
			}
			return nil
		}, index)
	}

	return g.Wait()
}

// isAllNotFound returns true only when every element of the error slice is
// errFileNotFound, errFileVersionNotFound or errVolumeNotFound.
// A 0 length slice will always return false.
func isAllNotFound(errs []error) bool {
	for _, err := range errs {
		if errors.Is(err, errFileNotFound) || errors.Is(err, errVolumeNotFound) || errors.Is(err, errFileVersionNotFound) {
			continue
		}
		return false
	}
	return len(errs) > 0
}

// isObjectDirDangling - an object directory is considered dangling/corrupted
// if and only if it was found on fewer disks than it is missing from,
// and it was found on at least one disk.
// If no files were found, false will be returned.
func isObjectDirDangling(errs []error) (ok bool) {
	var found int
	var notFound int
	var foundNotEmpty int
	var otherFound int
	for _, readErr := range errs {
		if readErr == nil {
			found++
		} else if readErr == errFileNotFound || readErr == errVolumeNotFound {
			notFound++
		} else if readErr == errVolumeNotEmpty {
			foundNotEmpty++
		} else {
			otherFound++
		}
	}
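	// Disks that returned unexpected errors or a non-empty listing still count as having the directory present.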
	found = found + foundNotEmpty + otherFound
	return found < notFound && found > 0
}

func (er erasureObjects) purgeObjectDangling(ctx context.Context, bucket, object, versionID string,
	metaArr []FileInfo, errs []error, dataErrs []error, opts madmin.HealOpts) (madmin.HealResultItem, error) {

	storageDisks := er.getDisks()
	storageEndpoints := er.getEndpoints()

	// Check if the object is dangling; if so and the user requested
	// removal, we simply delete it from the namespace.
	m, ok := isObjectDangling(metaArr, errs, dataErrs)
	if ok {
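		// Derive the delete write quorum from the object's own data-block count,
		// bumping it by one when the data and parity counts are equal (or unknown).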
		writeQuorum := m.Erasure.DataBlocks
		if m.Erasure.DataBlocks == 0 || m.Erasure.DataBlocks == m.Erasure.ParityBlocks {
			writeQuorum++
		}
		var err error
		var returnNotFound bool
		if !opts.DryRun && opts.Remove {
			if versionID == "" {
				err = er.deleteObject(ctx, bucket, object, writeQuorum)
			} else {
				err = er.deleteObjectVersion(ctx, bucket, object, writeQuorum, FileInfo{VersionID: versionID}, false)
			}

			// Make sure the returned error and heal result reflect the
			// outcome of the delete by propagating its error to every disk entry.
			errs = make([]error, len(errs))
			for i := range errs {
				errs[i] = err
			}
			if err == nil {
				// Dangling object successfully purged, size is '0'
				m.Size = 0
			}

			// If the delete successfully purged the dangling content, return ObjectNotFound/VersionNotFound instead.
			if countErrs(errs, nil) == len(errs) {
				returnNotFound = true
			}
		}
		if returnNotFound {
			err = toObjectErr(errFileNotFound, bucket, object)
			if versionID != "" {
				err = toObjectErr(errFileVersionNotFound, bucket, object, versionID)
			}
			return defaultHealResult(m, storageDisks, storageEndpoints, errs, bucket, object, versionID, er.defaultParityCount), err
		}
		return defaultHealResult(m, storageDisks, storageEndpoints, errs, bucket, object, versionID, er.defaultParityCount), toObjectErr(err, bucket, object, versionID)
	}

	readQuorum := len(storageDisks) - er.defaultParityCount

	err := toObjectErr(reduceReadQuorumErrs(ctx, errs, objectOpIgnoredErrs, readQuorum), bucket, object, versionID)
	return defaultHealResult(m, storageDisks, storageEndpoints, errs, bucket, object, versionID, er.defaultParityCount), err
}

// isObjectDangling - an object is considered dangling/corrupted if and only
// if the total number of disks minus the number of corrupted and missing
// files is less than the number of data blocks.
func isObjectDangling(metaArr []FileInfo, errs []error, dataErrs []error) (validMeta FileInfo, ok bool) {
	// We can consider an object's data unreliable when er.meta is not found
	// on a read quorum of disks, or when er.meta is not readable on a read
	// quorum of disks.
	var notFoundErasureMeta, corruptedErasureMeta int
	for _, readErr := range errs {
		if errors.Is(readErr, errFileNotFound) || errors.Is(readErr, errFileVersionNotFound) {
			notFoundErasureMeta++
		} else if errors.Is(readErr, errCorruptedFormat) {
			corruptedErasureMeta++
		}
	}
	var notFoundParts int
	for i := range dataErrs {
		// Only count part errors if the error is not the
		// same as the er.meta error. This avoids double
		// counting when both the parts and er.meta
		// are unavailable.
		if errs[i] != dataErrs[i] {
			if IsErr(dataErrs[i], []error{
				errFileNotFound,
				errFileVersionNotFound,
			}...) {
				notFoundParts++
			}
		}
	}

	for _, m := range metaArr {
		if !m.IsValid() {
			continue
		}
		validMeta = m
		break
	}

	if validMeta.Deleted || validMeta.TransitionStatus == lifecycle.TransitionComplete {
		// notFoundParts is ignored since a
		// - delete marker does not have any parts
		// - transition status of complete has no parts
		return validMeta, corruptedErasureMeta+notFoundErasureMeta > len(errs)/2
	}

	// We couldn't find any valid meta, so we are indeed corrupted; return true right away.
	if validMeta.Erasure.DataBlocks == 0 {
		return validMeta, true
	}

	// We have valid meta; now verify whether enough files are intact relative to the parity block count.
	return validMeta, corruptedErasureMeta+notFoundErasureMeta+notFoundParts > validMeta.Erasure.ParityBlocks
}

// HealObject - heals the given object, automatically deleting it if it is stale/corrupted and `remove` is true.
func (er erasureObjects) HealObject(ctx context.Context, bucket, object, versionID string, opts madmin.HealOpts) (hr madmin.HealResultItem, err error) {
	// Create context that also contains information about the object and bucket.
	// The top level handler might not have this information.
	reqInfo := logger.GetReqInfo(ctx)
	var newReqInfo *logger.ReqInfo
	if reqInfo != nil {
		newReqInfo = logger.NewReqInfo(reqInfo.RemoteHost, reqInfo.UserAgent, reqInfo.DeploymentID, reqInfo.RequestID, reqInfo.API, bucket, object)
	} else {
		newReqInfo = logger.NewReqInfo("", "", globalDeploymentID, "", "Heal", bucket, object)
	}
	healCtx := logger.SetReqInfo(GlobalContext, newReqInfo)

	// Healing of directories is handled separately.
	if HasSuffix(object, SlashSeparator) {
		return er.healObjectDir(healCtx, bucket, object, opts.DryRun, opts.Remove)
	}

	storageDisks := er.getDisks()
	storageEndpoints := er.getEndpoints()

	// Read metadata files from all the disks

	// When versionID is empty, we read directly from the `null` versionID for healing.
	if versionID == "" {
		versionID = nullVersionID
	}

	partsMetadata, errs := readAllFileInfo(healCtx, storageDisks, bucket, object, versionID, false)

	if isAllNotFound(errs) {
		err = toObjectErr(errFileNotFound, bucket, object)
		if versionID != "" {
			err = toObjectErr(errFileVersionNotFound, bucket, object, versionID)
		}
		// Nothing to do, file is already gone.
		return defaultHealResult(FileInfo{}, storageDisks, storageEndpoints, errs, bucket, object, versionID, er.defaultParityCount), err
	}

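	// If no consistent FileInfo can be picked across the disks, treat the object
	// as dangling and purge (or report) it instead of healing.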
	_, err = getLatestFileInfo(healCtx, partsMetadata, errs)
	if err != nil {
		return er.purgeObjectDangling(healCtx, bucket, object, versionID, partsMetadata, errs, []error{}, opts)
	}

	// Heal the object.
	return er.healObject(healCtx, bucket, object, versionID, opts)
}