github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure-healing-common.go

     1  // Copyright (c) 2015-2021 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package cmd
    19  
    20  import (
    21  	"bytes"
    22  	"context"
    23  	"time"
    24  
    25  	"github.com/minio/madmin-go/v3"
    26  )
    27  
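        // commonETags returns a maximally occurring etag (ignoring empty
        // entries) together with the number of times it occurs.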
    28  func commonETags(etags []string) (etag string, maxima int) {
    29  	etagOccurrenceMap := make(map[string]int, len(etags))
    30  
    31  	// Ignore empty etags and count the rest.
    32  	for _, etag := range etags {
    33  		if etag == "" {
    34  			continue
    35  		}
    36  		etagOccurrenceMap[etag]++
    37  	}
    38  
    39  	maxima = 0 // Counter for remembering max occurrence of elements.
    40  	latest := ""
    41  
    42  	// Find the maximally occurring element from the previously
    43  	// collected occurrence counts.
    44  	for etag, count := range etagOccurrenceMap {
    45  		if count < maxima {
    46  			continue
    47  		}
    48  
    49  		// We are at or above maxima
    50  		if count > maxima {
    51  			maxima = count
    52  			latest = etag
    53  		}
    54  	}
    55  
    56  	// Return the collected common etag, with its maxima.
    57  	return latest, maxima
    58  }
    59  
    60  // commonTimeAndOccurrence returns the maximally occurring time from a list of times, grouping times that fall within 'group' of each other.
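        // For example (illustrative): given times {T, T+10ms, T+5s} and group=1s,
        // T and T+10ms fall into the same group, so T is returned with maxima 2.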
    61  func commonTimeAndOccurrence(times []time.Time, group time.Duration) (maxTime time.Time, maxima int) {
    62  	timeOccurrenceMap := make(map[int64]int, len(times))
    63  	groupNano := group.Nanoseconds()
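        	// When group > 0, each time also counts toward every previously seen
        	// time that lies within 'group' of it, clustering nearby timestamps.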
    64  	// Ignore the time sentinel and zero times; count the rest.
    65  	for _, t := range times {
    66  		if t.Equal(timeSentinel) || t.IsZero() {
    67  			continue
    68  		}
    69  		nano := t.UnixNano()
    70  		if group > 0 {
    71  			for k := range timeOccurrenceMap {
    72  				if k == nano {
    73  				// We add to ourselves later
    74  					continue
    75  				}
    76  				diff := k - nano
    77  				if diff < 0 {
    78  					diff = -diff
    79  				}
    80  				// We are within the limit
    81  				if diff < groupNano {
    82  					timeOccurrenceMap[k]++
    83  				}
    84  			}
    85  		}
    86  		// Add ourselves...
    87  		timeOccurrenceMap[nano]++
    88  	}
    89  
    90  	maxima = 0 // Counter for remembering max occurrence of elements.
    91  	latest := int64(0)
    92  
    93  	// Find the maximally occurring element from the previously
    94  	// collected occurrence counts.
    95  	for nano, count := range timeOccurrenceMap {
    96  		if count < maxima {
    97  			continue
    98  		}
    99  
   100  		// We are at or above maxima
   101  		if count > maxima || nano > latest {
   102  			maxima = count
   103  			latest = nano
   104  		}
   105  	}
   106  
   107  	// Return the collected common time, with its maxima.
   108  	return time.Unix(0, latest).UTC(), maxima
   109  }
   110  
   111  // commonTime returns the maximally occurring time from a list of times if it
   112  // occurs at least 'quorum' times, else returns timeSentinel.
   113  func commonTime(modTimes []time.Time, quorum int) time.Time {
   114  	if modTime, count := commonTimeAndOccurrence(modTimes, 0); count >= quorum {
   115  		return modTime
   116  	}
   117  
   118  	return timeSentinel
   119  }
   120  
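        // commonETag returns a maximally occurring etag if it occurs >= quorum,
        // else returns an empty string.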
   121  func commonETag(etags []string, quorum int) string {
   122  	if etag, count := commonETags(etags); count >= quorum {
   123  		return etag
   124  	}
   125  	return ""
   126  }
   127  
   128  // The beginning of Unix time is treated as a sentinel value here.
   129  var (
   130  	timeSentinel     = time.Unix(0, 0).UTC()
   131  	timeSentinel1970 = time.Unix(0, 1).UTC() // 1970 used for special cases when xlmeta.version == 0
   132  )
   133  
   134  // Boot modTimes up to disk count, setting the value to time sentinel.
   135  func bootModtimes(diskCount int) []time.Time {
   136  	modTimes := make([]time.Time, diskCount)
   137  	// Boots up all the modtimes.
   138  	for i := range modTimes {
   139  		modTimes[i] = timeSentinel
   140  	}
   141  	return modTimes
   142  }
   143  
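        // listObjectETags returns the per-disk etags from partsMetadata, skipping
        // entries with errors. If no single version ID is seen at least 'quorum'
        // times, a slice of empty etags is returned instead.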
   144  func listObjectETags(partsMetadata []FileInfo, errs []error, quorum int) (etags []string) {
   145  	etags = make([]string, len(partsMetadata))
   146  	vidMap := map[string]int{}
   147  	for index, metadata := range partsMetadata {
   148  		if errs[index] != nil {
   149  			continue
   150  		}
   151  		vid := metadata.VersionID
   152  		if metadata.VersionID == "" {
   153  			vid = nullVersionID
   154  		}
   155  		vidMap[vid]++
   156  		etags[index] = metadata.Metadata["etag"]
   157  	}
   158  
   159  	for _, count := range vidMap {
   160  		// Do we have enough disks agreeing on a
   161  		// single version ID to satisfy quorum
   162  		// for the etags?
   163  		if count >= quorum {
   164  			return etags
   165  		}
   166  	}
   167  
   168  	return make([]string, len(partsMetadata))
   169  }
   170  
   171  // listObjectModtimes extracts the list of mod times from a FileInfo slice,
   172  // skipping slice elements which have errors.
   173  func listObjectModtimes(partsMetadata []FileInfo, errs []error) (modTimes []time.Time) {
   174  	modTimes = bootModtimes(len(partsMetadata))
   175  	for index, metadata := range partsMetadata {
   176  		if errs[index] != nil {
   177  			continue
   178  		}
   179  		// Once the file is found, save the modTime recorded on disk.
   180  		modTimes[index] = metadata.ModTime
   181  	}
   182  	return modTimes
   183  }
   184  
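        // filterOnlineDisksInplace sets onlineDisks[i] to nil for every entry whose
        // legacy-format flag (XLV1) differs from that of fi.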
   185  func filterOnlineDisksInplace(fi FileInfo, partsMetadata []FileInfo, onlineDisks []StorageAPI) {
   186  	for i, meta := range partsMetadata {
   187  		if fi.XLV1 == meta.XLV1 {
   188  			continue
   189  		}
   190  		onlineDisks[i] = nil
   191  	}
   192  }
   193  
   194  // Notes:
   195  // There are 5 possible states a disk could be in,
   196  // 1. __online__             - has the latest copy of xl.meta - returned by listOnlineDisks
   197  //
   198  // 2. __offline__            - err == errDiskNotFound
   199  //
   200  // 3. __availableWithParts__ - has the latest copy of xl.meta and has all
   201  //                             parts with checksums matching; returned by disksWithAllParts
   202  //
   203  // 4. __outdated__           - returned by outDatedDisk, provided []StorageAPI
   204  //                             returned by disksWithAllParts is passed for latestDisks.
   205  //    - has an old copy of xl.meta
   206  //    - doesn't have xl.meta (errFileNotFound)
   207  //    - has the latest xl.meta but one or more parts are corrupt
   208  //
   209  // 5. __missingParts__       - has the latest copy of xl.meta but has some parts
   210  // missing.  This is identified separately since this may need manual
   211  // inspection to understand the root cause. E.g., this could be due to
   212  // backend filesystem corruption.
   213  
   214  // listOnlineDisks - returns
   215  // - a slice of disks where disks that have an 'older' xl.meta (or nothing)
   216  // are set to nil.
   217  // - latest (in time) of the maximally occurring modTime(s), which has at least quorum occurrences.
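        // - the common etag, non-empty only when disks are selected by etag because
        //   no modTime reached quorum.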
   218  func listOnlineDisks(disks []StorageAPI, partsMetadata []FileInfo, errs []error, quorum int) (onlineDisks []StorageAPI, modTime time.Time, etag string) {
   219  	onlineDisks = make([]StorageAPI, len(disks))
   220  
   221  	// List all the file mod times from parts metadata.
   222  	modTimes := listObjectModtimes(partsMetadata, errs)
   223  
   224  	// Reduce the list of mod times to a single common modTime value.
   225  	modTime = commonTime(modTimes, quorum)
   226  
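        	// No modTime reached quorum; fall back to selecting online disks by a
        	// quorum etag, if a non-empty one can be found.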
   227  	if modTime.IsZero() || modTime.Equal(timeSentinel) {
   228  		etags := listObjectETags(partsMetadata, errs, quorum)
   229  
   230  		etag = commonETag(etags, quorum)
   231  
   232  		if etag != "" { // allow this fallback only if a non-empty etag is found.
   233  			for index, e := range etags {
   234  				if partsMetadata[index].IsValid() && e == etag {
   235  					onlineDisks[index] = disks[index]
   236  				} else {
   237  					onlineDisks[index] = nil
   238  				}
   239  			}
   240  			return onlineDisks, modTime, etag
   241  		}
   242  	}
   243  
   244  	// Create a new online disks slice containing disks that have the common modTime.
   245  	for index, t := range modTimes {
   246  		if partsMetadata[index].IsValid() && t.Equal(modTime) {
   247  			onlineDisks[index] = disks[index]
   248  		} else {
   249  			onlineDisks[index] = nil
   250  		}
   251  	}
   252  
   253  	return onlineDisks, modTime, ""
   254  }
   255  
   256  // disksWithAllParts - This function needs to be called with
   257  // []StorageAPI returned by listOnlineDisks. Returns,
   258  //
   259  // - disks which have all parts specified in the latest xl.meta.
   260  //
   261  //   - slice of errors about the state of data files on disk - can have
   262  //     a not-found error or a hash-mismatch error.
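        //
        //   - a time.Time value which, in the current implementation, is always
        //     timeSentinel.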
   263  func disksWithAllParts(ctx context.Context, onlineDisks []StorageAPI, partsMetadata []FileInfo,
   264  	errs []error, latestMeta FileInfo, bucket, object string,
   265  	scanMode madmin.HealScanMode) ([]StorageAPI, []error, time.Time,
   266  ) {
   267  	availableDisks := make([]StorageAPI, len(onlineDisks))
   268  	dataErrs := make([]error, len(onlineDisks))
   269  
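        	// First pass: count metadata entries whose erasure distribution does not
        	// match the number of online disks, or whose index disagrees with the
        	// distribution order; this decides whether erasure.Distribution is trusted.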
   270  	inconsistent := 0
   271  	for i, meta := range partsMetadata {
   272  		if !meta.IsValid() {
   273  			// Since for the majority of cases erasure.Index matches erasure.Distribution,
   274  			// we can consider the offline disks as consistent.
   275  			continue
   276  		}
   277  		if !meta.Deleted {
   278  			if len(meta.Erasure.Distribution) != len(onlineDisks) {
   279  				// Erasure distribution seems to have fewer
   280  				// items than the number of online disks.
   281  				inconsistent++
   282  				continue
   283  			}
   284  			if meta.Erasure.Distribution[i] != meta.Erasure.Index {
   285  				// Index does not match the distribution order
   286  				inconsistent++
   287  			}
   288  		}
   289  	}
   290  
   291  	erasureDistributionReliable := true
   292  	if inconsistent > len(partsMetadata)/2 {
   293  		// If there are too many inconsistent files, then we can't trust erasure.Distribution (most likely
   294  		// because of bugs found in CopyObject/PutObjectTags) https://github.com/minio/minio/pull/10772
   295  		erasureDistributionReliable = false
   296  	}
   297  
   298  	for i, onlineDisk := range onlineDisks {
   299  		if errs[i] != nil {
   300  			dataErrs[i] = errs[i]
   301  			continue
   302  		}
   303  		if onlineDisk == OfflineDisk {
   304  			dataErrs[i] = errDiskNotFound
   305  			continue
   306  		}
   307  
   308  		meta := partsMetadata[i]
   309  		if !meta.ModTime.Equal(latestMeta.ModTime) || meta.DataDir != latestMeta.DataDir {
   310  			dataErrs[i] = errFileCorrupt
   311  			partsMetadata[i] = FileInfo{}
   312  			continue
   313  		}
   314  
   315  		if erasureDistributionReliable {
   316  			if !meta.IsValid() {
   317  				partsMetadata[i] = FileInfo{}
   318  				dataErrs[i] = errFileCorrupt
   319  				continue
   320  			}
   321  
   322  			if !meta.Deleted {
   323  				if len(meta.Erasure.Distribution) != len(onlineDisks) {
   324  					// Erasure distribution length does not match onlineDisks;
   325  					// attempt a fix if possible, assuming other entries
   326  					// might have the right erasure distribution.
   327  					partsMetadata[i] = FileInfo{}
   328  					dataErrs[i] = errFileCorrupt
   329  					continue
   330  				}
   331  			}
   332  		}
   333  
   334  		// Always check data, if we got it.
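        		// Objects small enough to be stored inline keep their data in xl.meta
        		// (meta.Data); such inline data (and zero-byte objects) is verified
        		// directly against the bitrot checksum of the first part.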
   335  		if (len(meta.Data) > 0 || meta.Size == 0) && len(meta.Parts) > 0 {
   336  			checksumInfo := meta.Erasure.GetChecksumInfo(meta.Parts[0].Number)
   337  			dataErrs[i] = bitrotVerify(bytes.NewReader(meta.Data),
   338  				int64(len(meta.Data)),
   339  				meta.Erasure.ShardFileSize(meta.Size),
   340  				checksumInfo.Algorithm,
   341  				checksumInfo.Hash, meta.Erasure.ShardSize())
   342  			if dataErrs[i] == nil {
   343  				// All parts verified, mark it as all data available.
   344  				availableDisks[i] = onlineDisk
   345  			} else {
   346  				// upon errors just make that disk's fileinfo invalid
   347  				partsMetadata[i] = FileInfo{}
   348  			}
   349  			continue
   350  		}
   351  
   352  		meta.DataDir = latestMeta.DataDir
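        		// For part data on disk, a deep scan verifies the part contents against
        		// their bitrot checksums, while a normal scan only checks that the parts
        		// are present on the disk.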
   353  		switch scanMode {
   354  		case madmin.HealDeepScan:
   355  			// disk has a valid xl.meta but may not have all the
   356  			// parts. This is considered an outdated disk, since
   357  			// it needs healing too.
   358  			if !meta.Deleted && !meta.IsRemote() {
   359  				dataErrs[i] = onlineDisk.VerifyFile(ctx, bucket, object, meta)
   360  			}
   361  		case madmin.HealNormalScan:
   362  			if !meta.Deleted && !meta.IsRemote() {
   363  				dataErrs[i] = onlineDisk.CheckParts(ctx, bucket, object, meta)
   364  			}
   365  		}
   366  
   367  		if dataErrs[i] == nil {
   368  			// All parts verified, mark it as all data available.
   369  			availableDisks[i] = onlineDisk
   370  		} else {
   371  			// upon errors just make that disk's fileinfo invalid
   372  			partsMetadata[i] = FileInfo{}
   373  		}
   374  	}
   375  
   376  	return availableDisks, dataErrs, timeSentinel
   377  }