github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure-healing-common.go

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"bytes"
	"context"
	"time"

	"github.com/minio/madmin-go/v3"
)

// commonETags returns the most frequently occurring non-empty etag and its
// occurrence count.
func commonETags(etags []string) (etag string, maxima int) {
	etagOccurrenceMap := make(map[string]int, len(etags))

	// Ignore empty etags and count the rest.
	for _, etag := range etags {
		if etag == "" {
			continue
		}
		etagOccurrenceMap[etag]++
	}

	maxima = 0 // Counter for remembering max occurrence of elements.
	latest := ""

	// Find the common cardinality from previously collected
	// occurrences of elements.
	for etag, count := range etagOccurrenceMap {
		if count < maxima {
			continue
		}

		// We are at or above maxima
		if count > maxima {
			maxima = count
			latest = etag
		}
	}

	// Return the most common etag, with its maxima.
	return latest, maxima
}

// commonTimeAndOccurrence returns a maximally occurring time from a list of
// times. Times that fall within `group` of each other are counted together.
func commonTimeAndOccurrence(times []time.Time, group time.Duration) (maxTime time.Time, maxima int) {
	timeOccurrenceMap := make(map[int64]int, len(times))
	groupNano := group.Nanoseconds()
	// Ignore the time sentinel and zero times, count the rest.
	for _, t := range times {
		if t.Equal(timeSentinel) || t.IsZero() {
			continue
		}
		nano := t.UnixNano()
		if group > 0 {
			for k := range timeOccurrenceMap {
				if k == nano {
					// We add to ourself later
					continue
				}
				diff := k - nano
				if diff < 0 {
					diff = -diff
				}
				// We are within the limit
				if diff < groupNano {
					timeOccurrenceMap[k]++
				}
			}
		}
		// Add ourself...
		timeOccurrenceMap[nano]++
	}

	maxima = 0 // Counter for remembering max occurrence of elements.
	latest := int64(0)

	// Find the common cardinality from previously collected
	// occurrences of elements.
	for nano, count := range timeOccurrenceMap {
		if count < maxima {
			continue
		}

		// We are at or above maxima; prefer the latest time among ties.
		if count > maxima || nano > latest {
			maxima = count
			latest = nano
		}
	}

	// Return the collected common max time, with maxima
	return time.Unix(0, latest).UTC(), maxima
}
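
// Illustrative sketch, not part of the original file: a hypothetical helper
// showing how the `group` window above changes the occurrence count. With no
// grouping every distinct timestamp is counted on its own; with a 1s window
// nearby timestamps also count towards each other.
func exampleCommonTimeGrouping() {
	t := time.Date(2024, 3, 28, 12, 0, 0, 0, time.UTC)
	times := []time.Time{t, t.Add(500 * time.Millisecond), t.Add(900 * time.Millisecond)}

	// group == 0: every timestamp stands alone, so the maximum occurrence is 1.
	_, n := commonTimeAndOccurrence(times, 0)
	_ = n // n == 1

	// group == 1s: each entry also counts the neighbours that fall within
	// the window, so the maximum occurrence becomes 3.
	_, n = commonTimeAndOccurrence(times, time.Second)
	_ = n // n == 3
}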

// commonTime returns a maximally occurring time from a list of times if it
// occurs >= quorum, else returns timeSentinel
func commonTime(modTimes []time.Time, quorum int) time.Time {
	if modTime, count := commonTimeAndOccurrence(modTimes, 0); count >= quorum {
		return modTime
	}

	return timeSentinel
}

// commonETag returns a maximally occurring etag from a list of etags if it
// occurs >= quorum, else returns "".
func commonETag(etags []string, quorum int) string {
	if etag, count := commonETags(etags); count >= quorum {
		return etag
	}
	return ""
}
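
// Illustrative sketch, not part of the original file: a hypothetical helper
// showing the quorum behaviour of commonTime. Two disks agreeing on the same
// modTime satisfy a quorum of 2 but not a quorum of 3.
func exampleCommonTimeQuorum() {
	t := time.Date(2024, 3, 28, 12, 0, 0, 0, time.UTC)
	modTimes := []time.Time{t, t, t.Add(time.Second), timeSentinel}

	// Two disks agree on t, so with quorum == 2 that is the common modTime.
	_ = commonTime(modTimes, 2) // == t

	// With quorum == 3 no modTime reaches quorum and timeSentinel is returned.
	_ = commonTime(modTimes, 3) // == timeSentinel
}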

// Beginning of unix time is treated as sentinel value here.
var (
	timeSentinel     = time.Unix(0, 0).UTC()
	timeSentinel1970 = time.Unix(0, 1).UTC() // 1970 used for special cases when xlmeta.version == 0
)

// Boot modTimes up to disk count, setting the value to time sentinel.
func bootModtimes(diskCount int) []time.Time {
	modTimes := make([]time.Time, diskCount)
	// Boots up all the modtimes.
	for i := range modTimes {
		modTimes[i] = timeSentinel
	}
	return modTimes
}

// listObjectETags returns the per-disk etags from the FileInfo slice, but only
// if at least `quorum` error-free entries share a version ID; otherwise it
// returns empty etags.
func listObjectETags(partsMetadata []FileInfo, errs []error, quorum int) (etags []string) {
	etags = make([]string, len(partsMetadata))
	vidMap := map[string]int{}
	for index, metadata := range partsMetadata {
		if errs[index] != nil {
			continue
		}
		vid := metadata.VersionID
		if metadata.VersionID == "" {
			vid = nullVersionID
		}
		vidMap[vid]++
		etags[index] = metadata.Metadata["etag"]
	}

	for _, count := range vidMap {
		// Do we have enough common versions
		// with enough quorum to satisfy the etag?
		if count >= quorum {
			return etags
		}
	}

	return make([]string, len(partsMetadata))
}

// Extracts list of times from FileInfo slice and returns, skips
// slice elements which have errors.
func listObjectModtimes(partsMetadata []FileInfo, errs []error) (modTimes []time.Time) {
	modTimes = bootModtimes(len(partsMetadata))
	for index, metadata := range partsMetadata {
		if errs[index] != nil {
			continue
		}
		// Once the file is found, save the modTime saved on disk.
		modTimes[index] = metadata.ModTime
	}
	return modTimes
}

// filterOnlineDisksInplace sets onlineDisks entries to nil where the metadata
// format (legacy XLV1 vs current) does not match the reference FileInfo.
func filterOnlineDisksInplace(fi FileInfo, partsMetadata []FileInfo, onlineDisks []StorageAPI) {
	for i, meta := range partsMetadata {
		if fi.XLV1 == meta.XLV1 {
			continue
		}
		onlineDisks[i] = nil
	}
}

// Notes:
// There are 5 possible states a disk could be in,
// 1. __online__ - has the latest copy of xl.meta - returned by listOnlineDisks
//
// 2. __offline__ - err == errDiskNotFound
//
// 3. __availableWithParts__ - has the latest copy of xl.meta and has all
//    parts with checksums matching; returned by disksWithAllParts
//
// 4. __outdated__ - returned by outDatedDisk, provided []StorageAPI
//    returned by diskWithAllParts is passed for latestDisks.
//    - has an old copy of xl.meta
//    - doesn't have xl.meta (errFileNotFound)
//    - has the latest xl.meta but one or more parts are corrupt
//
// 5. __missingParts__ - has the latest copy of xl.meta but has some parts
//    missing. This is identified separately since this may need manual
//    inspection to understand the root cause. E.g., this could be due to
//    backend filesystem corruption.

// listOnlineDisks - returns
//   - a slice of disks where disks having 'older' xl.meta (or nothing)
//     are set to nil.
//   - latest (in time) of the maximally occurring modTime(s), which has at
//     least quorum occurrences.
func listOnlineDisks(disks []StorageAPI, partsMetadata []FileInfo, errs []error, quorum int) (onlineDisks []StorageAPI, modTime time.Time, etag string) {
	onlineDisks = make([]StorageAPI, len(disks))

	// List all modTimes from parts metadata.
	modTimes := listObjectModtimes(partsMetadata, errs)

	// Reduce the list of modTimes to a single common value.
	modTime = commonTime(modTimes, quorum)

	if modTime.IsZero() || modTime.Equal(timeSentinel) {
		etags := listObjectETags(partsMetadata, errs, quorum)

		etag = commonETag(etags, quorum)

		if etag != "" { // allow this fallback only if a non-empty etag is found.
			for index, e := range etags {
				if partsMetadata[index].IsValid() && e == etag {
					onlineDisks[index] = disks[index]
				} else {
					onlineDisks[index] = nil
				}
			}
			return onlineDisks, modTime, etag
		}
	}

	// Create a new online disks slice, keeping only the disks that agree on
	// the common modTime.
	for index, t := range modTimes {
		if partsMetadata[index].IsValid() && t.Equal(modTime) {
			onlineDisks[index] = disks[index]
		} else {
			onlineDisks[index] = nil
		}
	}

	return onlineDisks, modTime, ""
}

// disksWithAllParts - This function needs to be called with
// []StorageAPI returned by listOnlineDisks. Returns,
//
//   - disks which have all parts specified in the latest xl.meta.
//
//   - slice of errors about the state of data files on disk - can have
//     a not-found error or a hash-mismatch error.
func disksWithAllParts(ctx context.Context, onlineDisks []StorageAPI, partsMetadata []FileInfo,
	errs []error, latestMeta FileInfo, bucket, object string,
	scanMode madmin.HealScanMode) ([]StorageAPI, []error, time.Time,
) {
	availableDisks := make([]StorageAPI, len(onlineDisks))
	dataErrs := make([]error, len(onlineDisks))

	inconsistent := 0
	for i, meta := range partsMetadata {
		if !meta.IsValid() {
			// Since for the majority of cases erasure.Index matches erasure.Distribution,
			// we can consider the offline disks as consistent.
			continue
		}
		if !meta.Deleted {
			if len(meta.Erasure.Distribution) != len(onlineDisks) {
				// Erasure distribution has fewer items than the
				// number of online disks.
				inconsistent++
				continue
			}
			if meta.Erasure.Distribution[i] != meta.Erasure.Index {
				// Index does not match the distribution order.
				inconsistent++
			}
		}
	}

	erasureDistributionReliable := true
	if inconsistent > len(partsMetadata)/2 {
		// If there are too many inconsistent files, then we can't trust erasure.Distribution (most likely
		// because of bugs found in CopyObject/PutObjectTags) https://github.com/minio/minio/pull/10772
		erasureDistributionReliable = false
	}

	for i, onlineDisk := range onlineDisks {
		if errs[i] != nil {
			dataErrs[i] = errs[i]
			continue
		}
		if onlineDisk == OfflineDisk {
			dataErrs[i] = errDiskNotFound
			continue
		}

		meta := partsMetadata[i]
		if !meta.ModTime.Equal(latestMeta.ModTime) || meta.DataDir != latestMeta.DataDir {
			dataErrs[i] = errFileCorrupt
			partsMetadata[i] = FileInfo{}
			continue
		}

		if erasureDistributionReliable {
			if !meta.IsValid() {
				partsMetadata[i] = FileInfo{}
				dataErrs[i] = errFileCorrupt
				continue
			}

			if !meta.Deleted {
				if len(meta.Erasure.Distribution) != len(onlineDisks) {
					// Erasure distribution is not the same as onlineDisks;
					// attempt a fix if possible, assuming other entries
					// might have the right erasure distribution.
					partsMetadata[i] = FileInfo{}
					dataErrs[i] = errFileCorrupt
					continue
				}
			}
		}

		// Always check data, if we got it.
		if (len(meta.Data) > 0 || meta.Size == 0) && len(meta.Parts) > 0 {
			checksumInfo := meta.Erasure.GetChecksumInfo(meta.Parts[0].Number)
			dataErrs[i] = bitrotVerify(bytes.NewReader(meta.Data),
				int64(len(meta.Data)),
				meta.Erasure.ShardFileSize(meta.Size),
				checksumInfo.Algorithm,
				checksumInfo.Hash, meta.Erasure.ShardSize())
			if dataErrs[i] == nil {
				// All parts verified, mark it as all data available.
				availableDisks[i] = onlineDisk
			} else {
				// Upon errors, just make that disk's fileinfo invalid.
				partsMetadata[i] = FileInfo{}
			}
			continue
		}

		meta.DataDir = latestMeta.DataDir
		switch scanMode {
		case madmin.HealDeepScan:
			// Disk has a valid xl.meta but may not have all the
			// parts. This is considered an outdated disk, since
			// it needs healing too.
			if !meta.Deleted && !meta.IsRemote() {
				dataErrs[i] = onlineDisk.VerifyFile(ctx, bucket, object, meta)
			}
		case madmin.HealNormalScan:
			if !meta.Deleted && !meta.IsRemote() {
				dataErrs[i] = onlineDisk.CheckParts(ctx, bucket, object, meta)
			}
		}

		if dataErrs[i] == nil {
			// All parts verified, mark it as all data available.
			availableDisks[i] = onlineDisk
		} else {
			// Upon errors, just make that disk's fileinfo invalid.
			partsMetadata[i] = FileInfo{}
		}
	}

	return availableDisks, dataErrs, timeSentinel
}
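
// Illustrative sketch, not part of the original file: a hypothetical caller
// showing how the helpers above are typically chained during healing. The
// disks, partsMetadata, errs and latestMeta values are assumed to come from
// an earlier metadata read plus quorum selection elsewhere in the package.
func exampleHealingDiskSelection(ctx context.Context, disks []StorageAPI, partsMetadata []FileInfo,
	errs []error, latestMeta FileInfo, bucket, object string, quorum int,
) {
	// Drop disks whose xl.meta is older than (or missing relative to) the
	// quorum modTime, or, as a fallback, the quorum etag.
	onlineDisks, modTime, etag := listOnlineDisks(disks, partsMetadata, errs, quorum)
	_, _ = modTime, etag

	// Probe the surviving disks for intact, bitrot-free parts; dataErrs
	// records why a disk was rejected (offline, corrupt, missing parts, ...).
	availableDisks, dataErrs, _ := disksWithAllParts(ctx, onlineDisks, partsMetadata, errs,
		latestMeta, bucket, object, madmin.HealNormalScan)
	_, _ = availableDisks, dataErrs
}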