// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/minio/madmin-go/v3"
	"github.com/minio/minio/internal/logger"
	"github.com/minio/pkg/v2/sync/errgroup"
)

//go:generate stringer -type=healingMetric -trimprefix=healingMetric $GOFILE

// healingMetric identifies which healing operation is being traced
// (see healTrace below). The String() method is generated by stringer.
type healingMetric uint8

const (
	healingMetricBucket healingMetric = iota
	healingMetricObject
	healingMetricCheckAbandonedParts
)

// listAndHeal lists entries under bucket/prefix across the online,
// non-healing drives of this erasure set and invokes healEntry for every
// entry found. Listing is recursive. A failing healEntry cancels the
// listing context, which aborts the walk early.
func (er erasureObjects) listAndHeal(bucket, prefix string, scanMode madmin.HealScanMode, healEntry func(string, metaCacheEntry, madmin.HealScanMode) error) error {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	disks, _ := er.getOnlineDisksWithHealing(false)
	if len(disks) == 0 {
		return errors.New("listAndHeal: No non-healing drives found")
	}

	// How to resolve partial results.
	// Quorum of 1: any single drive reporting an entry is enough,
	// since the goal here is to find candidates to heal.
	resolver := metadataResolutionParams{
		dirQuorum: 1,
		objQuorum: 1,
		bucket:    bucket,
		strict:    false, // Allow less strict matching.
	}

	path := baseDirFromPrefix(prefix)
	filterPrefix := strings.Trim(strings.TrimPrefix(prefix, path), slashSeparator)
	if path == prefix {
		filterPrefix = ""
	}

	lopts := listPathRawOptions{
		disks:          disks,
		bucket:         bucket,
		path:           path,
		filterPrefix:   filterPrefix,
		recursive:      true,
		forwardTo:      "",
		minDisks:       1,
		reportNotFound: false,
		agreed: func(entry metaCacheEntry) {
			// All drives agree on this entry; heal it directly.
			if err := healEntry(bucket, entry, scanMode); err != nil {
				cancel()
			}
		},
		partial: func(entries metaCacheEntries, _ []error) {
			entry, ok := entries.resolve(&resolver)
			if !ok {
				// check if we can get one entry at least
				// proceed to heal nonetheless.
				entry, _ = entries.firstFound()
			}

			if err := healEntry(bucket, *entry, scanMode); err != nil {
				cancel()
				return
			}
		},
		finished: nil,
	}

	if err := listPathRaw(ctx, lopts); err != nil {
		return fmt.Errorf("listPathRaw returned %w: opts(%#v)", err, lopts)
	}

	return nil
}

// listAllBuckets lists all buckets from all disks. It also
// returns the occurrence of each buckets in all disks
func listAllBuckets(ctx context.Context, storageDisks []StorageAPI, healBuckets map[string]VolInfo, readQuorum int) error {
	g := errgroup.WithNErrs(len(storageDisks))
	// mu guards healBuckets, which is written concurrently from one
	// goroutine per disk.
	var mu sync.Mutex
	for index := range storageDisks {
		index := index
		g.Go(func() error {
			if storageDisks[index] == nil {
				// we ignore disk not found errors
				return nil
			}
			if storageDisks[index].Healing() != nil {
				// we ignore disks under healing
				return nil
			}
			volsInfo, err := storageDisks[index].ListVols(ctx)
			if err != nil {
				return err
			}
			for _, volInfo := range volsInfo {
				// StorageAPI can send volume names which are
				// incompatible with buckets - these are
				// skipped, like the meta-bucket.
				if isReservedOrInvalidBucket(volInfo.Name, false) {
					continue
				}
				mu.Lock()
				// First disk that reports the bucket wins; later
				// disks only confirm its existence.
				if _, ok := healBuckets[volInfo.Name]; !ok {
					healBuckets[volInfo.Name] = volInfo
				}
				mu.Unlock()
			}
			return nil
		}, index)
	}
	return reduceReadQuorumErrs(ctx, g.Wait(), bucketMetadataOpIgnoredErrs, readQuorum)
}

// Only heal on disks where we are sure that healing is needed. We can expand
// this list as and when we figure out more errors can be added to this list safely.
func shouldHealObjectOnDisk(erErr, dataErr error, meta FileInfo, latestMeta FileInfo) bool {
	switch {
	case errors.Is(erErr, errFileNotFound) || errors.Is(erErr, errFileVersionNotFound):
		// xl.meta (or the requested version) missing entirely.
		return true
	case errors.Is(erErr, errFileCorrupt):
		// xl.meta present but unreadable.
		return true
	}
	if erErr == nil {
		if meta.XLV1 {
			// Legacy means heal always
			// always check first.
			return true
		}
		if !meta.Deleted && !meta.IsRemote() {
			// If xl.meta was read fine but there may be problem with the part.N files.
			if IsErr(dataErr, []error{
				errFileNotFound,
				errFileVersionNotFound,
				errFileCorrupt,
			}...) {
				return true
			}
		}
		if !latestMeta.Equals(meta) {
			// Metadata diverges from the quorum-selected latest copy.
			return true
		}
	}
	return false
}

const (
	// Metadata keys (under the reserved MinIO prefix) used to flag a
	// version as in-flight for healing or data movement.
	xMinIOHealing = ReservedMetadataPrefix + "healing"
	xMinIODataMov = ReservedMetadataPrefix + "data-mov"
)

// SetHealing marks object (version) as being healed.
// Note: this is to be used only from healObject
func (fi *FileInfo) SetHealing() {
	if fi.Metadata == nil {
		fi.Metadata = make(map[string]string)
	}
	fi.Metadata[xMinIOHealing] = "true"
}

// Healing returns true if object is being healed (i.e fi is being passed down
// from healObject)
func (fi FileInfo) Healing() bool {
	_, ok := fi.Metadata[xMinIOHealing]
	return ok
}

// SetDataMov marks object (version) as being currently
// in movement, such as decommissioning or rebalance.
func (fi *FileInfo) SetDataMov() {
	if fi.Metadata == nil {
		fi.Metadata = make(map[string]string)
	}
	fi.Metadata[xMinIODataMov] = "true"
}

// DataMov returns true if object is being in movement
func (fi FileInfo) DataMov() bool {
	_, ok := fi.Metadata[xMinIODataMov]
	return ok
}

// Heals an object by re-writing corrupt/missing erasure blocks.
//
// High-level flow:
//  1. take the namespace lock (unless opts.NoLock) and re-read xl.meta
//     from all drives;
//  2. pick the latest valid FileInfo by quorum and classify each drive
//     as ok/offline/missing/corrupt;
//  3. if nothing needs healing (or dryRun) return early; if the object
//     is dangling beyond parity, purge it;
//  4. reconstruct missing shards part-by-part into a tmp location and
//     finally RenameData() the healed metadata into place.
func (er *erasureObjects) healObject(ctx context.Context, bucket string, object string, versionID string, opts madmin.HealOpts) (result madmin.HealResultItem, err error) {
	dryRun := opts.DryRun
	scanMode := opts.ScanMode

	storageDisks := er.getDisks()
	storageEndpoints := er.getEndpoints()

	if globalTrace.NumSubscribers(madmin.TraceHealing) > 0 {
		startTime := time.Now()
		defer func() {
			healTrace(healingMetricObject, startTime, bucket, object, &opts, err, &result)
		}()
	}
	// Initialize heal result object
	result = madmin.HealResultItem{
		Type:      madmin.HealItemObject,
		Bucket:    bucket,
		Object:    object,
		VersionID: versionID,
		DiskCount: len(storageDisks),
	}

	if !opts.NoLock {
		lk := er.NewNSLock(bucket, object)
		lkctx, err := lk.GetLock(ctx, globalOperationTimeout)
		if err != nil {
			return result, err
		}
		ctx = lkctx.Context()
		defer lk.Unlock(lkctx)
	}

	// Re-read when we have lock...
	partsMetadata, errs := readAllFileInfo(ctx, storageDisks, "", bucket, object, versionID, true, true)
	if isAllNotFound(errs) {
		err := errFileNotFound
		if versionID != "" {
			err = errFileVersionNotFound
		}
		// Nothing to do, file is already gone.
		return er.defaultHealResult(FileInfo{}, storageDisks, storageEndpoints,
			errs, bucket, object, versionID), err
	}

	readQuorum, _, err := objectQuorumFromMeta(ctx, partsMetadata, errs, er.defaultParityCount)
	if err != nil {
		// No quorum on the metadata - treat as potentially dangling and
		// attempt a conditional purge.
		m, err := er.deleteIfDangling(ctx, bucket, object, partsMetadata, errs, nil, ObjectOptions{
			VersionID: versionID,
		})
		errs = make([]error, len(errs))
		for i := range errs {
			errs[i] = err
		}
		if err == nil {
			// Dangling object successfully purged, size is '0'
			m.Size = 0
		}
		// Generate file/version not found with default heal result
		err = errFileNotFound
		if versionID != "" {
			err = errFileVersionNotFound
		}
		return er.defaultHealResult(m, storageDisks, storageEndpoints,
			errs, bucket, object, versionID), err
	}

	result.ParityBlocks = result.DiskCount - readQuorum
	result.DataBlocks = readQuorum

	// List of disks having latest version of the object xl.meta
	// (by modtime).
	onlineDisks, modTime, etag := listOnlineDisks(storageDisks, partsMetadata, errs, readQuorum)

	// Latest FileInfo for reference. If a valid metadata is not
	// present, it is as good as object not found.
	latestMeta, err := pickValidFileInfo(ctx, partsMetadata, modTime, etag, readQuorum)
	if err != nil {
		return result, err
	}

	// List of disks having all parts as per latest metadata.
	// NOTE: do not pass in latestDisks to diskWithAllParts since
	// the diskWithAllParts needs to reach the drive to ensure
	// validity of the metadata content, we should make sure that
	// we pass in disks as is for it to be verified. Once verified
	// the disksWithAllParts() returns the actual disks that can be
	// used here for reconstruction. This is done to ensure that
	// we do not skip drives that have inconsistent metadata to be
	// skipped from purging when they are stale.
	availableDisks, dataErrs, _ := disksWithAllParts(ctx, onlineDisks, partsMetadata,
		errs, latestMeta, bucket, object, scanMode)

	var erasure Erasure
	if !latestMeta.Deleted && !latestMeta.IsRemote() {
		// Initialize erasure coding
		erasure, err = NewErasure(ctx, latestMeta.Erasure.DataBlocks,
			latestMeta.Erasure.ParityBlocks, latestMeta.Erasure.BlockSize)
		if err != nil {
			return result, err
		}
	}

	result.ObjectSize, err = latestMeta.ToObjectInfo(bucket, object, true).GetActualSize()
	if err != nil {
		return result, err
	}

	// Loop to find number of disks with valid data, per-drive
	// data state and a list of outdated disks on which data needs
	// to be healed.
	outDatedDisks := make([]StorageAPI, len(storageDisks))
	disksToHealCount := 0
	for i, v := range availableDisks {
		driveState := ""
		switch {
		case v != nil:
			driveState = madmin.DriveStateOk
		case errs[i] == errDiskNotFound, dataErrs[i] == errDiskNotFound:
			driveState = madmin.DriveStateOffline
		case errs[i] == errFileNotFound, errs[i] == errFileVersionNotFound, errs[i] == errVolumeNotFound:
			fallthrough
		case dataErrs[i] == errFileNotFound, dataErrs[i] == errFileVersionNotFound, dataErrs[i] == errVolumeNotFound:
			driveState = madmin.DriveStateMissing
		default:
			// all remaining cases imply corrupt data/metadata
			driveState = madmin.DriveStateCorrupt
		}

		// Before/After drive states are reported identically here;
		// the After state is flipped to Ok later, post-rename.
		if shouldHealObjectOnDisk(errs[i], dataErrs[i], partsMetadata[i], latestMeta) {
			outDatedDisks[i] = storageDisks[i]
			disksToHealCount++
			result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
				UUID:     "",
				Endpoint: storageEndpoints[i].String(),
				State:    driveState,
			})
			result.After.Drives = append(result.After.Drives, madmin.HealDriveInfo{
				UUID:     "",
				Endpoint: storageEndpoints[i].String(),
				State:    driveState,
			})
			continue
		}
		result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
			UUID:     "",
			Endpoint: storageEndpoints[i].String(),
			State:    driveState,
		})
		result.After.Drives = append(result.After.Drives, madmin.HealDriveInfo{
			UUID:     "",
			Endpoint: storageEndpoints[i].String(),
			State:    driveState,
		})
	}

	if isAllNotFound(errs) {
		// File is fully gone, fileInfo is empty.
		err := errFileNotFound
		if versionID != "" {
			err = errFileVersionNotFound
		}
		return er.defaultHealResult(FileInfo{}, storageDisks, storageEndpoints, errs,
			bucket, object, versionID), err
	}

	if disksToHealCount == 0 {
		// Nothing to heal!
		return result, nil
	}

	// After this point, only have to repair data on disk - so
	// return if it is a dry-run
	if dryRun {
		return result, nil
	}

	// More drives need healing than parity can reconstruct: the version
	// is unrecoverable; treat it as a dangling candidate.
	if !latestMeta.XLV1 && !latestMeta.Deleted && disksToHealCount > latestMeta.Erasure.ParityBlocks {
		// Allow for dangling deletes, on versions that have DataDir missing etc.
		// this would end up restoring the correct readable versions.
		m, err := er.deleteIfDangling(ctx, bucket, object, partsMetadata, errs, dataErrs, ObjectOptions{
			VersionID: versionID,
		})
		errs = make([]error, len(errs))
		for i := range errs {
			errs[i] = err
		}
		if err == nil {
			// Dangling object successfully purged, size is '0'
			m.Size = 0
		}
		// Generate file/version not found with default heal result
		err = errFileNotFound
		if versionID != "" {
			err = errFileVersionNotFound
		}
		return er.defaultHealResult(m, storageDisks, storageEndpoints,
			errs, bucket, object, versionID), err
	}

	cleanFileInfo := func(fi FileInfo) FileInfo {
		// Returns a copy of the 'fi' with erasure index, checksums and inline data niled.
		nfi := fi
		if !nfi.IsRemote() {
			nfi.Data = nil
			nfi.Erasure.Index = 0
			nfi.Erasure.Checksums = nil
		}
		return nfi
	}

	// We write at temporary location and then rename to final location.
	tmpID := mustGetUUID()
	migrateDataDir := mustGetUUID()

	// Reorder so that we have data disks first and parity disks next.
	//
	// The three length checks below defend against manually modified
	// backend drives - a mismatch means we cannot map drives to erasure
	// distribution indices safely, so healing is refused.
	if !latestMeta.Deleted && len(latestMeta.Erasure.Distribution) != len(availableDisks) {
		err := fmt.Errorf("unexpected file distribution (%v) from available disks (%v), looks like backend disks have been manually modified refusing to heal %s/%s(%s)",
			latestMeta.Erasure.Distribution, availableDisks, bucket, object, versionID)
		logger.LogOnceIf(ctx, err, "heal-object-available-disks")
		return er.defaultHealResult(latestMeta, storageDisks, storageEndpoints, errs,
			bucket, object, versionID), err
	}

	latestDisks := shuffleDisks(availableDisks, latestMeta.Erasure.Distribution)

	if !latestMeta.Deleted && len(latestMeta.Erasure.Distribution) != len(outDatedDisks) {
		err := fmt.Errorf("unexpected file distribution (%v) from outdated disks (%v), looks like backend disks have been manually modified refusing to heal %s/%s(%s)",
			latestMeta.Erasure.Distribution, outDatedDisks, bucket, object, versionID)
		logger.LogOnceIf(ctx, err, "heal-object-outdated-disks")
		return er.defaultHealResult(latestMeta, storageDisks, storageEndpoints, errs,
			bucket, object, versionID), err
	}

	outDatedDisks = shuffleDisks(outDatedDisks, latestMeta.Erasure.Distribution)

	if !latestMeta.Deleted && len(latestMeta.Erasure.Distribution) != len(partsMetadata) {
		err := fmt.Errorf("unexpected file distribution (%v) from metadata entries (%v), looks like backend disks have been manually modified refusing to heal %s/%s(%s)",
			latestMeta.Erasure.Distribution, len(partsMetadata), bucket, object, versionID)
		logger.LogOnceIf(ctx, err, "heal-object-metadata-entries")
		return er.defaultHealResult(latestMeta, storageDisks, storageEndpoints, errs,
			bucket, object, versionID), err
	}

	partsMetadata = shufflePartsMetadata(partsMetadata, latestMeta.Erasure.Distribution)

	// Snapshot the metadata of the up-to-date drives; the reads below
	// use this copy while partsMetadata is rewritten for outdated drives.
	copyPartsMetadata := make([]FileInfo, len(partsMetadata))
	for i := range latestDisks {
		if latestDisks[i] == nil {
			continue
		}
		copyPartsMetadata[i] = partsMetadata[i]
	}

	for i := range outDatedDisks {
		if outDatedDisks[i] == nil {
			continue
		}
		// Make sure to write the FileInfo information
		// that is expected to be in quorum.
		partsMetadata[i] = cleanFileInfo(latestMeta)
	}

	// source data dir shall be empty in case of XLV1
	// differentiate it with dstDataDir for readability
	// srcDataDir is the one used with newBitrotReader()
	// to read existing content.
	srcDataDir := latestMeta.DataDir
	dstDataDir := latestMeta.DataDir
	if latestMeta.XLV1 {
		dstDataDir = migrateDataDir
	}

	var inlineBuffers []*bytes.Buffer
	if !latestMeta.Deleted && !latestMeta.IsRemote() {
		if latestMeta.InlineData() {
			inlineBuffers = make([]*bytes.Buffer, len(outDatedDisks))
		}

		erasureInfo := latestMeta.Erasure
		for partIndex := 0; partIndex < len(latestMeta.Parts); partIndex++ {
			partSize := latestMeta.Parts[partIndex].Size
			partActualSize := latestMeta.Parts[partIndex].ActualSize
			partModTime := latestMeta.Parts[partIndex].ModTime
			partNumber := latestMeta.Parts[partIndex].Number
			partIdx := latestMeta.Parts[partIndex].Index
			partChecksums := latestMeta.Parts[partIndex].Checksums
			tillOffset := erasure.ShardFileOffset(0, partSize, partSize)
			readers := make([]io.ReaderAt, len(latestDisks))
			prefer := make([]bool, len(latestDisks))
			checksumAlgo := erasureInfo.GetChecksumInfo(partNumber).Algorithm
			for i, disk := range latestDisks {
				if disk == OfflineDisk {
					continue
				}
				checksumInfo := copyPartsMetadata[i].Erasure.GetChecksumInfo(partNumber)
				partPath := pathJoin(object, srcDataDir, fmt.Sprintf("part.%d", partNumber))
				readers[i] = newBitrotReader(disk, copyPartsMetadata[i].Data, bucket, partPath, tillOffset, checksumAlgo,
					checksumInfo.Hash, erasure.ShardSize())
				// Prefer local drives (empty hostname) as reconstruction sources.
				prefer[i] = disk.Hostname() == ""

			}
			writers := make([]io.Writer, len(outDatedDisks))
			for i, disk := range outDatedDisks {
				if disk == OfflineDisk {
					continue
				}
				partPath := pathJoin(tmpID, dstDataDir, fmt.Sprintf("part.%d", partNumber))
				if len(inlineBuffers) > 0 {
					// +32 gives a little slack over the shard size —
					// presumably for bitrot framing overhead; TODO confirm.
					inlineBuffers[i] = bytes.NewBuffer(make([]byte, 0, erasure.ShardFileSize(latestMeta.Size)+32))
					writers[i] = newStreamingBitrotWriterBuffer(inlineBuffers[i], DefaultBitrotAlgorithm, erasure.ShardSize())
				} else {
					writers[i] = newBitrotWriter(disk, bucket, minioMetaTmpBucket, partPath,
						tillOffset, DefaultBitrotAlgorithm, erasure.ShardSize())
				}
			}

			// Heal each part. erasure.Heal() will write the healed
			// part to .minio/tmp/uuid/ which needs to be renamed
			// later to the final location.
			err = erasure.Heal(ctx, writers, readers, partSize, prefer)
			closeBitrotReaders(readers)
			closeBitrotWriters(writers)
			if err != nil {
				return result, err
			}

			// outDatedDisks that had write errors should not be
			// written to for remaining parts, so we nil it out.
			for i, disk := range outDatedDisks {
				if disk == OfflineDisk {
					continue
				}

				// A non-nil stale disk which did not receive
				// a healed part checksum had a write error.
				if writers[i] == nil {
					outDatedDisks[i] = nil
					disksToHealCount--
					continue
				}

				partsMetadata[i].DataDir = dstDataDir
				partsMetadata[i].AddObjectPart(partNumber, "", partSize, partActualSize, partModTime, partIdx, partChecksums)
				if len(inlineBuffers) > 0 && inlineBuffers[i] != nil {
					partsMetadata[i].Data = inlineBuffers[i].Bytes()
					partsMetadata[i].SetInlineData()
				} else {
					partsMetadata[i].Data = nil
				}
			}

			// If all disks are having errors, we give up.
			if disksToHealCount == 0 {
				return result, fmt.Errorf("all drives had write errors, unable to heal %s/%s", bucket, object)
			}

		}

	}

	// Cleanup tmp staging area regardless of outcome; uses a background
	// context so cleanup proceeds even if ctx was canceled.
	defer er.deleteAll(context.Background(), minioMetaTmpBucket, tmpID)

	// Rename from tmp location to the actual location.
	for i, disk := range outDatedDisks {
		if disk == OfflineDisk {
			continue
		}

		// record the index of the updated disks
		partsMetadata[i].Erasure.Index = i + 1

		// Attempt a rename now from healed data to final location.
		partsMetadata[i].SetHealing()

		if _, err = disk.RenameData(ctx, minioMetaTmpBucket, tmpID, partsMetadata[i], bucket, object, RenameOptions{}); err != nil {
			return result, err
		}

		// - Remove any remaining parts from outdated disks from before transition.
		if partsMetadata[i].IsRemote() {
			rmDataDir := partsMetadata[i].DataDir
			disk.Delete(ctx, bucket, pathJoin(encodeDirObject(object), rmDataDir), DeleteOptions{
				Immediate: true,
				Recursive: true,
			})
		}

		// Flip the After state to Ok for the drive we just healed.
		for i, v := range result.Before.Drives {
			if v.Endpoint == disk.String() {
				result.After.Drives[i].State = madmin.DriveStateOk
			}
		}
	}

	return result, nil
}

// checkAbandonedParts will check if an object has abandoned parts,
// meaning data-dirs or inlined data that are no longer referenced by the xl.meta
// Errors are generally ignored by this function.
614 func (er *erasureObjects) checkAbandonedParts(ctx context.Context, bucket string, object string, opts madmin.HealOpts) (err error) { 615 if !opts.Remove || opts.DryRun { 616 return nil 617 } 618 if globalTrace.NumSubscribers(madmin.TraceHealing) > 0 { 619 startTime := time.Now() 620 defer func() { 621 healTrace(healingMetricCheckAbandonedParts, startTime, bucket, object, nil, err, nil) 622 }() 623 } 624 if !opts.NoLock { 625 lk := er.NewNSLock(bucket, object) 626 lkctx, err := lk.GetLock(ctx, globalOperationTimeout) 627 if err != nil { 628 return err 629 } 630 ctx = lkctx.Context() 631 defer lk.Unlock(lkctx) 632 } 633 var wg sync.WaitGroup 634 for _, disk := range er.getDisks() { 635 if disk != nil { 636 wg.Add(1) 637 go func(disk StorageAPI) { 638 defer wg.Done() 639 _ = disk.CleanAbandonedData(ctx, bucket, object) 640 }(disk) 641 } 642 } 643 wg.Wait() 644 return nil 645 } 646 647 // healObjectDir - heals object directory specifically, this special call 648 // is needed since we do not have a special backend format for directories. 
func (er *erasureObjects) healObjectDir(ctx context.Context, bucket, object string, dryRun bool, remove bool) (hr madmin.HealResultItem, err error) {
	storageDisks := er.getDisks()
	storageEndpoints := er.getEndpoints()

	// Initialize heal result object
	hr = madmin.HealResultItem{
		Type:         madmin.HealItemObject,
		Bucket:       bucket,
		Object:       object,
		DiskCount:    len(storageDisks),
		ParityBlocks: er.defaultParityCount,
		DataBlocks:   len(storageDisks) - er.defaultParityCount,
		ObjectSize:   0,
	}

	hr.Before.Drives = make([]madmin.HealDriveInfo, len(storageDisks))
	hr.After.Drives = make([]madmin.HealDriveInfo, len(storageDisks))

	errs := statAllDirs(ctx, storageDisks, bucket, object)
	danglingObject := isObjectDirDangling(errs)
	if danglingObject {
		if !dryRun && remove {
			var wg sync.WaitGroup
			// Remove versions in bulk for each disk
			for index, disk := range storageDisks {
				if disk == nil {
					continue
				}
				wg.Add(1)
				go func(index int, disk StorageAPI) {
					defer wg.Done()
					// Best-effort, non-recursive delete; errors ignored.
					_ = disk.Delete(ctx, bucket, object, DeleteOptions{
						Recursive: false,
						Immediate: false,
					})
				}(index, disk)
			}
			wg.Wait()
		}
	}

	// Prepare object creation in all disks
	for i, err := range errs {
		drive := storageEndpoints[i].String()
		switch err {
		case nil:
			hr.Before.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateOk}
			hr.After.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateOk}
		case errDiskNotFound:
			// NOTE(review): Endpoint is not populated for offline drives
			// here, unlike the other branches — confirm this is intentional.
			hr.Before.Drives[i] = madmin.HealDriveInfo{State: madmin.DriveStateOffline}
			hr.After.Drives[i] = madmin.HealDriveInfo{State: madmin.DriveStateOffline}
		case errVolumeNotFound, errFileNotFound:
			// Bucket or prefix/directory not found
			hr.Before.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateMissing}
			hr.After.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateMissing}
		default:
			hr.Before.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateCorrupt}
			hr.After.Drives[i] = madmin.HealDriveInfo{Endpoint: drive, State: madmin.DriveStateCorrupt}
		}
	}
	if danglingObject || isAllNotFound(errs) {
		// Nothing to do, file is already gone.
		return hr, errFileNotFound
	}

	if dryRun {
		// Quit without try to heal the object dir
		return hr, nil
	}

	for i, err := range errs {
		if err == errVolumeNotFound || err == errFileNotFound {
			// Bucket or prefix/directory not found
			merr := storageDisks[i].MakeVol(ctx, pathJoin(bucket, object))
			switch merr {
			case nil, errVolumeExists:
				hr.After.Drives[i].State = madmin.DriveStateOk
			case errDiskNotFound:
				hr.After.Drives[i].State = madmin.DriveStateOffline
			default:
				hr.After.Drives[i].State = madmin.DriveStateCorrupt
			}
		}
	}
	return hr, nil
}

// Populates default heal result item entries with possible values when we are returning prematurely.
// This is to ensure that in any circumstance we are not returning empty arrays with wrong values.
func (er *erasureObjects) defaultHealResult(lfi FileInfo, storageDisks []StorageAPI, storageEndpoints []Endpoint, errs []error, bucket, object, versionID string) madmin.HealResultItem {
	// Initialize heal result object
	result := madmin.HealResultItem{
		Type:       madmin.HealItemObject,
		Bucket:     bucket,
		Object:     object,
		ObjectSize: lfi.Size,
		VersionID:  versionID,
		DiskCount:  len(storageDisks),
	}

	if lfi.IsValid() {
		result.ParityBlocks = lfi.Erasure.ParityBlocks
	} else {
		// Default to most common configuration for erasure blocks.
		result.ParityBlocks = er.defaultParityCount
	}
	result.DataBlocks = len(storageDisks) - result.ParityBlocks

	for index, disk := range storageDisks {
		if disk == nil {
			// Drive handle unavailable - report it offline in both
			// Before and After states.
			result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
				UUID:     "",
				Endpoint: storageEndpoints[index].String(),
				State:    madmin.DriveStateOffline,
			})
			result.After.Drives = append(result.After.Drives, madmin.HealDriveInfo{
				UUID:     "",
				Endpoint: storageEndpoints[index].String(),
				State:    madmin.DriveStateOffline,
			})
			continue
		}
		driveState := madmin.DriveStateCorrupt
		switch errs[index] {
		case errFileNotFound, errVolumeNotFound:
			driveState = madmin.DriveStateMissing
		case nil:
			driveState = madmin.DriveStateOk
		}
		result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
			UUID:     "",
			Endpoint: storageEndpoints[index].String(),
			State:    driveState,
		})
		result.After.Drives = append(result.After.Drives, madmin.HealDriveInfo{
			UUID:     "",
			Endpoint: storageEndpoints[index].String(),
			State:    driveState,
		})
	}

	return result
}

// Stat all directories.
// Returns one error slot per drive: nil when the directory exists and is
// empty, errVolumeNotEmpty when it has entries, or the underlying error.
func statAllDirs(ctx context.Context, storageDisks []StorageAPI, bucket, prefix string) []error {
	g := errgroup.WithNErrs(len(storageDisks))
	for index, disk := range storageDisks {
		if disk == nil {
			continue
		}
		index := index
		g.Go(func() error {
			// count=1 is enough: we only care whether anything exists.
			entries, err := storageDisks[index].ListDir(ctx, "", bucket, prefix, 1)
			if err != nil {
				return err
			}
			if len(entries) > 0 {
				return errVolumeNotEmpty
			}
			return nil
		}, index)
	}

	return g.Wait()
}

// isAllVolumeNotFound returns true when every error in errs is errVolumeNotFound.
func isAllVolumeNotFound(errs []error) bool {
	return countErrs(errs, errVolumeNotFound) == len(errs)
}

// isAllNotFound will return if any element of the error slice is not
// errFileNotFound, errFileVersionNotFound or errVolumeNotFound.
822 // A 0 length slice will always return false. 823 func isAllNotFound(errs []error) bool { 824 for _, err := range errs { 825 if err != nil { 826 switch err.Error() { 827 case errFileNotFound.Error(): 828 fallthrough 829 case errVolumeNotFound.Error(): 830 fallthrough 831 case errFileVersionNotFound.Error(): 832 continue 833 } 834 } 835 return false 836 } 837 return len(errs) > 0 838 } 839 840 // isAllBucketsNotFound will return true if all the errors are either errFileNotFound 841 // or errFileCorrupt 842 // A 0 length slice will always return false. 843 func isAllBucketsNotFound(errs []error) bool { 844 if len(errs) == 0 { 845 return false 846 } 847 notFoundCount := 0 848 for _, err := range errs { 849 if err != nil { 850 if errors.Is(err, errVolumeNotFound) { 851 notFoundCount++ 852 } else if isErrBucketNotFound(err) { 853 notFoundCount++ 854 } 855 } 856 } 857 return len(errs) == notFoundCount 858 } 859 860 // ObjectDir is considered dangling/corrupted if any only 861 // if total disks - a combination of corrupted and missing 862 // files is lesser than N/2+1 number of disks. 863 // If no files were found false will be returned. 864 func isObjectDirDangling(errs []error) (ok bool) { 865 var found int 866 var notFound int 867 var foundNotEmpty int 868 var otherFound int 869 for _, readErr := range errs { 870 switch { 871 case readErr == nil: 872 found++ 873 case readErr == errFileNotFound || readErr == errVolumeNotFound: 874 notFound++ 875 case readErr == errVolumeNotEmpty: 876 foundNotEmpty++ 877 default: 878 otherFound++ 879 } 880 } 881 found = found + foundNotEmpty + otherFound 882 return found < notFound && found > 0 883 } 884 885 // Object is considered dangling/corrupted if and only 886 // if total disks - a combination of corrupted and missing 887 // files is lesser than number of data blocks. 
func isObjectDangling(metaArr []FileInfo, errs []error, dataErrs []error) (validMeta FileInfo, ok bool) {
	// We can consider an object data not reliable
	// when xl.meta is not found in read quorum disks.
	// or when xl.meta is not readable in read quorum disks.
	danglingErrsCount := func(cerrs []error) (int, int) {
		var (
			notFoundCount      int
			nonActionableCount int
		)
		for _, readErr := range cerrs {
			if readErr == nil {
				continue
			}
			switch {
			case errors.Is(readErr, errFileNotFound) || errors.Is(readErr, errFileVersionNotFound):
				notFoundCount++
			default:
				// All other errors are non-actionable
				nonActionableCount++
			}
		}
		return notFoundCount, nonActionableCount
	}

	notFoundMetaErrs, nonActionableMetaErrs := danglingErrsCount(errs)
	notFoundPartsErrs, nonActionablePartsErrs := danglingErrsCount(dataErrs)

	// Pick the first valid metadata as reference.
	for _, m := range metaArr {
		if m.IsValid() {
			validMeta = m
			break
		}
	}

	if !validMeta.IsValid() {
		// validMeta is invalid because all xl.meta is missing apparently
		// we should figure out if dataDirs are also missing > dataBlocks.
		dataBlocks := (len(dataErrs) + 1) / 2
		if notFoundPartsErrs > dataBlocks {
			// Not using parity to ensure that we do not delete
			// any valid content, if any is recoverable. But if
			// notFoundDataDirs are already greater than the data
			// blocks all bets are off and it is safe to purge.
			//
			// This is purely a defensive code, ideally parityBlocks
			// is sufficient, however we can't know that since we
			// do have the FileInfo{}.
			return validMeta, true
		}

		// We have no idea what this file is, leave it as is.
		return validMeta, false
	}

	// Any non-actionable error (offline drive, timeout, ...) means the
	// picture is incomplete - never declare dangling on partial data.
	if nonActionableMetaErrs > 0 || nonActionablePartsErrs > 0 {
		return validMeta, false
	}

	if validMeta.Deleted {
		// notFoundPartsErrs is ignored since
		// - delete marker does not have any parts
		dataBlocks := (len(errs) + 1) / 2
		return validMeta, notFoundMetaErrs > dataBlocks
	}

	// TODO: It is possible to replay the object via just single
	// xl.meta file, considering quorum number of data-dirs are still
	// present on other drives.
	//
	// However this requires a bit of a rewrite, leave this up for
	// future work.
	if notFoundMetaErrs > 0 && notFoundMetaErrs > validMeta.Erasure.ParityBlocks {
		// All xl.meta is beyond data blocks missing, this is dangling
		return validMeta, true
	}

	if !validMeta.IsRemote() && notFoundPartsErrs > 0 && notFoundPartsErrs > validMeta.Erasure.ParityBlocks {
		// All data-dir is beyond data blocks missing, this is dangling
		return validMeta, true
	}

	return validMeta, false
}

// HealObject - heal the given object, automatically deletes the object if stale/corrupted if `remove` is true.
func (er erasureObjects) HealObject(ctx context.Context, bucket, object, versionID string, opts madmin.HealOpts) (hr madmin.HealResultItem, err error) {
	// Create context that also contains information about the object and bucket.
	// The top level handler might not have this information.
	reqInfo := logger.GetReqInfo(ctx)
	var newReqInfo *logger.ReqInfo
	if reqInfo != nil {
		newReqInfo = logger.NewReqInfo(reqInfo.RemoteHost, reqInfo.UserAgent, reqInfo.DeploymentID, reqInfo.RequestID, reqInfo.API, bucket, object)
	} else {
		newReqInfo = logger.NewReqInfo("", "", globalDeploymentID(), "", "Heal", bucket, object)
	}
	healCtx := logger.SetReqInfo(GlobalContext, newReqInfo)

	// Healing directories handle it separately.
	if HasSuffix(object, SlashSeparator) {
		hr, err := er.healObjectDir(healCtx, bucket, object, opts.DryRun, opts.Remove)
		return hr, toObjectErr(err, bucket, object)
	}

	storageDisks := er.getDisks()
	storageEndpoints := er.getEndpoints()

	// When versionID is empty, we read directly from the `null` versionID for healing.
	if versionID == "" {
		versionID = nullVersionID
	}

	// Perform quick read without lock.
	// This allows to quickly check if all is ok or all are missing.
	_, errs := readAllFileInfo(healCtx, storageDisks, "", bucket, object, versionID, false, false)
	if isAllNotFound(errs) {
		err := errFileNotFound
		if versionID != "" {
			err = errFileVersionNotFound
		}
		// Nothing to do, file is already gone.
		return er.defaultHealResult(FileInfo{}, storageDisks, storageEndpoints,
			errs, bucket, object, versionID), toObjectErr(err, bucket, object, versionID)
	}

	// Heal the object.
	hr, err = er.healObject(healCtx, bucket, object, versionID, opts)
	if errors.Is(err, errFileCorrupt) && opts.ScanMode != madmin.HealDeepScan {
		// Instead of returning an error when a bitrot error is detected
		// during a normal heal scan, heal again with bitrot flag enabled.
		opts.ScanMode = madmin.HealDeepScan
		hr, err = er.healObject(healCtx, bucket, object, versionID, opts)
	}
	return hr, toObjectErr(err, bucket, object, versionID)
}

// healTrace sends healing results to trace output.
func healTrace(funcName healingMetric, startTime time.Time, bucket, object string, opts *madmin.HealOpts, err error, result *madmin.HealResultItem) {
	tr := madmin.TraceInfo{
		TraceType: madmin.TraceHealing,
		Time:      startTime,
		NodeName:  globalLocalNodeName,
		FuncName:  "heal." + funcName.String(),
		Duration:  time.Since(startTime),
		Path:      pathJoin(bucket, decodeDirObject(object)),
	}
	if opts != nil {
		tr.Custom = map[string]string{
			"dry":    fmt.Sprint(opts.DryRun),
			"remove": fmt.Sprint(opts.Remove),
			"mode":   fmt.Sprint(opts.ScanMode),
		}
		if result != nil {
			tr.Custom["version-id"] = result.VersionID
			tr.Custom["disks"] = strconv.Itoa(result.DiskCount)
		}
	}
	if err != nil {
		// On error, only the error string is traced - not the result.
		tr.Error = err.Error()
	} else {
		tr.HealResult = result
	}
	globalTrace.Publish(tr)
}