github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/admin-heal-ops.go (about) 1 // Copyright (c) 2015-2021 MinIO, Inc. 2 // 3 // This file is part of MinIO Object Storage stack 4 // 5 // This program is free software: you can redistribute it and/or modify 6 // it under the terms of the GNU Affero General Public License as published by 7 // the Free Software Foundation, either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU Affero General Public License for more details. 14 // 15 // You should have received a copy of the GNU Affero General Public License 16 // along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 package cmd 19 20 import ( 21 "context" 22 "encoding/json" 23 "errors" 24 "fmt" 25 "net/http" 26 "sort" 27 "sync" 28 "time" 29 30 "github.com/minio/madmin-go/v3" 31 xioutil "github.com/minio/minio/internal/ioutil" 32 "github.com/minio/minio/internal/logger" 33 ) 34 35 // healStatusSummary - overall short summary of a healing sequence 36 type healStatusSummary string 37 38 // healStatusSummary constants 39 const ( 40 healNotStartedStatus healStatusSummary = "not started" 41 healRunningStatus = "running" 42 healStoppedStatus = "stopped" 43 healFinishedStatus = "finished" 44 ) 45 46 const ( 47 // a heal sequence with this many un-consumed heal result 48 // items blocks until heal-status consumption resumes or is 49 // aborted due to timeout. 50 maxUnconsumedHealResultItems = 1000 51 52 // if no heal-results are consumed (via the heal-status API) 53 // for this timeout duration, the heal sequence is aborted. 54 healUnconsumedTimeout = 24 * time.Hour 55 56 // time-duration to keep heal sequence state after it 57 // completes. 58 keepHealSeqStateDuration = time.Minute * 10 59 60 // nopHeal is a no operating healing action to 61 // wait for the current healing operation to finish 62 nopHeal = "" 63 ) 64 65 var ( 66 errHealIdleTimeout = fmt.Errorf("healing results were not consumed for too long") 67 errHealStopSignalled = fmt.Errorf("heal stop signaled") 68 69 errFnHealFromAPIErr = func(ctx context.Context, err error) error { 70 apiErr := toAdminAPIErr(ctx, err) 71 return fmt.Errorf("Heal internal error: %s: %s", 72 apiErr.Code, apiErr.Description) 73 } 74 ) 75 76 // healSequenceStatus - accumulated status of the heal sequence 77 type healSequenceStatus struct { 78 // summary and detail for failures 79 Summary healStatusSummary `json:"Summary"` 80 FailureDetail string `json:"Detail,omitempty"` 81 StartTime time.Time `json:"StartTime"` 82 83 // settings for the heal sequence 84 HealSettings madmin.HealOpts `json:"Settings"` 85 86 // slice of available heal result records 87 Items []madmin.HealResultItem `json:"Items"` 88 } 89 90 // structure to hold state of all heal sequences in server memory 91 type allHealState struct { 92 sync.RWMutex 93 94 // map of heal path to heal sequence 95 healSeqMap map[string]*healSequence // Indexed by endpoint 96 // keep track of the healing status of disks in the memory 97 // false: the disk needs to be healed but no healing routine is started 98 // true: the disk is currently healing 99 healLocalDisks map[Endpoint]bool 100 healStatus map[string]healingTracker // Indexed by disk ID 101 } 102 103 // newHealState - initialize global heal state management 104 func newHealState(ctx context.Context, cleanup bool) *allHealState { 105 hstate := &allHealState{ 106 healSeqMap: make(map[string]*healSequence), 107 healLocalDisks: make(map[Endpoint]bool), 108 healStatus: make(map[string]healingTracker), 109 } 110 if cleanup { 111 go hstate.periodicHealSeqsClean(ctx) 112 } 113 return hstate 114 } 115 116 func (ahs *allHealState) popHealLocalDisks(healLocalDisks ...Endpoint) { 117 ahs.Lock() 118 defer ahs.Unlock() 119 120 for _, ep := range healLocalDisks { 121 delete(ahs.healLocalDisks, ep) 122 } 123 for id, disk := range ahs.healStatus { 124 for _, ep := range healLocalDisks { 125 if disk.Endpoint == ep.String() { 126 delete(ahs.healStatus, id) 127 } 128 } 129 } 130 } 131 132 // updateHealStatus will update the heal status. 133 func (ahs *allHealState) updateHealStatus(tracker *healingTracker) { 134 ahs.Lock() 135 defer ahs.Unlock() 136 137 tracker.mu.RLock() 138 t := *tracker 139 t.QueuedBuckets = append(make([]string, 0, len(tracker.QueuedBuckets)), tracker.QueuedBuckets...) 140 t.HealedBuckets = append(make([]string, 0, len(tracker.HealedBuckets)), tracker.HealedBuckets...) 141 ahs.healStatus[tracker.ID] = t 142 tracker.mu.RUnlock() 143 } 144 145 // Sort by zone, set and disk index 146 func sortDisks(disks []madmin.Disk) { 147 sort.Slice(disks, func(i, j int) bool { 148 a, b := &disks[i], &disks[j] 149 if a.PoolIndex != b.PoolIndex { 150 return a.PoolIndex < b.PoolIndex 151 } 152 if a.SetIndex != b.SetIndex { 153 return a.SetIndex < b.SetIndex 154 } 155 return a.DiskIndex < b.DiskIndex 156 }) 157 } 158 159 // getLocalHealingDisks returns local healing disks indexed by endpoint. 160 func (ahs *allHealState) getLocalHealingDisks() map[string]madmin.HealingDisk { 161 ahs.RLock() 162 defer ahs.RUnlock() 163 dst := make(map[string]madmin.HealingDisk, len(ahs.healStatus)) 164 for _, v := range ahs.healStatus { 165 dst[v.Endpoint] = v.toHealingDisk() 166 } 167 168 return dst 169 } 170 171 // getHealLocalDiskEndpoints() returns the list of disks that need 172 // to be healed but there is no healing routine in progress on them. 173 func (ahs *allHealState) getHealLocalDiskEndpoints() Endpoints { 174 ahs.RLock() 175 defer ahs.RUnlock() 176 177 var endpoints Endpoints 178 for ep, healing := range ahs.healLocalDisks { 179 if !healing { 180 endpoints = append(endpoints, ep) 181 } 182 } 183 return endpoints 184 } 185 186 // Set, in the memory, the state of the disk as currently healing or not 187 func (ahs *allHealState) setDiskHealingStatus(ep Endpoint, healing bool) { 188 ahs.Lock() 189 defer ahs.Unlock() 190 191 ahs.healLocalDisks[ep] = healing 192 } 193 194 func (ahs *allHealState) pushHealLocalDisks(healLocalDisks ...Endpoint) { 195 ahs.Lock() 196 defer ahs.Unlock() 197 198 for _, ep := range healLocalDisks { 199 ahs.healLocalDisks[ep] = false 200 } 201 } 202 203 func (ahs *allHealState) periodicHealSeqsClean(ctx context.Context) { 204 // Launch clean-up routine to remove this heal sequence (after 205 // it ends) from the global state after timeout has elapsed. 206 periodicTimer := time.NewTimer(time.Minute * 5) 207 defer periodicTimer.Stop() 208 209 for { 210 select { 211 case <-periodicTimer.C: 212 now := UTCNow() 213 ahs.Lock() 214 for path, h := range ahs.healSeqMap { 215 if h.hasEnded() && h.endTime.Add(keepHealSeqStateDuration).Before(now) { 216 delete(ahs.healSeqMap, path) 217 } 218 } 219 ahs.Unlock() 220 221 periodicTimer.Reset(time.Minute * 5) 222 case <-ctx.Done(): 223 // server could be restarting - need 224 // to exit immediately 225 return 226 } 227 } 228 } 229 230 // getHealSequenceByToken - Retrieve a heal sequence by token. The second 231 // argument returns if a heal sequence actually exists. 232 func (ahs *allHealState) getHealSequenceByToken(token string) (h *healSequence, exists bool) { 233 ahs.RLock() 234 defer ahs.RUnlock() 235 for _, healSeq := range ahs.healSeqMap { 236 if healSeq.clientToken == token { 237 return healSeq, true 238 } 239 } 240 return nil, false 241 } 242 243 // getHealSequence - Retrieve a heal sequence by path. The second 244 // argument returns if a heal sequence actually exists. 245 func (ahs *allHealState) getHealSequence(path string) (h *healSequence, exists bool) { 246 ahs.RLock() 247 defer ahs.RUnlock() 248 h, exists = ahs.healSeqMap[path] 249 return h, exists 250 } 251 252 func (ahs *allHealState) stopHealSequence(path string) ([]byte, APIError) { 253 var hsp madmin.HealStopSuccess 254 he, exists := ahs.getHealSequence(path) 255 if !exists { 256 hsp = madmin.HealStopSuccess{ 257 ClientToken: "unknown", 258 StartTime: UTCNow(), 259 } 260 } else { 261 clientToken := he.clientToken 262 if globalIsDistErasure { 263 clientToken = fmt.Sprintf("%s:%d", he.clientToken, GetProxyEndpointLocalIndex(globalProxyEndpoints)) 264 } 265 266 hsp = madmin.HealStopSuccess{ 267 ClientToken: clientToken, 268 ClientAddress: he.clientAddress, 269 StartTime: he.startTime, 270 } 271 272 he.stop() 273 for !he.hasEnded() { 274 time.Sleep(1 * time.Second) 275 } 276 ahs.Lock() 277 defer ahs.Unlock() 278 // Heal sequence explicitly stopped, remove it. 279 delete(ahs.healSeqMap, path) 280 } 281 282 b, err := json.Marshal(&hsp) 283 return b, toAdminAPIErr(GlobalContext, err) 284 } 285 286 // LaunchNewHealSequence - launches a background routine that performs 287 // healing according to the healSequence argument. For each heal 288 // sequence, state is stored in the `globalAllHealState`, which is a 289 // map of the heal path to `healSequence` which holds state about the 290 // heal sequence. 291 // 292 // Heal results are persisted in server memory for 293 // `keepHealSeqStateDuration`. This function also launches a 294 // background routine to clean up heal results after the 295 // aforementioned duration. 296 func (ahs *allHealState) LaunchNewHealSequence(h *healSequence, objAPI ObjectLayer) ( 297 respBytes []byte, apiErr APIError, errMsg string, 298 ) { 299 if h.forceStarted { 300 _, apiErr = ahs.stopHealSequence(pathJoin(h.bucket, h.object)) 301 if apiErr.Code != "" { 302 return respBytes, apiErr, "" 303 } 304 } else { 305 oh, exists := ahs.getHealSequence(pathJoin(h.bucket, h.object)) 306 if exists && !oh.hasEnded() { 307 errMsg = "Heal is already running on the given path " + 308 "(use force-start option to stop and start afresh). " + 309 fmt.Sprintf("The heal was started by IP %s at %s, token is %s", 310 h.clientAddress, h.startTime.Format(http.TimeFormat), h.clientToken) 311 return nil, errorCodes.ToAPIErr(ErrHealAlreadyRunning), errMsg 312 } 313 } 314 315 ahs.Lock() 316 defer ahs.Unlock() 317 318 // Check if new heal sequence to be started overlaps with any 319 // existing, running sequence 320 hpath := pathJoin(h.bucket, h.object) 321 for k, hSeq := range ahs.healSeqMap { 322 if !hSeq.hasEnded() && (HasPrefix(k, hpath) || HasPrefix(hpath, k)) { 323 errMsg = "The provided heal sequence path overlaps with an existing " + 324 fmt.Sprintf("heal path: %s", k) 325 return nil, errorCodes.ToAPIErr(ErrHealOverlappingPaths), errMsg 326 } 327 } 328 329 // Add heal state and start sequence 330 ahs.healSeqMap[hpath] = h 331 332 // Launch top-level background heal go-routine 333 go h.healSequenceStart(objAPI) 334 335 clientToken := h.clientToken 336 if globalIsDistErasure { 337 clientToken = fmt.Sprintf("%s:%d", h.clientToken, GetProxyEndpointLocalIndex(globalProxyEndpoints)) 338 } 339 340 b, err := json.Marshal(madmin.HealStartSuccess{ 341 ClientToken: clientToken, 342 ClientAddress: h.clientAddress, 343 StartTime: h.startTime, 344 }) 345 if err != nil { 346 logger.LogIf(h.ctx, err) 347 return nil, toAdminAPIErr(h.ctx, err), "" 348 } 349 return b, noError, "" 350 } 351 352 // PopHealStatusJSON - Called by heal-status API. It fetches the heal 353 // status results from global state and returns its JSON 354 // representation. The clientToken helps ensure there aren't 355 // conflicting clients fetching status. 356 func (ahs *allHealState) PopHealStatusJSON(hpath string, 357 clientToken string) ([]byte, APIErrorCode, 358 ) { 359 // fetch heal state for given path 360 h, exists := ahs.getHealSequence(hpath) 361 if !exists { 362 // heal sequence doesn't exist, must have finished. 363 jbytes, err := json.Marshal(healSequenceStatus{ 364 Summary: healFinishedStatus, 365 }) 366 return jbytes, toAdminAPIErrCode(GlobalContext, err) 367 } 368 369 // Check if client-token is valid 370 if clientToken != h.clientToken { 371 return nil, ErrHealInvalidClientToken 372 } 373 374 // Take lock to access and update the heal-sequence 375 h.mutex.Lock() 376 defer h.mutex.Unlock() 377 378 numItems := len(h.currentStatus.Items) 379 380 // calculate index of most recently available heal result 381 // record. 382 lastResultIndex := h.lastSentResultIndex 383 if numItems > 0 { 384 lastResultIndex = h.currentStatus.Items[numItems-1].ResultIndex 385 } 386 387 h.lastSentResultIndex = lastResultIndex 388 389 jbytes, err := json.Marshal(h.currentStatus) 390 if err != nil { 391 h.currentStatus.Items = nil 392 393 logger.LogIf(h.ctx, err) 394 return nil, ErrInternalError 395 } 396 397 h.currentStatus.Items = nil 398 399 return jbytes, ErrNone 400 } 401 402 // healSource denotes single entity and heal option. 403 type healSource struct { 404 bucket string 405 object string 406 versionID string 407 noWait bool // a non blocking call, if task queue is full return right away. 408 opts *madmin.HealOpts // optional heal option overrides default setting 409 } 410 411 // healSequence - state for each heal sequence initiated on the 412 // server. 413 type healSequence struct { 414 // bucket, and object on which heal seq. was initiated 415 bucket, object string 416 417 // Report healing progress 418 reportProgress bool 419 420 // time at which heal sequence was started 421 startTime time.Time 422 423 // time at which heal sequence has ended 424 endTime time.Time 425 426 // Heal client info 427 clientToken, clientAddress string 428 429 // was this heal sequence force started? 430 forceStarted bool 431 432 // heal settings applied to this heal sequence 433 settings madmin.HealOpts 434 435 // current accumulated status of the heal sequence 436 currentStatus healSequenceStatus 437 438 // channel signaled by background routine when traversal has 439 // completed 440 traverseAndHealDoneCh chan error 441 442 // canceler to cancel heal sequence. 443 cancelCtx context.CancelFunc 444 445 // the last result index sent to client 446 lastSentResultIndex int64 447 448 // Number of total items scanned against item type 449 scannedItemsMap map[madmin.HealItemType]int64 450 451 // Number of total items healed against item type 452 healedItemsMap map[madmin.HealItemType]int64 453 454 // Number of total items where healing failed against endpoint and drive state 455 healFailedItemsMap map[string]int64 456 457 // The time of the last scan/heal activity 458 lastHealActivity time.Time 459 460 // Holds the request-info for logging 461 ctx context.Context 462 463 // used to lock this structure as it is concurrently accessed 464 mutex sync.RWMutex 465 } 466 467 // NewHealSequence - creates healSettings, assumes bucket and 468 // objPrefix are already validated. 469 func newHealSequence(ctx context.Context, bucket, objPrefix, clientAddr string, 470 hs madmin.HealOpts, forceStart bool, 471 ) *healSequence { 472 reqInfo := &logger.ReqInfo{RemoteHost: clientAddr, API: "Heal", BucketName: bucket} 473 reqInfo.AppendTags("prefix", objPrefix) 474 ctx, cancel := context.WithCancel(logger.SetReqInfo(ctx, reqInfo)) 475 476 clientToken := mustGetUUID() 477 478 return &healSequence{ 479 bucket: bucket, 480 object: objPrefix, 481 reportProgress: true, 482 startTime: UTCNow(), 483 clientToken: clientToken, 484 clientAddress: clientAddr, 485 forceStarted: forceStart, 486 settings: hs, 487 currentStatus: healSequenceStatus{ 488 Summary: healNotStartedStatus, 489 HealSettings: hs, 490 }, 491 traverseAndHealDoneCh: make(chan error), 492 cancelCtx: cancel, 493 ctx: ctx, 494 scannedItemsMap: make(map[madmin.HealItemType]int64), 495 healedItemsMap: make(map[madmin.HealItemType]int64), 496 healFailedItemsMap: make(map[string]int64), 497 } 498 } 499 500 // getScannedItemsCount - returns a count of all scanned items 501 func (h *healSequence) getScannedItemsCount() int64 { 502 var count int64 503 h.mutex.RLock() 504 defer h.mutex.RUnlock() 505 506 for _, v := range h.scannedItemsMap { 507 count += v 508 } 509 return count 510 } 511 512 // getScannedItemsMap - returns map of all scanned items against type 513 func (h *healSequence) getScannedItemsMap() map[madmin.HealItemType]int64 { 514 h.mutex.RLock() 515 defer h.mutex.RUnlock() 516 517 // Make a copy before returning the value 518 retMap := make(map[madmin.HealItemType]int64, len(h.scannedItemsMap)) 519 for k, v := range h.scannedItemsMap { 520 retMap[k] = v 521 } 522 523 return retMap 524 } 525 526 // getHealedItemsMap - returns the map of all healed items against type 527 func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 { 528 h.mutex.RLock() 529 defer h.mutex.RUnlock() 530 531 // Make a copy before returning the value 532 retMap := make(map[madmin.HealItemType]int64, len(h.healedItemsMap)) 533 for k, v := range h.healedItemsMap { 534 retMap[k] = v 535 } 536 537 return retMap 538 } 539 540 // gethealFailedItemsMap - returns map of all items where heal failed against 541 // drive endpoint and status 542 func (h *healSequence) gethealFailedItemsMap() map[string]int64 { 543 h.mutex.RLock() 544 defer h.mutex.RUnlock() 545 546 // Make a copy before returning the value 547 retMap := make(map[string]int64, len(h.healFailedItemsMap)) 548 for k, v := range h.healFailedItemsMap { 549 retMap[k] = v 550 } 551 552 return retMap 553 } 554 555 // isQuitting - determines if the heal sequence is quitting (due to an 556 // external signal) 557 func (h *healSequence) isQuitting() bool { 558 select { 559 case <-h.ctx.Done(): 560 return true 561 default: 562 return false 563 } 564 } 565 566 // check if the heal sequence has ended 567 func (h *healSequence) hasEnded() bool { 568 h.mutex.RLock() 569 defer h.mutex.RUnlock() 570 // background heal never ends 571 if h.clientToken == bgHealingUUID { 572 return false 573 } 574 return !h.endTime.IsZero() 575 } 576 577 // stops the heal sequence - safe to call multiple times. 578 func (h *healSequence) stop() { 579 h.cancelCtx() 580 } 581 582 // pushHealResultItem - pushes a heal result item for consumption in 583 // the heal-status API. It blocks if there are 584 // maxUnconsumedHealResultItems. When it blocks, the heal sequence 585 // routine is effectively paused - this happens when the server has 586 // accumulated the maximum number of heal records per heal 587 // sequence. When the client consumes further records, the heal 588 // sequence automatically resumes. The return value indicates if the 589 // operation succeeded. 590 func (h *healSequence) pushHealResultItem(r madmin.HealResultItem) error { 591 // start a timer to keep an upper time limit to find an empty 592 // slot to add the given heal result - if no slot is found it 593 // means that the server is holding the maximum amount of 594 // heal-results in memory and the client has not consumed it 595 // for too long. 596 unconsumedTimer := time.NewTimer(healUnconsumedTimeout) 597 defer unconsumedTimer.Stop() 598 599 var itemsLen int 600 for { 601 h.mutex.Lock() 602 itemsLen = len(h.currentStatus.Items) 603 if itemsLen == maxUnconsumedHealResultItems { 604 // wait for a second, or quit if an external 605 // stop signal is received or the 606 // unconsumedTimer fires. 607 select { 608 // Check after a second 609 case <-time.After(time.Second): 610 h.mutex.Unlock() 611 continue 612 613 case <-h.ctx.Done(): 614 h.mutex.Unlock() 615 // discard result and return. 616 return errHealStopSignalled 617 618 // Timeout if no results consumed for too long. 619 case <-unconsumedTimer.C: 620 h.mutex.Unlock() 621 return errHealIdleTimeout 622 } 623 } 624 break 625 } 626 627 // Set the correct result index for the new result item 628 if itemsLen > 0 { 629 r.ResultIndex = 1 + h.currentStatus.Items[itemsLen-1].ResultIndex 630 } else { 631 r.ResultIndex = 1 + h.lastSentResultIndex 632 } 633 634 // append to results 635 h.currentStatus.Items = append(h.currentStatus.Items, r) 636 637 // release lock 638 h.mutex.Unlock() 639 640 return nil 641 } 642 643 // healSequenceStart - this is the top-level background heal 644 // routine. It launches another go-routine that actually traverses 645 // on-disk data, checks and heals according to the selected 646 // settings. This go-routine itself, (1) monitors the traversal 647 // routine for completion, and (2) listens for external stop 648 // signals. When either event happens, it sets the finish status for 649 // the heal-sequence. 650 func (h *healSequence) healSequenceStart(objAPI ObjectLayer) { 651 // Set status as running 652 h.mutex.Lock() 653 h.currentStatus.Summary = healRunningStatus 654 h.currentStatus.StartTime = UTCNow() 655 h.mutex.Unlock() 656 657 go h.traverseAndHeal(objAPI) 658 659 select { 660 case err, ok := <-h.traverseAndHealDoneCh: 661 if !ok { 662 return 663 } 664 h.mutex.Lock() 665 h.endTime = UTCNow() 666 // Heal traversal is complete. 667 if err == nil { 668 // heal traversal succeeded. 669 h.currentStatus.Summary = healFinishedStatus 670 } else { 671 // heal traversal had an error. 672 h.currentStatus.Summary = healStoppedStatus 673 h.currentStatus.FailureDetail = err.Error() 674 } 675 h.mutex.Unlock() 676 case <-h.ctx.Done(): 677 h.mutex.Lock() 678 h.endTime = UTCNow() 679 h.currentStatus.Summary = healFinishedStatus 680 h.mutex.Unlock() 681 682 // drain traverse channel so the traversal 683 // go-routine does not leak. 684 go func() { 685 // Eventually the traversal go-routine closes 686 // the channel and returns, so this go-routine 687 // itself will not leak. 688 <-h.traverseAndHealDoneCh 689 }() 690 } 691 } 692 693 func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItemType) error { 694 // Send heal request 695 task := healTask{ 696 bucket: source.bucket, 697 object: source.object, 698 versionID: source.versionID, 699 opts: h.settings, 700 } 701 if source.opts != nil { 702 task.opts = *source.opts 703 } else { 704 task.opts.ScanMode = madmin.HealNormalScan 705 } 706 707 h.mutex.Lock() 708 h.scannedItemsMap[healType]++ 709 h.lastHealActivity = UTCNow() 710 h.mutex.Unlock() 711 712 if source.noWait { 713 select { 714 case globalBackgroundHealRoutine.tasks <- task: 715 if serverDebugLog { 716 fmt.Printf("Task in the queue: %#v\n", task) 717 } 718 default: 719 // task queue is full, no more workers, we shall move on and heal later. 720 return nil 721 } 722 // Don't wait for result 723 return nil 724 } 725 726 // respCh must be set to wait for result. 727 // We make it size 1, so a result can always be written 728 // even if we aren't listening. 729 task.respCh = make(chan healResult, 1) 730 select { 731 case globalBackgroundHealRoutine.tasks <- task: 732 if serverDebugLog { 733 fmt.Printf("Task in the queue: %#v\n", task) 734 } 735 case <-h.ctx.Done(): 736 return nil 737 } 738 739 // task queued, now wait for the response. 740 select { 741 case res := <-task.respCh: 742 if !h.reportProgress { 743 if errors.Is(res.err, errSkipFile) { // this is only sent usually by nopHeal 744 return nil 745 } 746 747 h.mutex.Lock() 748 defer h.mutex.Unlock() 749 750 // Progress is not reported in case of background heal processing. 751 // Instead we increment relevant counter based on the heal result 752 // for prometheus reporting. 753 if res.err != nil { 754 for _, d := range res.result.After.Drives { 755 // For failed items we report the endpoint and drive state 756 // This will help users take corrective actions for drives 757 h.healFailedItemsMap[d.Endpoint+","+d.State]++ 758 } 759 } else { 760 // Only object type reported for successful healing 761 h.healedItemsMap[res.result.Type]++ 762 } 763 764 // Report caller of any failure 765 return res.err 766 } 767 res.result.Type = healType 768 if res.err != nil { 769 // Only report object error 770 if healType != madmin.HealItemObject { 771 return res.err 772 } 773 res.result.Detail = res.err.Error() 774 } 775 return h.pushHealResultItem(res.result) 776 case <-h.ctx.Done(): 777 return nil 778 } 779 } 780 781 func (h *healSequence) healDiskMeta(objAPI ObjectLayer) error { 782 // Start healing the config prefix. 783 return h.healMinioSysMeta(objAPI, minioConfigPrefix)() 784 } 785 786 func (h *healSequence) healItems(objAPI ObjectLayer, bucketsOnly bool) error { 787 if h.clientToken == bgHealingUUID { 788 // For background heal do nothing. 789 return nil 790 } 791 792 if err := h.healDiskMeta(objAPI); err != nil { 793 return err 794 } 795 796 // Heal buckets and objects 797 return h.healBuckets(objAPI, bucketsOnly) 798 } 799 800 // traverseAndHeal - traverses on-disk data and performs healing 801 // according to settings. At each "safe" point it also checks if an 802 // external quit signal has been received and quits if so. Since the 803 // healing traversal may be mutating on-disk data when an external 804 // quit signal is received, this routine cannot quit immediately and 805 // has to wait until a safe point is reached, such as between scanning 806 // two objects. 807 func (h *healSequence) traverseAndHeal(objAPI ObjectLayer) { 808 bucketsOnly := false // Heals buckets and objects also. 809 h.traverseAndHealDoneCh <- h.healItems(objAPI, bucketsOnly) 810 xioutil.SafeClose(h.traverseAndHealDoneCh) 811 } 812 813 // healMinioSysMeta - heals all files under a given meta prefix, returns a function 814 // which in-turn heals the respective meta directory path and any files in int. 815 func (h *healSequence) healMinioSysMeta(objAPI ObjectLayer, metaPrefix string) func() error { 816 return func() error { 817 // NOTE: Healing on meta is run regardless 818 // of any bucket being selected, this is to ensure that 819 // meta are always upto date and correct. 820 return objAPI.HealObjects(h.ctx, minioMetaBucket, metaPrefix, h.settings, func(bucket, object, versionID string, scanMode madmin.HealScanMode) error { 821 if h.isQuitting() { 822 return errHealStopSignalled 823 } 824 825 err := h.queueHealTask(healSource{ 826 bucket: bucket, 827 object: object, 828 versionID: versionID, 829 }, madmin.HealItemBucketMetadata) 830 return err 831 }) 832 } 833 } 834 835 // healBuckets - check for all buckets heal or just particular bucket. 836 func (h *healSequence) healBuckets(objAPI ObjectLayer, bucketsOnly bool) error { 837 if h.isQuitting() { 838 return errHealStopSignalled 839 } 840 841 // 1. If a bucket was specified, heal only the bucket. 842 if h.bucket != "" { 843 return h.healBucket(objAPI, h.bucket, bucketsOnly) 844 } 845 846 buckets, err := objAPI.ListBuckets(h.ctx, BucketOptions{}) 847 if err != nil { 848 return errFnHealFromAPIErr(h.ctx, err) 849 } 850 851 // Heal latest buckets first. 852 sort.Slice(buckets, func(i, j int) bool { 853 return buckets[i].Created.After(buckets[j].Created) 854 }) 855 856 for _, bucket := range buckets { 857 if err = h.healBucket(objAPI, bucket.Name, bucketsOnly); err != nil { 858 return err 859 } 860 } 861 862 return nil 863 } 864 865 // healBucket - traverses and heals given bucket 866 func (h *healSequence) healBucket(objAPI ObjectLayer, bucket string, bucketsOnly bool) error { 867 if err := h.queueHealTask(healSource{bucket: bucket}, madmin.HealItemBucket); err != nil { 868 return err 869 } 870 871 if bucketsOnly { 872 return nil 873 } 874 875 if !h.settings.Recursive { 876 if h.object != "" { 877 if err := h.healObject(bucket, h.object, "", h.settings.ScanMode); err != nil { 878 return err 879 } 880 } 881 882 return nil 883 } 884 885 if err := objAPI.HealObjects(h.ctx, bucket, h.object, h.settings, h.healObject); err != nil { 886 return errFnHealFromAPIErr(h.ctx, err) 887 } 888 return nil 889 } 890 891 // healObject - heal the given object and record result 892 func (h *healSequence) healObject(bucket, object, versionID string, scanMode madmin.HealScanMode) error { 893 if h.isQuitting() { 894 return errHealStopSignalled 895 } 896 897 err := h.queueHealTask(healSource{ 898 bucket: bucket, 899 object: object, 900 versionID: versionID, 901 opts: &h.settings, 902 }, madmin.HealItemObject) 903 904 // Wait and proceed if there are active requests 905 waitForLowHTTPReq() 906 907 return err 908 }