storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/cmd/admin-heal-ops.go (about) 1 /* 2 * MinIO Cloud Storage, (C) 2017 MinIO, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package cmd 18 19 import ( 20 "context" 21 "encoding/json" 22 "fmt" 23 "net/http" 24 "sort" 25 "sync" 26 "time" 27 28 "storj.io/minio/cmd/logger" 29 "storj.io/minio/pkg/madmin" 30 ) 31 32 // healStatusSummary - overall short summary of a healing sequence 33 type healStatusSummary string 34 35 // healStatusSummary constants 36 const ( 37 healNotStartedStatus healStatusSummary = "not started" 38 healRunningStatus = "running" 39 healStoppedStatus = "stopped" 40 healFinishedStatus = "finished" 41 ) 42 43 const ( 44 // a heal sequence with this many un-consumed heal result 45 // items blocks until heal-status consumption resumes or is 46 // aborted due to timeout. 47 maxUnconsumedHealResultItems = 1000 48 49 // if no heal-results are consumed (via the heal-status API) 50 // for this timeout duration, the heal sequence is aborted. 51 healUnconsumedTimeout = 24 * time.Hour 52 53 // time-duration to keep heal sequence state after it 54 // completes. 55 keepHealSeqStateDuration = time.Minute * 10 56 57 // nopHeal is a no operating healing action to 58 // wait for the current healing operation to finish 59 nopHeal = "" 60 ) 61 62 var ( 63 errHealIdleTimeout = fmt.Errorf("healing results were not consumed for too long") 64 errHealStopSignalled = fmt.Errorf("heal stop signaled") 65 66 errFnHealFromAPIErr = func(ctx context.Context, err error) error { 67 apiErr := toAdminAPIErr(ctx, err) 68 return fmt.Errorf("Heal internal error: %s: %s", 69 apiErr.Code, apiErr.Description) 70 } 71 ) 72 73 // healSequenceStatus - accumulated status of the heal sequence 74 type healSequenceStatus struct { 75 // summary and detail for failures 76 Summary healStatusSummary `json:"Summary"` 77 FailureDetail string `json:"Detail,omitempty"` 78 StartTime time.Time `json:"StartTime"` 79 80 // settings for the heal sequence 81 HealSettings madmin.HealOpts `json:"Settings"` 82 83 // slice of available heal result records 84 Items []madmin.HealResultItem `json:"Items"` 85 } 86 87 // structure to hold state of all heal sequences in server memory 88 type allHealState struct { 89 sync.RWMutex 90 91 // map of heal path to heal sequence 92 healSeqMap map[string]*healSequence // Indexed by endpoint 93 healLocalDisks map[Endpoint]struct{} 94 healStatus map[string]healingTracker // Indexed by disk ID 95 } 96 97 // newHealState - initialize global heal state management 98 func newHealState(cleanup bool) *allHealState { 99 hstate := &allHealState{ 100 healSeqMap: make(map[string]*healSequence), 101 healLocalDisks: map[Endpoint]struct{}{}, 102 healStatus: make(map[string]healingTracker), 103 } 104 if cleanup { 105 go hstate.periodicHealSeqsClean(GlobalContext) 106 } 107 return hstate 108 } 109 110 func (ahs *allHealState) healDriveCount() int { 111 ahs.RLock() 112 defer ahs.RUnlock() 113 114 return len(ahs.healLocalDisks) 115 } 116 117 func (ahs *allHealState) popHealLocalDisks(healLocalDisks ...Endpoint) { 118 ahs.Lock() 119 defer ahs.Unlock() 120 121 for _, ep := range healLocalDisks { 122 delete(ahs.healLocalDisks, ep) 123 } 124 for id, disk := range ahs.healStatus { 125 for _, ep := range healLocalDisks { 126 if disk.Endpoint == ep.String() { 127 delete(ahs.healStatus, id) 128 } 129 } 130 } 131 } 132 133 // updateHealStatus will update the heal status. 134 func (ahs *allHealState) updateHealStatus(tracker *healingTracker) { 135 ahs.Lock() 136 defer ahs.Unlock() 137 ahs.healStatus[tracker.ID] = *tracker 138 } 139 140 // Sort by zone, set and disk index 141 func sortDisks(disks []madmin.Disk) { 142 sort.Slice(disks, func(i, j int) bool { 143 a, b := &disks[i], &disks[j] 144 if a.PoolIndex != b.PoolIndex { 145 return a.PoolIndex < b.PoolIndex 146 } 147 if a.SetIndex != b.SetIndex { 148 return a.SetIndex < b.SetIndex 149 } 150 return a.DiskIndex < b.DiskIndex 151 }) 152 } 153 154 // getLocalHealingDisks returns local healing disks indexed by endpoint. 155 func (ahs *allHealState) getLocalHealingDisks() map[string]madmin.HealingDisk { 156 ahs.RLock() 157 defer ahs.RUnlock() 158 dst := make(map[string]madmin.HealingDisk, len(ahs.healStatus)) 159 for _, v := range ahs.healStatus { 160 dst[v.Endpoint] = v.toHealingDisk() 161 } 162 163 return dst 164 } 165 166 func (ahs *allHealState) getHealLocalDiskEndpoints() Endpoints { 167 ahs.RLock() 168 defer ahs.RUnlock() 169 170 var endpoints Endpoints 171 for ep := range ahs.healLocalDisks { 172 endpoints = append(endpoints, ep) 173 } 174 return endpoints 175 } 176 177 func (ahs *allHealState) pushHealLocalDisks(healLocalDisks ...Endpoint) { 178 ahs.Lock() 179 defer ahs.Unlock() 180 181 for _, ep := range healLocalDisks { 182 ahs.healLocalDisks[ep] = struct{}{} 183 } 184 } 185 186 func (ahs *allHealState) periodicHealSeqsClean(ctx context.Context) { 187 // Launch clean-up routine to remove this heal sequence (after 188 // it ends) from the global state after timeout has elapsed. 189 periodicTimer := time.NewTimer(time.Minute * 5) 190 defer periodicTimer.Stop() 191 192 for { 193 select { 194 case <-periodicTimer.C: 195 periodicTimer.Reset(time.Minute * 5) 196 now := UTCNow() 197 ahs.Lock() 198 for path, h := range ahs.healSeqMap { 199 if h.hasEnded() && h.endTime.Add(keepHealSeqStateDuration).Before(now) { 200 delete(ahs.healSeqMap, path) 201 } 202 } 203 ahs.Unlock() 204 case <-ctx.Done(): 205 // server could be restarting - need 206 // to exit immediately 207 return 208 } 209 } 210 } 211 212 // getHealSequenceByToken - Retrieve a heal sequence by token. The second 213 // argument returns if a heal sequence actually exists. 214 func (ahs *allHealState) getHealSequenceByToken(token string) (h *healSequence, exists bool) { 215 ahs.Lock() 216 defer ahs.Unlock() 217 for _, healSeq := range ahs.healSeqMap { 218 if healSeq.clientToken == token { 219 return healSeq, true 220 } 221 } 222 return nil, false 223 } 224 225 // getHealSequence - Retrieve a heal sequence by path. The second 226 // argument returns if a heal sequence actually exists. 227 func (ahs *allHealState) getHealSequence(path string) (h *healSequence, exists bool) { 228 ahs.Lock() 229 defer ahs.Unlock() 230 h, exists = ahs.healSeqMap[path] 231 return h, exists 232 } 233 234 func (ahs *allHealState) stopHealSequence(path string) ([]byte, APIError) { 235 var hsp madmin.HealStopSuccess 236 he, exists := ahs.getHealSequence(path) 237 if !exists { 238 hsp = madmin.HealStopSuccess{ 239 ClientToken: "unknown", 240 StartTime: UTCNow(), 241 } 242 } else { 243 clientToken := he.clientToken 244 if globalIsDistErasure { 245 clientToken = fmt.Sprintf("%s@%d", he.clientToken, GetProxyEndpointLocalIndex(globalProxyEndpoints)) 246 } 247 248 hsp = madmin.HealStopSuccess{ 249 ClientToken: clientToken, 250 ClientAddress: he.clientAddress, 251 StartTime: he.startTime, 252 } 253 254 he.stop() 255 for !he.hasEnded() { 256 time.Sleep(1 * time.Second) 257 } 258 ahs.Lock() 259 defer ahs.Unlock() 260 // Heal sequence explicitly stopped, remove it. 261 delete(ahs.healSeqMap, path) 262 } 263 264 b, err := json.Marshal(&hsp) 265 return b, toAdminAPIErr(GlobalContext, err) 266 } 267 268 // LaunchNewHealSequence - launches a background routine that performs 269 // healing according to the healSequence argument. For each heal 270 // sequence, state is stored in the `globalAllHealState`, which is a 271 // map of the heal path to `healSequence` which holds state about the 272 // heal sequence. 273 // 274 // Heal results are persisted in server memory for 275 // `keepHealSeqStateDuration`. This function also launches a 276 // background routine to clean up heal results after the 277 // aforementioned duration. 278 func (ahs *allHealState) LaunchNewHealSequence(h *healSequence, objAPI ObjectLayer) ( 279 respBytes []byte, apiErr APIError, errMsg string) { 280 281 if h.forceStarted { 282 _, apiErr = ahs.stopHealSequence(pathJoin(h.bucket, h.object)) 283 if apiErr.Code != "" { 284 return respBytes, apiErr, "" 285 } 286 } else { 287 oh, exists := ahs.getHealSequence(pathJoin(h.bucket, h.object)) 288 if exists && !oh.hasEnded() { 289 errMsg = "Heal is already running on the given path " + 290 "(use force-start option to stop and start afresh). " + 291 fmt.Sprintf("The heal was started by IP %s at %s, token is %s", 292 h.clientAddress, h.startTime.Format(http.TimeFormat), h.clientToken) 293 return nil, errorCodes.ToAPIErr(ErrHealAlreadyRunning), errMsg 294 } 295 } 296 297 ahs.Lock() 298 defer ahs.Unlock() 299 300 // Check if new heal sequence to be started overlaps with any 301 // existing, running sequence 302 hpath := pathJoin(h.bucket, h.object) 303 for k, hSeq := range ahs.healSeqMap { 304 if !hSeq.hasEnded() && (HasPrefix(k, hpath) || HasPrefix(hpath, k)) { 305 errMsg = "The provided heal sequence path overlaps with an existing " + 306 fmt.Sprintf("heal path: %s", k) 307 return nil, errorCodes.ToAPIErr(ErrHealOverlappingPaths), errMsg 308 } 309 } 310 311 // Add heal state and start sequence 312 ahs.healSeqMap[hpath] = h 313 314 // Launch top-level background heal go-routine 315 go h.healSequenceStart(objAPI) 316 317 clientToken := h.clientToken 318 if globalIsDistErasure { 319 clientToken = fmt.Sprintf("%s@%d", h.clientToken, GetProxyEndpointLocalIndex(globalProxyEndpoints)) 320 } 321 322 b, err := json.Marshal(madmin.HealStartSuccess{ 323 ClientToken: clientToken, 324 ClientAddress: h.clientAddress, 325 StartTime: h.startTime, 326 }) 327 if err != nil { 328 logger.LogIf(h.ctx, err) 329 return nil, toAdminAPIErr(h.ctx, err), "" 330 } 331 return b, noError, "" 332 } 333 334 // PopHealStatusJSON - Called by heal-status API. It fetches the heal 335 // status results from global state and returns its JSON 336 // representation. The clientToken helps ensure there aren't 337 // conflicting clients fetching status. 338 func (ahs *allHealState) PopHealStatusJSON(hpath string, 339 clientToken string) ([]byte, APIErrorCode) { 340 341 // fetch heal state for given path 342 h, exists := ahs.getHealSequence(hpath) 343 if !exists { 344 // heal sequence doesn't exist, must have finished. 345 jbytes, err := json.Marshal(healSequenceStatus{ 346 Summary: healFinishedStatus, 347 }) 348 return jbytes, toAdminAPIErrCode(GlobalContext, err) 349 } 350 351 // Check if client-token is valid 352 if clientToken != h.clientToken { 353 return nil, ErrHealInvalidClientToken 354 } 355 356 // Take lock to access and update the heal-sequence 357 h.mutex.Lock() 358 defer h.mutex.Unlock() 359 360 numItems := len(h.currentStatus.Items) 361 362 // calculate index of most recently available heal result 363 // record. 364 lastResultIndex := h.lastSentResultIndex 365 if numItems > 0 { 366 lastResultIndex = h.currentStatus.Items[numItems-1].ResultIndex 367 } 368 369 h.lastSentResultIndex = lastResultIndex 370 371 jbytes, err := json.Marshal(h.currentStatus) 372 if err != nil { 373 h.currentStatus.Items = nil 374 375 logger.LogIf(h.ctx, err) 376 return nil, ErrInternalError 377 } 378 379 h.currentStatus.Items = nil 380 381 return jbytes, ErrNone 382 } 383 384 // healSource denotes single entity and heal option. 385 type healSource struct { 386 bucket string 387 object string 388 versionID string 389 opts *madmin.HealOpts // optional heal option overrides default setting 390 } 391 392 // healSequence - state for each heal sequence initiated on the 393 // server. 394 type healSequence struct { 395 // bucket, and object on which heal seq. was initiated 396 bucket, object string 397 398 // A channel of entities (format, buckets, objects) to heal 399 sourceCh chan healSource 400 401 // A channel of entities with heal result 402 respCh chan healResult 403 404 // Report healing progress 405 reportProgress bool 406 407 // time at which heal sequence was started 408 startTime time.Time 409 410 // time at which heal sequence has ended 411 endTime time.Time 412 413 // Heal client info 414 clientToken, clientAddress string 415 416 // was this heal sequence force started? 417 forceStarted bool 418 419 // heal settings applied to this heal sequence 420 settings madmin.HealOpts 421 422 // current accumulated status of the heal sequence 423 currentStatus healSequenceStatus 424 425 // channel signaled by background routine when traversal has 426 // completed 427 traverseAndHealDoneCh chan error 428 429 // canceler to cancel heal sequence. 430 cancelCtx context.CancelFunc 431 432 // the last result index sent to client 433 lastSentResultIndex int64 434 435 // Number of total items scanned against item type 436 scannedItemsMap map[madmin.HealItemType]int64 437 438 // Number of total items healed against item type 439 healedItemsMap map[madmin.HealItemType]int64 440 441 // Number of total items where healing failed against endpoint and drive state 442 healFailedItemsMap map[string]int64 443 444 // The time of the last scan/heal activity 445 lastHealActivity time.Time 446 447 // Holds the request-info for logging 448 ctx context.Context 449 450 // used to lock this structure as it is concurrently accessed 451 mutex sync.RWMutex 452 } 453 454 // NewHealSequence - creates healSettings, assumes bucket and 455 // objPrefix are already validated. 456 func newHealSequence(ctx context.Context, bucket, objPrefix, clientAddr string, 457 hs madmin.HealOpts, forceStart bool) *healSequence { 458 459 reqInfo := &logger.ReqInfo{RemoteHost: clientAddr, API: "Heal", BucketName: bucket} 460 reqInfo.AppendTags("prefix", objPrefix) 461 ctx, cancel := context.WithCancel(logger.SetReqInfo(ctx, reqInfo)) 462 463 clientToken := mustGetUUID() 464 465 return &healSequence{ 466 respCh: make(chan healResult), 467 bucket: bucket, 468 object: objPrefix, 469 reportProgress: true, 470 startTime: UTCNow(), 471 clientToken: clientToken, 472 clientAddress: clientAddr, 473 forceStarted: forceStart, 474 settings: hs, 475 currentStatus: healSequenceStatus{ 476 Summary: healNotStartedStatus, 477 HealSettings: hs, 478 }, 479 traverseAndHealDoneCh: make(chan error), 480 cancelCtx: cancel, 481 ctx: ctx, 482 scannedItemsMap: make(map[madmin.HealItemType]int64), 483 healedItemsMap: make(map[madmin.HealItemType]int64), 484 healFailedItemsMap: make(map[string]int64), 485 } 486 } 487 488 // resetHealStatusCounters - reset the healSequence status counters between 489 // each monthly background heal scanning activity. 490 // This is used only in case of Background healing scenario, where 491 // we use a single long running healSequence which reactively heals 492 // objects passed to the SourceCh. 493 func (h *healSequence) resetHealStatusCounters() { 494 h.mutex.Lock() 495 defer h.mutex.Unlock() 496 497 h.currentStatus.Items = []madmin.HealResultItem{} 498 h.lastSentResultIndex = 0 499 h.scannedItemsMap = make(map[madmin.HealItemType]int64) 500 h.healedItemsMap = make(map[madmin.HealItemType]int64) 501 h.healFailedItemsMap = make(map[string]int64) 502 } 503 504 // getScannedItemsCount - returns a count of all scanned items 505 func (h *healSequence) getScannedItemsCount() int64 { 506 var count int64 507 h.mutex.RLock() 508 defer h.mutex.RUnlock() 509 510 for _, v := range h.scannedItemsMap { 511 count = count + v 512 } 513 return count 514 } 515 516 // getScannedItemsMap - returns map of all scanned items against type 517 func (h *healSequence) getScannedItemsMap() map[madmin.HealItemType]int64 { 518 h.mutex.RLock() 519 defer h.mutex.RUnlock() 520 521 // Make a copy before returning the value 522 retMap := make(map[madmin.HealItemType]int64, len(h.scannedItemsMap)) 523 for k, v := range h.scannedItemsMap { 524 retMap[k] = v 525 } 526 527 return retMap 528 } 529 530 // getHealedItemsMap - returns the map of all healed items against type 531 func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 { 532 h.mutex.RLock() 533 defer h.mutex.RUnlock() 534 535 // Make a copy before returning the value 536 retMap := make(map[madmin.HealItemType]int64, len(h.healedItemsMap)) 537 for k, v := range h.healedItemsMap { 538 retMap[k] = v 539 } 540 541 return retMap 542 } 543 544 // gethealFailedItemsMap - returns map of all items where heal failed against 545 // drive endpoint and status 546 func (h *healSequence) gethealFailedItemsMap() map[string]int64 { 547 h.mutex.RLock() 548 defer h.mutex.RUnlock() 549 550 // Make a copy before returning the value 551 retMap := make(map[string]int64, len(h.healFailedItemsMap)) 552 for k, v := range h.healFailedItemsMap { 553 retMap[k] = v 554 } 555 556 return retMap 557 } 558 559 // isQuitting - determines if the heal sequence is quitting (due to an 560 // external signal) 561 func (h *healSequence) isQuitting() bool { 562 select { 563 case <-h.ctx.Done(): 564 return true 565 default: 566 return false 567 } 568 } 569 570 // check if the heal sequence has ended 571 func (h *healSequence) hasEnded() bool { 572 h.mutex.RLock() 573 defer h.mutex.RUnlock() 574 // background heal never ends 575 if h.clientToken == bgHealingUUID { 576 return false 577 } 578 return !h.endTime.IsZero() 579 } 580 581 // stops the heal sequence - safe to call multiple times. 582 func (h *healSequence) stop() { 583 h.cancelCtx() 584 } 585 586 // pushHealResultItem - pushes a heal result item for consumption in 587 // the heal-status API. It blocks if there are 588 // maxUnconsumedHealResultItems. When it blocks, the heal sequence 589 // routine is effectively paused - this happens when the server has 590 // accumulated the maximum number of heal records per heal 591 // sequence. When the client consumes further records, the heal 592 // sequence automatically resumes. The return value indicates if the 593 // operation succeeded. 594 func (h *healSequence) pushHealResultItem(r madmin.HealResultItem) error { 595 // start a timer to keep an upper time limit to find an empty 596 // slot to add the given heal result - if no slot is found it 597 // means that the server is holding the maximum amount of 598 // heal-results in memory and the client has not consumed it 599 // for too long. 600 unconsumedTimer := time.NewTimer(healUnconsumedTimeout) 601 defer func() { 602 // stop the timeout timer so it is garbage collected. 603 if !unconsumedTimer.Stop() { 604 <-unconsumedTimer.C 605 } 606 }() 607 608 var itemsLen int 609 for { 610 h.mutex.Lock() 611 itemsLen = len(h.currentStatus.Items) 612 if itemsLen == maxUnconsumedHealResultItems { 613 // wait for a second, or quit if an external 614 // stop signal is received or the 615 // unconsumedTimer fires. 616 select { 617 // Check after a second 618 case <-time.After(time.Second): 619 h.mutex.Unlock() 620 continue 621 622 case <-h.ctx.Done(): 623 h.mutex.Unlock() 624 // discard result and return. 625 return errHealStopSignalled 626 627 // Timeout if no results consumed for too long. 628 case <-unconsumedTimer.C: 629 h.mutex.Unlock() 630 return errHealIdleTimeout 631 } 632 } 633 break 634 } 635 636 // Set the correct result index for the new result item 637 if itemsLen > 0 { 638 r.ResultIndex = 1 + h.currentStatus.Items[itemsLen-1].ResultIndex 639 } else { 640 r.ResultIndex = 1 + h.lastSentResultIndex 641 } 642 643 // append to results 644 h.currentStatus.Items = append(h.currentStatus.Items, r) 645 646 // release lock 647 h.mutex.Unlock() 648 649 return nil 650 } 651 652 // healSequenceStart - this is the top-level background heal 653 // routine. It launches another go-routine that actually traverses 654 // on-disk data, checks and heals according to the selected 655 // settings. This go-routine itself, (1) monitors the traversal 656 // routine for completion, and (2) listens for external stop 657 // signals. When either event happens, it sets the finish status for 658 // the heal-sequence. 659 func (h *healSequence) healSequenceStart(objAPI ObjectLayer) { 660 // Set status as running 661 h.mutex.Lock() 662 h.currentStatus.Summary = healRunningStatus 663 h.currentStatus.StartTime = UTCNow() 664 h.mutex.Unlock() 665 666 if h.sourceCh == nil { 667 go h.traverseAndHeal(objAPI) 668 } else { 669 go h.healFromSourceCh() 670 } 671 672 select { 673 case err, ok := <-h.traverseAndHealDoneCh: 674 if !ok { 675 return 676 } 677 h.mutex.Lock() 678 h.endTime = UTCNow() 679 // Heal traversal is complete. 680 if err == nil { 681 // heal traversal succeeded. 682 h.currentStatus.Summary = healFinishedStatus 683 } else { 684 // heal traversal had an error. 685 h.currentStatus.Summary = healStoppedStatus 686 h.currentStatus.FailureDetail = err.Error() 687 } 688 h.mutex.Unlock() 689 case <-h.ctx.Done(): 690 h.mutex.Lock() 691 h.endTime = UTCNow() 692 h.currentStatus.Summary = healFinishedStatus 693 h.mutex.Unlock() 694 695 // drain traverse channel so the traversal 696 // go-routine does not leak. 697 go func() { 698 // Eventually the traversal go-routine closes 699 // the channel and returns, so this go-routine 700 // itself will not leak. 701 <-h.traverseAndHealDoneCh 702 }() 703 } 704 } 705 706 func (h *healSequence) logHeal(healType madmin.HealItemType) { 707 h.mutex.Lock() 708 h.scannedItemsMap[healType]++ 709 h.lastHealActivity = UTCNow() 710 h.mutex.Unlock() 711 } 712 713 func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItemType) error { 714 globalHealConfigMu.Lock() 715 opts := globalHealConfig 716 globalHealConfigMu.Unlock() 717 718 // Send heal request 719 task := healTask{ 720 bucket: source.bucket, 721 object: source.object, 722 versionID: source.versionID, 723 opts: h.settings, 724 responseCh: h.respCh, 725 } 726 if source.opts != nil { 727 task.opts = *source.opts 728 } 729 if opts.Bitrot { 730 task.opts.ScanMode = madmin.HealDeepScan 731 } 732 733 // Wait and proceed if there are active requests 734 waitForLowHTTPReq(opts.IOCount, opts.Sleep) 735 736 h.mutex.Lock() 737 h.scannedItemsMap[healType]++ 738 h.lastHealActivity = UTCNow() 739 h.mutex.Unlock() 740 741 globalBackgroundHealRoutine.queueHealTask(task) 742 743 select { 744 case res := <-h.respCh: 745 if !h.reportProgress { 746 // Object might have been deleted, by the time heal 747 // was attempted, we should ignore this object and 748 // return the error and not calculate this object 749 // as part of the metrics. 750 if isErrObjectNotFound(res.err) || isErrVersionNotFound(res.err) { 751 // Return the error so that caller can handle it. 752 return res.err 753 } 754 755 h.mutex.Lock() 756 defer h.mutex.Unlock() 757 758 // Progress is not reported in case of background heal processing. 759 // Instead we increment relevant counter based on the heal result 760 // for prometheus reporting. 761 if res.err != nil { 762 for _, d := range res.result.After.Drives { 763 // For failed items we report the endpoint and drive state 764 // This will help users take corrective actions for drives 765 h.healFailedItemsMap[d.Endpoint+","+d.State]++ 766 } 767 } else { 768 // Only object type reported for successful healing 769 h.healedItemsMap[res.result.Type]++ 770 } 771 772 // Report caller of any failure 773 return res.err 774 } 775 res.result.Type = healType 776 if res.err != nil { 777 // Object might have been deleted, by the time heal 778 // was attempted, we should ignore this object and return success. 779 if isErrObjectNotFound(res.err) || isErrVersionNotFound(res.err) { 780 return nil 781 } 782 // Only report object error 783 if healType != madmin.HealItemObject { 784 return res.err 785 } 786 res.result.Detail = res.err.Error() 787 } 788 return h.pushHealResultItem(res.result) 789 case <-h.ctx.Done(): 790 return nil 791 } 792 } 793 794 func (h *healSequence) healItemsFromSourceCh() error { 795 for { 796 select { 797 case source, ok := <-h.sourceCh: 798 if !ok { 799 return nil 800 } 801 802 var itemType madmin.HealItemType 803 switch source.bucket { 804 case nopHeal: 805 continue 806 case SlashSeparator: 807 itemType = madmin.HealItemMetadata 808 default: 809 if source.object == "" { 810 itemType = madmin.HealItemBucket 811 } else { 812 itemType = madmin.HealItemObject 813 } 814 } 815 816 if err := h.queueHealTask(source, itemType); err != nil { 817 switch err.(type) { 818 case ObjectExistsAsDirectory: 819 case ObjectNotFound: 820 case VersionNotFound: 821 default: 822 logger.LogIf(h.ctx, fmt.Errorf("Heal attempt failed for %s: %w", 823 pathJoin(source.bucket, source.object), err)) 824 } 825 } 826 case <-h.ctx.Done(): 827 return nil 828 } 829 } 830 } 831 832 func (h *healSequence) healFromSourceCh() { 833 h.healItemsFromSourceCh() 834 } 835 836 func (h *healSequence) healDiskMeta(objAPI ObjectLayer) error { 837 // Try to pro-actively heal backend-encrypted file. 838 if err := h.queueHealTask(healSource{ 839 bucket: minioMetaBucket, 840 object: backendEncryptedFile, 841 }, madmin.HealItemBucketMetadata); err != nil { 842 if !isErrObjectNotFound(err) && !isErrVersionNotFound(err) { 843 return err 844 } 845 } 846 847 // Start healing the config prefix. 848 return h.healMinioSysMeta(objAPI, minioConfigPrefix)() 849 } 850 851 func (h *healSequence) healItems(objAPI ObjectLayer, bucketsOnly bool) error { 852 if err := h.healDiskMeta(objAPI); err != nil { 853 return err 854 } 855 856 // Heal buckets and objects 857 return h.healBuckets(objAPI, bucketsOnly) 858 } 859 860 // traverseAndHeal - traverses on-disk data and performs healing 861 // according to settings. At each "safe" point it also checks if an 862 // external quit signal has been received and quits if so. Since the 863 // healing traversal may be mutating on-disk data when an external 864 // quit signal is received, this routine cannot quit immediately and 865 // has to wait until a safe point is reached, such as between scanning 866 // two objects. 867 func (h *healSequence) traverseAndHeal(objAPI ObjectLayer) { 868 bucketsOnly := false // Heals buckets and objects also. 869 h.traverseAndHealDoneCh <- h.healItems(objAPI, bucketsOnly) 870 close(h.traverseAndHealDoneCh) 871 } 872 873 // healMinioSysMeta - heals all files under a given meta prefix, returns a function 874 // which in-turn heals the respective meta directory path and any files in int. 875 func (h *healSequence) healMinioSysMeta(objAPI ObjectLayer, metaPrefix string) func() error { 876 return func() error { 877 // NOTE: Healing on meta is run regardless 878 // of any bucket being selected, this is to ensure that 879 // meta are always upto date and correct. 880 return objAPI.HealObjects(h.ctx, minioMetaBucket, metaPrefix, h.settings, func(bucket, object, versionID string) error { 881 if h.isQuitting() { 882 return errHealStopSignalled 883 } 884 885 err := h.queueHealTask(healSource{ 886 bucket: bucket, 887 object: object, 888 versionID: versionID, 889 }, madmin.HealItemBucketMetadata) 890 // Object might have been deleted, by the time heal 891 // was attempted we ignore this object an move on. 892 if isErrObjectNotFound(err) || isErrVersionNotFound(err) { 893 return nil 894 } 895 return err 896 }) 897 } 898 } 899 900 // healDiskFormat - heals format.json, return value indicates if a 901 // failure error occurred. 902 func (h *healSequence) healDiskFormat() error { 903 if h.isQuitting() { 904 return errHealStopSignalled 905 } 906 907 return h.queueHealTask(healSource{bucket: SlashSeparator}, madmin.HealItemMetadata) 908 } 909 910 // healBuckets - check for all buckets heal or just particular bucket. 911 func (h *healSequence) healBuckets(objAPI ObjectLayer, bucketsOnly bool) error { 912 if h.isQuitting() { 913 return errHealStopSignalled 914 } 915 916 // 1. If a bucket was specified, heal only the bucket. 917 if h.bucket != "" { 918 return h.healBucket(objAPI, h.bucket, bucketsOnly) 919 } 920 921 buckets, err := objAPI.ListBuckets(h.ctx) 922 if err != nil { 923 return errFnHealFromAPIErr(h.ctx, err) 924 } 925 926 // Heal latest buckets first. 927 sort.Slice(buckets, func(i, j int) bool { 928 return buckets[i].Created.After(buckets[j].Created) 929 }) 930 931 for _, bucket := range buckets { 932 if err = h.healBucket(objAPI, bucket.Name, bucketsOnly); err != nil { 933 return err 934 } 935 } 936 937 return nil 938 } 939 940 // healBucket - traverses and heals given bucket 941 func (h *healSequence) healBucket(objAPI ObjectLayer, bucket string, bucketsOnly bool) error { 942 if err := h.queueHealTask(healSource{bucket: bucket}, madmin.HealItemBucket); err != nil { 943 if !isErrObjectNotFound(err) && !isErrVersionNotFound(err) { 944 return err 945 } 946 } 947 948 if bucketsOnly { 949 return nil 950 } 951 952 if !h.settings.Recursive { 953 if h.object != "" { 954 // Check if an object named as the objPrefix exists, 955 // and if so heal it. 956 oi, err := objAPI.GetObjectInfo(h.ctx, bucket, h.object, ObjectOptions{}) 957 if err == nil { 958 if err = h.healObject(bucket, h.object, oi.VersionID); err != nil { 959 if isErrObjectNotFound(err) || isErrVersionNotFound(err) { 960 return nil 961 } 962 return err 963 } 964 } 965 } 966 967 return nil 968 } 969 970 if err := objAPI.HealObjects(h.ctx, bucket, h.object, h.settings, h.healObject); err != nil { 971 // Object might have been deleted, by the time heal 972 // was attempted we ignore this object an move on. 973 if !isErrObjectNotFound(err) && !isErrVersionNotFound(err) { 974 return errFnHealFromAPIErr(h.ctx, err) 975 } 976 } 977 return nil 978 } 979 980 // healObject - heal the given object and record result 981 func (h *healSequence) healObject(bucket, object, versionID string) error { 982 if h.isQuitting() { 983 return errHealStopSignalled 984 } 985 986 err := h.queueHealTask(healSource{ 987 bucket: bucket, 988 object: object, 989 versionID: versionID, 990 }, madmin.HealItemObject) 991 return err 992 }