github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure-server-pool-rebalance.go (about) 1 // Copyright (c) 2015-2022 MinIO, Inc. 2 // 3 // This file is part of MinIO Object Storage stack 4 // 5 // This program is free software: you can redistribute it and/or modify 6 // it under the terms of the GNU Affero General Public License as published by 7 // the Free Software Foundation, either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU Affero General Public License for more details. 14 // 15 // You should have received a copy of the GNU Affero General Public License 16 // along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 package cmd 19 20 import ( 21 "context" 22 "encoding/binary" 23 "errors" 24 "fmt" 25 "io" 26 "math" 27 "math/rand" 28 "net/http" 29 "strings" 30 "time" 31 32 "github.com/dustin/go-humanize" 33 "github.com/lithammer/shortuuid/v4" 34 "github.com/minio/madmin-go/v3" 35 "github.com/minio/minio/internal/hash" 36 xioutil "github.com/minio/minio/internal/ioutil" 37 "github.com/minio/minio/internal/logger" 38 "github.com/minio/pkg/v2/env" 39 "github.com/minio/pkg/v2/workers" 40 ) 41 42 //go:generate msgp -file $GOFILE -unexported 43 44 // rebalanceStats contains per-pool rebalance statistics like number of objects, 45 // versions and bytes rebalanced out of a pool 46 type rebalanceStats struct { 47 InitFreeSpace uint64 `json:"initFreeSpace" msg:"ifs"` // Pool free space at the start of rebalance 48 InitCapacity uint64 `json:"initCapacity" msg:"ic"` // Pool capacity at the start of rebalance 49 50 Buckets []string `json:"buckets" msg:"bus"` // buckets being rebalanced or to be rebalanced 51 RebalancedBuckets []string `json:"rebalancedBuckets" msg:"rbs"` // buckets rebalanced 52 Bucket string `json:"bucket" msg:"bu"` // Last rebalanced bucket 53 Object string `json:"object" msg:"ob"` // Last rebalanced object 54 NumObjects uint64 `json:"numObjects" msg:"no"` // Number of objects rebalanced 55 NumVersions uint64 `json:"numVersions" msg:"nv"` // Number of versions rebalanced 56 Bytes uint64 `json:"bytes" msg:"bs"` // Number of bytes rebalanced 57 Participating bool `json:"participating" msg:"par"` 58 Info rebalanceInfo `json:"info" msg:"inf"` 59 } 60 61 func (rs *rebalanceStats) update(bucket string, fi FileInfo) { 62 if fi.IsLatest { 63 rs.NumObjects++ 64 } 65 66 rs.NumVersions++ 67 onDiskSz := int64(0) 68 if !fi.Deleted { 69 onDiskSz = fi.Size * int64(fi.Erasure.DataBlocks+fi.Erasure.ParityBlocks) / int64(fi.Erasure.DataBlocks) 70 } 71 rs.Bytes += uint64(onDiskSz) 72 rs.Bucket = bucket 73 rs.Object = fi.Name 74 } 75 76 type rstats []*rebalanceStats 77 78 //go:generate stringer -type=rebalStatus -trimprefix=rebal $GOFILE 79 type rebalStatus uint8 80 81 const ( 82 rebalNone rebalStatus = iota 83 rebalStarted 84 rebalCompleted 85 rebalStopped 86 rebalFailed 87 ) 88 89 type rebalanceInfo struct { 90 StartTime time.Time `msg:"startTs"` // Time at which rebalance-start was issued 91 EndTime time.Time `msg:"stopTs"` // Time at which rebalance operation completed or rebalance-stop was called 92 Status rebalStatus `msg:"status"` // Current state of rebalance operation. One of Started|Stopped|Completed|Failed. 93 } 94 95 // rebalanceMeta contains information pertaining to an ongoing rebalance operation. 96 type rebalanceMeta struct { 97 cancel context.CancelFunc `msg:"-"` // to be invoked on rebalance-stop 98 lastRefreshedAt time.Time `msg:"-"` 99 StoppedAt time.Time `msg:"stopTs"` // Time when rebalance-stop was issued. 100 ID string `msg:"id"` // ID of the ongoing rebalance operation 101 PercentFreeGoal float64 `msg:"pf"` // Computed from total free space and capacity at the start of rebalance 102 PoolStats []*rebalanceStats `msg:"rss"` // Per-pool rebalance stats keyed by pool index 103 } 104 105 var errRebalanceNotStarted = errors.New("rebalance not started") 106 107 func (z *erasureServerPools) loadRebalanceMeta(ctx context.Context) error { 108 r := &rebalanceMeta{} 109 err := r.load(ctx, z.serverPools[0]) 110 if err != nil { 111 if errors.Is(err, errConfigNotFound) { 112 return nil 113 } 114 return err 115 } 116 117 z.rebalMu.Lock() 118 if len(r.PoolStats) == len(z.serverPools) { 119 z.rebalMeta = r 120 } else { 121 z.updateRebalanceStats(ctx) 122 } 123 z.rebalMu.Unlock() 124 125 return nil 126 } 127 128 // updates rebalance.bin from let's say 2 pool setup in the middle 129 // of a rebalance, was expanded can cause z.rebalMeta to be outdated 130 // due to a missing new pool. This function tries to handle this 131 // scenario, albeit rare it seems to have occurred in the wild. 132 // 133 // since we do not explicitly disallow it, but it is okay for them 134 // expand and then we continue to rebalance. 135 func (z *erasureServerPools) updateRebalanceStats(ctx context.Context) error { 136 var ok bool 137 for i := range z.serverPools { 138 if z.findIndex(i) == -1 { 139 // Also ensure to initialize rebalanceStats to indicate 140 // its a new pool that can receive rebalanced data. 141 z.rebalMeta.PoolStats = append(z.rebalMeta.PoolStats, &rebalanceStats{}) 142 ok = true 143 } 144 } 145 if ok { 146 lock := z.serverPools[0].NewNSLock(minioMetaBucket, rebalMetaName) 147 lkCtx, err := lock.GetLock(ctx, globalOperationTimeout) 148 if err != nil { 149 logger.LogIf(ctx, fmt.Errorf("failed to acquire write lock on %s/%s: %w", minioMetaBucket, rebalMetaName, err)) 150 return err 151 } 152 defer lock.Unlock(lkCtx) 153 154 ctx = lkCtx.Context() 155 156 noLockOpts := ObjectOptions{NoLock: true} 157 return z.rebalMeta.saveWithOpts(ctx, z.serverPools[0], noLockOpts) 158 } 159 160 return nil 161 } 162 163 func (z *erasureServerPools) findIndex(index int) int { 164 for i := 0; i < len(z.rebalMeta.PoolStats); i++ { 165 if i == index { 166 return index 167 } 168 } 169 return -1 170 } 171 172 // initRebalanceMeta initializes rebalance metadata for a new rebalance 173 // operation and saves it in the object store. 174 func (z *erasureServerPools) initRebalanceMeta(ctx context.Context, buckets []string) (arn string, err error) { 175 r := &rebalanceMeta{ 176 ID: shortuuid.New(), 177 PoolStats: make([]*rebalanceStats, len(z.serverPools)), 178 } 179 180 // Fetch disk capacity and available space. 181 si := z.StorageInfo(ctx, true) 182 diskStats := make([]struct { 183 AvailableSpace uint64 184 TotalSpace uint64 185 }, len(z.serverPools)) 186 var totalCap, totalFree uint64 187 for _, disk := range si.Disks { 188 // Ignore invalid. 189 if disk.PoolIndex < 0 || len(diskStats) <= disk.PoolIndex { 190 // https://github.com/minio/minio/issues/16500 191 continue 192 } 193 totalCap += disk.TotalSpace 194 totalFree += disk.AvailableSpace 195 196 diskStats[disk.PoolIndex].AvailableSpace += disk.AvailableSpace 197 diskStats[disk.PoolIndex].TotalSpace += disk.TotalSpace 198 } 199 r.PercentFreeGoal = float64(totalFree) / float64(totalCap) 200 201 now := time.Now() 202 for idx := range z.serverPools { 203 r.PoolStats[idx] = &rebalanceStats{ 204 Buckets: make([]string, len(buckets)), 205 RebalancedBuckets: make([]string, 0, len(buckets)), 206 InitFreeSpace: diskStats[idx].AvailableSpace, 207 InitCapacity: diskStats[idx].TotalSpace, 208 } 209 copy(r.PoolStats[idx].Buckets, buckets) 210 211 if pfi := float64(diskStats[idx].AvailableSpace) / float64(diskStats[idx].TotalSpace); pfi < r.PercentFreeGoal { 212 r.PoolStats[idx].Participating = true 213 r.PoolStats[idx].Info = rebalanceInfo{ 214 StartTime: now, 215 Status: rebalStarted, 216 } 217 } 218 } 219 220 err = r.save(ctx, z.serverPools[0]) 221 if err != nil { 222 return arn, err 223 } 224 225 z.rebalMeta = r 226 return r.ID, nil 227 } 228 229 func (z *erasureServerPools) updatePoolStats(poolIdx int, bucket string, fi FileInfo) { 230 z.rebalMu.Lock() 231 defer z.rebalMu.Unlock() 232 233 r := z.rebalMeta 234 if r == nil { 235 return 236 } 237 238 r.PoolStats[poolIdx].update(bucket, fi) 239 } 240 241 const ( 242 rebalMetaName = "rebalance.bin" 243 rebalMetaFmt = 1 244 rebalMetaVer = 1 245 ) 246 247 func (z *erasureServerPools) nextRebalBucket(poolIdx int) (string, bool) { 248 z.rebalMu.RLock() 249 defer z.rebalMu.RUnlock() 250 251 r := z.rebalMeta 252 if r == nil { 253 return "", false 254 } 255 256 ps := r.PoolStats[poolIdx] 257 if ps == nil { 258 return "", false 259 } 260 261 if ps.Info.Status == rebalCompleted || !ps.Participating { 262 return "", false 263 } 264 265 if len(ps.Buckets) == 0 { 266 return "", false 267 } 268 269 return ps.Buckets[0], true 270 } 271 272 func (z *erasureServerPools) bucketRebalanceDone(bucket string, poolIdx int) { 273 z.rebalMu.Lock() 274 defer z.rebalMu.Unlock() 275 276 ps := z.rebalMeta.PoolStats[poolIdx] 277 if ps == nil { 278 return 279 } 280 281 for i, b := range ps.Buckets { 282 if b == bucket { 283 ps.Buckets = append(ps.Buckets[:i], ps.Buckets[i+1:]...) 284 ps.RebalancedBuckets = append(ps.RebalancedBuckets, bucket) 285 break 286 } 287 } 288 } 289 290 func (r *rebalanceMeta) load(ctx context.Context, store objectIO) error { 291 return r.loadWithOpts(ctx, store, ObjectOptions{}) 292 } 293 294 func (r *rebalanceMeta) loadWithOpts(ctx context.Context, store objectIO, opts ObjectOptions) error { 295 data, _, err := readConfigWithMetadata(ctx, store, rebalMetaName, opts) 296 if err != nil { 297 return err 298 } 299 300 if len(data) == 0 { 301 return nil 302 } 303 if len(data) <= 4 { 304 return fmt.Errorf("rebalanceMeta: no data") 305 } 306 307 // Read header 308 switch binary.LittleEndian.Uint16(data[0:2]) { 309 case rebalMetaFmt: 310 default: 311 return fmt.Errorf("rebalanceMeta: unknown format: %d", binary.LittleEndian.Uint16(data[0:2])) 312 } 313 switch binary.LittleEndian.Uint16(data[2:4]) { 314 case rebalMetaVer: 315 default: 316 return fmt.Errorf("rebalanceMeta: unknown version: %d", binary.LittleEndian.Uint16(data[2:4])) 317 } 318 319 // OK, parse data. 320 if _, err = r.UnmarshalMsg(data[4:]); err != nil { 321 return err 322 } 323 324 r.lastRefreshedAt = time.Now() 325 326 return nil 327 } 328 329 func (r *rebalanceMeta) saveWithOpts(ctx context.Context, store objectIO, opts ObjectOptions) error { 330 data := make([]byte, 4, r.Msgsize()+4) 331 332 // Initialize the header. 333 binary.LittleEndian.PutUint16(data[0:2], rebalMetaFmt) 334 binary.LittleEndian.PutUint16(data[2:4], rebalMetaVer) 335 336 buf, err := r.MarshalMsg(data) 337 if err != nil { 338 return err 339 } 340 341 return saveConfigWithOpts(ctx, store, rebalMetaName, buf, opts) 342 } 343 344 func (r *rebalanceMeta) save(ctx context.Context, store objectIO) error { 345 return r.saveWithOpts(ctx, store, ObjectOptions{}) 346 } 347 348 func (z *erasureServerPools) IsRebalanceStarted() bool { 349 z.rebalMu.RLock() 350 defer z.rebalMu.RUnlock() 351 352 if r := z.rebalMeta; r != nil { 353 if r.StoppedAt.IsZero() { 354 return true 355 } 356 } 357 return false 358 } 359 360 func (z *erasureServerPools) IsPoolRebalancing(poolIndex int) bool { 361 z.rebalMu.RLock() 362 defer z.rebalMu.RUnlock() 363 364 if r := z.rebalMeta; r != nil { 365 if !r.StoppedAt.IsZero() { 366 return false 367 } 368 ps := z.rebalMeta.PoolStats[poolIndex] 369 return ps.Participating && ps.Info.Status == rebalStarted 370 } 371 return false 372 } 373 374 func (z *erasureServerPools) rebalanceBuckets(ctx context.Context, poolIdx int) (err error) { 375 doneCh := make(chan struct{}) 376 defer xioutil.SafeClose(doneCh) 377 378 // Save rebalance.bin periodically. 379 go func() { 380 // Update rebalance.bin periodically once every 5-10s, chosen randomly 381 // to avoid multiple pool leaders herding to update around the same 382 // time. 383 r := rand.New(rand.NewSource(time.Now().UnixNano())) 384 randSleepFor := func() time.Duration { 385 return 5*time.Second + time.Duration(float64(5*time.Second)*r.Float64()) 386 } 387 388 timer := time.NewTimer(randSleepFor()) 389 defer timer.Stop() 390 var rebalDone bool 391 var traceMsg string 392 393 for { 394 select { 395 case <-doneCh: 396 // rebalance completed for poolIdx 397 now := time.Now() 398 z.rebalMu.Lock() 399 z.rebalMeta.PoolStats[poolIdx].Info.Status = rebalCompleted 400 z.rebalMeta.PoolStats[poolIdx].Info.EndTime = now 401 z.rebalMu.Unlock() 402 403 rebalDone = true 404 traceMsg = fmt.Sprintf("completed at %s", now) 405 406 case <-ctx.Done(): 407 408 // rebalance stopped for poolIdx 409 now := time.Now() 410 z.rebalMu.Lock() 411 z.rebalMeta.PoolStats[poolIdx].Info.Status = rebalStopped 412 z.rebalMeta.PoolStats[poolIdx].Info.EndTime = now 413 z.rebalMeta.cancel = nil // remove the already used context.CancelFunc 414 z.rebalMu.Unlock() 415 416 rebalDone = true 417 traceMsg = fmt.Sprintf("stopped at %s", now) 418 419 case <-timer.C: 420 traceMsg = fmt.Sprintf("saved at %s", time.Now()) 421 } 422 423 stopFn := globalRebalanceMetrics.log(rebalanceMetricSaveMetadata, poolIdx, traceMsg) 424 err := z.saveRebalanceStats(ctx, poolIdx, rebalSaveStats) 425 stopFn(err) 426 logger.LogIf(ctx, err) 427 timer.Reset(randSleepFor()) 428 429 if rebalDone { 430 return 431 } 432 } 433 }() 434 435 logger.Event(ctx, "Pool %d rebalancing is started", poolIdx+1) 436 437 for { 438 select { 439 case <-ctx.Done(): 440 return 441 default: 442 } 443 444 bucket, ok := z.nextRebalBucket(poolIdx) 445 if !ok { 446 // no more buckets to rebalance or target free_space/capacity reached 447 break 448 } 449 450 stopFn := globalRebalanceMetrics.log(rebalanceMetricRebalanceBucket, poolIdx, bucket) 451 err = z.rebalanceBucket(ctx, bucket, poolIdx) 452 if err != nil { 453 stopFn(err) 454 logger.LogIf(ctx, err) 455 return 456 } 457 stopFn(nil) 458 z.bucketRebalanceDone(bucket, poolIdx) 459 } 460 461 logger.Event(ctx, "Pool %d rebalancing is done", poolIdx+1) 462 463 return err 464 } 465 466 func (z *erasureServerPools) checkIfRebalanceDone(poolIdx int) bool { 467 z.rebalMu.Lock() 468 defer z.rebalMu.Unlock() 469 470 // check if enough objects have been rebalanced 471 r := z.rebalMeta 472 poolStats := r.PoolStats[poolIdx] 473 if poolStats.Info.Status == rebalCompleted { 474 return true 475 } 476 477 pfi := float64(poolStats.InitFreeSpace+poolStats.Bytes) / float64(poolStats.InitCapacity) 478 // Mark pool rebalance as done if within 5% from PercentFreeGoal. 479 if diff := math.Abs(pfi - r.PercentFreeGoal); diff <= 0.05 { 480 r.PoolStats[poolIdx].Info.Status = rebalCompleted 481 r.PoolStats[poolIdx].Info.EndTime = time.Now() 482 return true 483 } 484 485 return false 486 } 487 488 func (set *erasureObjects) listObjectsToRebalance(ctx context.Context, bucketName string, fn func(entry metaCacheEntry)) error { 489 disks, _ := set.getOnlineDisksWithHealing(false) 490 if len(disks) == 0 { 491 return fmt.Errorf("no online drives found for set with endpoints %s", set.getEndpoints()) 492 } 493 494 // However many we ask, versions must exist on ~50% 495 listingQuorum := (set.setDriveCount + 1) / 2 496 497 // How to resolve partial results. 498 resolver := metadataResolutionParams{ 499 dirQuorum: listingQuorum, // make sure to capture all quorum ratios 500 objQuorum: listingQuorum, // make sure to capture all quorum ratios 501 bucket: bucketName, 502 } 503 504 err := listPathRaw(ctx, listPathRawOptions{ 505 disks: disks, 506 bucket: bucketName, 507 recursive: true, 508 forwardTo: "", 509 minDisks: listingQuorum, 510 reportNotFound: false, 511 agreed: fn, 512 partial: func(entries metaCacheEntries, _ []error) { 513 entry, ok := entries.resolve(&resolver) 514 if ok { 515 fn(*entry) 516 } 517 }, 518 finished: nil, 519 }) 520 return err 521 } 522 523 // rebalanceBucket rebalances objects under bucket in poolIdx pool 524 func (z *erasureServerPools) rebalanceBucket(ctx context.Context, bucket string, poolIdx int) error { 525 ctx = logger.SetReqInfo(ctx, &logger.ReqInfo{}) 526 vc, _ := globalBucketVersioningSys.Get(bucket) 527 // Check if the current bucket has a configured lifecycle policy 528 lc, _ := globalLifecycleSys.Get(bucket) 529 // Check if bucket is object locked. 530 lr, _ := globalBucketObjectLockSys.Get(bucket) 531 rcfg, _ := getReplicationConfig(ctx, bucket) 532 533 pool := z.serverPools[poolIdx] 534 535 const envRebalanceWorkers = "_MINIO_REBALANCE_WORKERS" 536 workerSize, err := env.GetInt(envRebalanceWorkers, len(pool.sets)) 537 if err != nil { 538 logger.LogIf(ctx, fmt.Errorf("invalid workers value err: %v, defaulting to %d", err, len(pool.sets))) 539 workerSize = len(pool.sets) 540 } 541 542 // Each decom worker needs one List() goroutine/worker 543 // add that many extra workers. 544 workerSize += len(pool.sets) 545 546 wk, err := workers.New(workerSize) 547 if err != nil { 548 return err 549 } 550 551 for setIdx, set := range pool.sets { 552 set := set 553 554 filterLifecycle := func(bucket, object string, fi FileInfo) bool { 555 if lc == nil { 556 return false 557 } 558 versioned := vc != nil && vc.Versioned(object) 559 objInfo := fi.ToObjectInfo(bucket, object, versioned) 560 561 evt := evalActionFromLifecycle(ctx, *lc, lr, rcfg, objInfo) 562 if evt.Action.Delete() { 563 globalExpiryState.enqueueByDays(objInfo, evt, lcEventSrc_Rebal) 564 return true 565 } 566 567 return false 568 } 569 570 rebalanceEntry := func(entry metaCacheEntry) { 571 defer wk.Give() 572 573 if entry.isDir() { 574 return 575 } 576 577 // rebalance on poolIdx has reached its goal 578 if z.checkIfRebalanceDone(poolIdx) { 579 return 580 } 581 582 fivs, err := entry.fileInfoVersions(bucket) 583 if err != nil { 584 return 585 } 586 587 // We need a reversed order for rebalance, 588 // to create the appropriate stack. 589 versionsSorter(fivs.Versions).reverse() 590 591 var rebalanced, expired int 592 for _, version := range fivs.Versions { 593 // Skip transitioned objects for now. TBD 594 if version.IsRemote() { 595 continue 596 } 597 598 // Apply lifecycle rules on the objects that are expired. 599 if filterLifecycle(bucket, version.Name, version) { 600 expired++ 601 continue 602 } 603 604 // any object with only single DEL marker we don't need 605 // to rebalance, just skip it, this also includes 606 // any other versions that have already expired. 607 remainingVersions := len(fivs.Versions) - expired 608 if version.Deleted && remainingVersions == 1 { 609 rebalanced++ 610 continue 611 } 612 613 versionID := version.VersionID 614 if versionID == "" { 615 versionID = nullVersionID 616 } 617 618 if version.Deleted { 619 _, err := z.DeleteObject(ctx, 620 bucket, 621 version.Name, 622 ObjectOptions{ 623 Versioned: true, 624 VersionID: versionID, 625 MTime: version.ModTime, 626 DeleteReplication: version.ReplicationState, 627 DeleteMarker: true, // make sure we create a delete marker 628 SkipRebalancing: true, // make sure we skip the decommissioned pool 629 NoAuditLog: true, 630 }) 631 var failure bool 632 if err != nil && !isErrObjectNotFound(err) && !isErrVersionNotFound(err) { 633 logger.LogIf(ctx, err) 634 failure = true 635 } 636 637 if !failure { 638 z.updatePoolStats(poolIdx, bucket, version) 639 rebalanced++ 640 } 641 auditLogRebalance(ctx, "Rebalance:DeleteMarker", bucket, version.Name, versionID, err) 642 continue 643 } 644 645 var failure, ignore bool 646 for try := 0; try < 3; try++ { 647 // GetObjectReader.Close is called by rebalanceObject 648 stopFn := globalRebalanceMetrics.log(rebalanceMetricRebalanceObject, poolIdx, bucket, version.Name, version.VersionID) 649 gr, err := set.GetObjectNInfo(ctx, 650 bucket, 651 encodeDirObject(version.Name), 652 nil, 653 http.Header{}, 654 ObjectOptions{ 655 VersionID: versionID, 656 NoDecryption: true, 657 NoLock: true, 658 NoAuditLog: true, 659 }) 660 if isErrObjectNotFound(err) || isErrVersionNotFound(err) { 661 // object deleted by the application, nothing to do here we move on. 662 ignore = true 663 stopFn(nil) 664 break 665 } 666 if err != nil { 667 failure = true 668 logger.LogIf(ctx, err) 669 stopFn(err) 670 continue 671 } 672 673 if err = z.rebalanceObject(ctx, bucket, gr); err != nil { 674 failure = true 675 logger.LogIf(ctx, err) 676 stopFn(err) 677 continue 678 } 679 680 stopFn(nil) 681 failure = false 682 break 683 } 684 if ignore { 685 continue 686 } 687 if failure { 688 break // break out on first error 689 } 690 z.updatePoolStats(poolIdx, bucket, version) 691 rebalanced++ 692 } 693 694 // if all versions were rebalanced, we can delete the object versions. 695 if rebalanced == len(fivs.Versions) { 696 stopFn := globalRebalanceMetrics.log(rebalanceMetricRebalanceRemoveObject, poolIdx, bucket, entry.name) 697 _, err := set.DeleteObject(ctx, 698 bucket, 699 encodeDirObject(entry.name), 700 ObjectOptions{ 701 DeletePrefix: true, // use prefix delete to delete all versions at once. 702 DeletePrefixObject: true, // use prefix delete on exact object (this is an optimization to avoid fan-out calls) 703 NoAuditLog: true, 704 }, 705 ) 706 stopFn(err) 707 auditLogRebalance(ctx, "Rebalance:DeleteObject", bucket, entry.name, "", err) 708 if err != nil { 709 logger.LogIf(ctx, err) 710 } 711 } 712 } 713 714 wk.Take() 715 go func(setIdx int) { 716 defer wk.Give() 717 err := set.listObjectsToRebalance(ctx, bucket, 718 func(entry metaCacheEntry) { 719 wk.Take() 720 go rebalanceEntry(entry) 721 }, 722 ) 723 if err == nil || errors.Is(err, context.Canceled) { 724 return 725 } 726 setN := humanize.Ordinal(setIdx + 1) 727 logger.LogOnceIf(ctx, fmt.Errorf("listing objects from %s set failed with %v", setN, err), "rebalance-listing-failed"+setN) 728 }(setIdx) 729 } 730 731 wk.Wait() 732 return nil 733 } 734 735 type rebalSaveOpts uint8 736 737 const ( 738 rebalSaveStats rebalSaveOpts = iota 739 rebalSaveStoppedAt 740 ) 741 742 func (z *erasureServerPools) saveRebalanceStats(ctx context.Context, poolIdx int, opts rebalSaveOpts) error { 743 lock := z.serverPools[0].NewNSLock(minioMetaBucket, rebalMetaName) 744 lkCtx, err := lock.GetLock(ctx, globalOperationTimeout) 745 if err != nil { 746 logger.LogIf(ctx, fmt.Errorf("failed to acquire write lock on %s/%s: %w", minioMetaBucket, rebalMetaName, err)) 747 return err 748 } 749 defer lock.Unlock(lkCtx) 750 751 ctx = lkCtx.Context() 752 noLockOpts := ObjectOptions{NoLock: true} 753 r := &rebalanceMeta{} 754 if err := r.loadWithOpts(ctx, z.serverPools[0], noLockOpts); err != nil { 755 return err 756 } 757 758 z.rebalMu.Lock() 759 defer z.rebalMu.Unlock() 760 761 switch opts { 762 case rebalSaveStoppedAt: 763 r.StoppedAt = time.Now() 764 case rebalSaveStats: 765 r.PoolStats[poolIdx] = z.rebalMeta.PoolStats[poolIdx] 766 } 767 z.rebalMeta = r 768 769 return z.rebalMeta.saveWithOpts(ctx, z.serverPools[0], noLockOpts) 770 } 771 772 func auditLogRebalance(ctx context.Context, apiName, bucket, object, versionID string, err error) { 773 errStr := "" 774 if err != nil { 775 errStr = err.Error() 776 } 777 auditLogInternal(ctx, AuditLogOptions{ 778 Event: "rebalance", 779 APIName: apiName, 780 Bucket: bucket, 781 Object: object, 782 VersionID: versionID, 783 Error: errStr, 784 }) 785 } 786 787 func (z *erasureServerPools) rebalanceObject(ctx context.Context, bucket string, gr *GetObjectReader) (err error) { 788 oi := gr.ObjInfo 789 790 defer func() { 791 gr.Close() 792 auditLogRebalance(ctx, "RebalanceCopyData", oi.Bucket, oi.Name, oi.VersionID, err) 793 }() 794 795 actualSize, err := oi.GetActualSize() 796 if err != nil { 797 return err 798 } 799 800 if oi.isMultipart() { 801 res, err := z.NewMultipartUpload(ctx, bucket, oi.Name, ObjectOptions{ 802 VersionID: oi.VersionID, 803 UserDefined: oi.UserDefined, 804 NoAuditLog: true, 805 }) 806 if err != nil { 807 return fmt.Errorf("rebalanceObject: NewMultipartUpload() %w", err) 808 } 809 defer z.AbortMultipartUpload(ctx, bucket, oi.Name, res.UploadID, ObjectOptions{NoAuditLog: true}) 810 811 parts := make([]CompletePart, len(oi.Parts)) 812 for i, part := range oi.Parts { 813 hr, err := hash.NewReader(ctx, io.LimitReader(gr, part.Size), part.Size, "", "", part.ActualSize) 814 if err != nil { 815 return fmt.Errorf("rebalanceObject: hash.NewReader() %w", err) 816 } 817 pi, err := z.PutObjectPart(ctx, bucket, oi.Name, res.UploadID, 818 part.Number, 819 NewPutObjReader(hr), 820 ObjectOptions{ 821 PreserveETag: part.ETag, // Preserve original ETag to ensure same metadata. 822 IndexCB: func() []byte { 823 return part.Index // Preserve part Index to ensure decompression works. 824 }, 825 NoAuditLog: true, 826 }) 827 if err != nil { 828 return fmt.Errorf("rebalanceObject: PutObjectPart() %w", err) 829 } 830 parts[i] = CompletePart{ 831 ETag: pi.ETag, 832 PartNumber: pi.PartNumber, 833 } 834 } 835 _, err = z.CompleteMultipartUpload(ctx, bucket, oi.Name, res.UploadID, parts, ObjectOptions{ 836 DataMovement: true, 837 MTime: oi.ModTime, 838 NoAuditLog: true, 839 }) 840 if err != nil { 841 err = fmt.Errorf("rebalanceObject: CompleteMultipartUpload() %w", err) 842 } 843 return err 844 } 845 846 hr, err := hash.NewReader(ctx, gr, oi.Size, "", "", actualSize) 847 if err != nil { 848 return fmt.Errorf("rebalanceObject: hash.NewReader() %w", err) 849 } 850 851 _, err = z.PutObject(ctx, 852 bucket, 853 oi.Name, 854 NewPutObjReader(hr), 855 ObjectOptions{ 856 DataMovement: true, 857 VersionID: oi.VersionID, 858 MTime: oi.ModTime, 859 UserDefined: oi.UserDefined, 860 PreserveETag: oi.ETag, // Preserve original ETag to ensure same metadata. 861 IndexCB: func() []byte { 862 return oi.Parts[0].Index // Preserve part Index to ensure decompression works. 863 }, 864 NoAuditLog: true, 865 }) 866 if err != nil { 867 err = fmt.Errorf("rebalanceObject: PutObject() %w", err) 868 } 869 return err 870 } 871 872 func (z *erasureServerPools) StartRebalance() { 873 z.rebalMu.Lock() 874 if z.rebalMeta == nil || !z.rebalMeta.StoppedAt.IsZero() { // rebalance not running, nothing to do 875 z.rebalMu.Unlock() 876 return 877 } 878 ctx, cancel := context.WithCancel(GlobalContext) 879 z.rebalMeta.cancel = cancel // to be used when rebalance-stop is called 880 z.rebalMu.Unlock() 881 882 z.rebalMu.RLock() 883 participants := make([]bool, len(z.rebalMeta.PoolStats)) 884 for i, ps := range z.rebalMeta.PoolStats { 885 // skip pools which have completed rebalancing 886 if ps.Info.Status != rebalStarted { 887 continue 888 } 889 890 participants[i] = ps.Participating 891 } 892 z.rebalMu.RUnlock() 893 894 for poolIdx, doRebalance := range participants { 895 if !doRebalance { 896 continue 897 } 898 // nothing to do if this node is not pool's first node (i.e pool's rebalance 'leader'). 899 if !globalEndpoints[poolIdx].Endpoints[0].IsLocal { 900 continue 901 } 902 903 go func(idx int) { 904 stopfn := globalRebalanceMetrics.log(rebalanceMetricRebalanceBuckets, idx) 905 err := z.rebalanceBuckets(ctx, idx) 906 stopfn(err) 907 }(poolIdx) 908 } 909 } 910 911 // StopRebalance signals the rebalance goroutine running on this node (if any) 912 // to stop, using the context.CancelFunc(s) saved at the time ofStartRebalance. 913 func (z *erasureServerPools) StopRebalance() error { 914 z.rebalMu.Lock() 915 defer z.rebalMu.Unlock() 916 917 r := z.rebalMeta 918 if r == nil { // rebalance not running in this node, nothing to do 919 return nil 920 } 921 922 if cancel := r.cancel; cancel != nil { 923 // cancel != nil only on pool leaders 924 r.cancel = nil 925 cancel() 926 } 927 return nil 928 } 929 930 // for rebalance trace support 931 type rebalanceMetrics struct{} 932 933 var globalRebalanceMetrics rebalanceMetrics 934 935 //go:generate stringer -type=rebalanceMetric -trimprefix=rebalanceMetric $GOFILE 936 type rebalanceMetric uint8 937 938 const ( 939 rebalanceMetricRebalanceBuckets rebalanceMetric = iota 940 rebalanceMetricRebalanceBucket 941 rebalanceMetricRebalanceObject 942 rebalanceMetricRebalanceRemoveObject 943 rebalanceMetricSaveMetadata 944 ) 945 946 func rebalanceTrace(r rebalanceMetric, poolIdx int, startTime time.Time, duration time.Duration, err error, path string) madmin.TraceInfo { 947 var errStr string 948 if err != nil { 949 errStr = err.Error() 950 } 951 return madmin.TraceInfo{ 952 TraceType: madmin.TraceRebalance, 953 Time: startTime, 954 NodeName: globalLocalNodeName, 955 FuncName: fmt.Sprintf("rebalance.%s (pool-id=%d)", r.String(), poolIdx), 956 Duration: duration, 957 Path: path, 958 Error: errStr, 959 } 960 } 961 962 func (p *rebalanceMetrics) log(r rebalanceMetric, poolIdx int, paths ...string) func(err error) { 963 startTime := time.Now() 964 return func(err error) { 965 duration := time.Since(startTime) 966 if globalTrace.NumSubscribers(madmin.TraceRebalance) > 0 { 967 globalTrace.Publish(rebalanceTrace(r, poolIdx, startTime, duration, err, strings.Join(paths, " "))) 968 } 969 } 970 }