github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure-sets.go (about) 1 // Copyright (c) 2015-2021 MinIO, Inc. 2 // 3 // This file is part of MinIO Object Storage stack 4 // 5 // This program is free software: you can redistribute it and/or modify 6 // it under the terms of the GNU Affero General Public License as published by 7 // the Free Software Foundation, either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU Affero General Public License for more details. 14 // 15 // You should have received a copy of the GNU Affero General Public License 16 // along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 package cmd 19 20 import ( 21 "context" 22 "encoding/binary" 23 "errors" 24 "fmt" 25 "hash/crc32" 26 "math/rand" 27 "net/http" 28 "reflect" 29 "strings" 30 "sync" 31 "time" 32 33 "github.com/dchest/siphash" 34 "github.com/dustin/go-humanize" 35 "github.com/google/uuid" 36 "github.com/minio/madmin-go/v3" 37 "github.com/minio/minio-go/v7/pkg/set" 38 "github.com/minio/minio-go/v7/pkg/tags" 39 "github.com/minio/minio/internal/dsync" 40 xioutil "github.com/minio/minio/internal/ioutil" 41 "github.com/minio/minio/internal/logger" 42 "github.com/minio/pkg/v2/console" 43 "github.com/minio/pkg/v2/sync/errgroup" 44 ) 45 46 // setsDsyncLockers is encapsulated type for Close() 47 type setsDsyncLockers [][]dsync.NetLocker 48 49 // erasureSets implements ObjectLayer combining a static list of erasure coded 50 // object sets. NOTE: There is no dynamic scaling allowed or intended in 51 // current design. 52 type erasureSets struct { 53 sets []*erasureObjects 54 55 // Reference format. 56 format *formatErasureV3 57 58 // erasureDisks mutex to lock erasureDisks. 59 erasureDisksMu sync.RWMutex 60 61 // Re-ordered list of disks per set. 62 erasureDisks [][]StorageAPI 63 64 // Distributed locker clients. 65 erasureLockers setsDsyncLockers 66 67 // Distributed lock owner (constant per running instance). 68 erasureLockOwner string 69 70 // List of endpoints provided on the command line. 71 endpoints PoolEndpoints 72 73 // String version of all the endpoints, an optimization 74 // to avoid url.String() conversion taking CPU on 75 // large disk setups. 76 endpointStrings []string 77 78 // Total number of sets and the number of disks per set. 79 setCount, setDriveCount int 80 defaultParityCount int 81 82 poolIndex int 83 84 // A channel to send the set index to the MRF when 85 // any disk belonging to that set is connected 86 setReconnectEvent chan int 87 88 // Distribution algorithm of choice. 89 distributionAlgo string 90 deploymentID [16]byte 91 92 lastConnectDisksOpTime time.Time 93 } 94 95 func (s *erasureSets) getDiskMap() map[Endpoint]StorageAPI { 96 diskMap := make(map[Endpoint]StorageAPI) 97 98 s.erasureDisksMu.RLock() 99 defer s.erasureDisksMu.RUnlock() 100 101 for i := 0; i < s.setCount; i++ { 102 for j := 0; j < s.setDriveCount; j++ { 103 disk := s.erasureDisks[i][j] 104 if disk == OfflineDisk { 105 continue 106 } 107 if !disk.IsOnline() { 108 continue 109 } 110 diskMap[disk.Endpoint()] = disk 111 } 112 } 113 return diskMap 114 } 115 116 // Initializes a new StorageAPI from the endpoint argument, returns 117 // StorageAPI and also `format` which exists on the disk. 
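// The endpoint is opened twice: first with cleanup and health checks disabled
// purely to read `format.json`, then closed and re-opened with cleanup and
// health checks enabled for regular use.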
118 func connectEndpoint(endpoint Endpoint) (StorageAPI, *formatErasureV3, []byte, error) { 119 disk, err := newStorageAPI(endpoint, storageOpts{ 120 cleanUp: false, 121 healthCheck: false, 122 }) 123 if err != nil { 124 return nil, nil, nil, err 125 } 126 127 format, formatData, err := loadFormatErasureWithData(disk, false) 128 if err != nil { 129 if errors.Is(err, errUnformattedDisk) { 130 info, derr := disk.DiskInfo(context.TODO(), DiskInfoOptions{}) 131 if derr != nil && info.RootDisk { 132 disk.Close() 133 return nil, nil, nil, fmt.Errorf("Drive: %s is a root drive", disk) 134 } 135 } 136 disk.Close() 137 return nil, nil, nil, fmt.Errorf("Drive: %s returned %w", disk, err) // make sure to '%w' to wrap the error 138 } 139 140 disk.Close() 141 disk, err = newStorageAPI(endpoint, storageOpts{ 142 cleanUp: true, 143 healthCheck: true, 144 }) 145 if err != nil { 146 return nil, nil, nil, err 147 } 148 149 return disk, format, formatData, nil 150 } 151 152 // findDiskIndex - returns the i,j'th position of the input `diskID` against the reference 153 // format, after successful validation. 154 // - i'th position is the set index 155 // - j'th position is the disk index in the current set 156 func findDiskIndexByDiskID(refFormat *formatErasureV3, diskID string) (int, int, error) { 157 if diskID == "" { 158 return -1, -1, errDiskNotFound 159 } 160 if diskID == offlineDiskUUID { 161 return -1, -1, fmt.Errorf("DriveID: %s is offline", diskID) 162 } 163 for i := 0; i < len(refFormat.Erasure.Sets); i++ { 164 for j := 0; j < len(refFormat.Erasure.Sets[0]); j++ { 165 if refFormat.Erasure.Sets[i][j] == diskID { 166 return i, j, nil 167 } 168 } 169 } 170 171 return -1, -1, fmt.Errorf("DriveID: %s not found", diskID) 172 } 173 174 // findDiskIndex - returns the i,j'th position of the input `format` against the reference 175 // format, after successful validation. 176 // - i'th position is the set index 177 // - j'th position is the disk index in the current set 178 func findDiskIndex(refFormat, format *formatErasureV3) (int, int, error) { 179 if err := formatErasureV3Check(refFormat, format); err != nil { 180 return 0, 0, err 181 } 182 183 if format.Erasure.This == offlineDiskUUID { 184 return -1, -1, fmt.Errorf("DriveID: %s is offline", format.Erasure.This) 185 } 186 187 for i := 0; i < len(refFormat.Erasure.Sets); i++ { 188 for j := 0; j < len(refFormat.Erasure.Sets[0]); j++ { 189 if refFormat.Erasure.Sets[i][j] == format.Erasure.This { 190 return i, j, nil 191 } 192 } 193 } 194 195 return -1, -1, fmt.Errorf("DriveID: %s not found", format.Erasure.This) 196 } 197 198 // connectDisks - attempt to connect all the endpoints, loads format 199 // and re-arranges the disks in proper position. 200 func (s *erasureSets) connectDisks() { 201 defer func() { 202 s.lastConnectDisksOpTime = time.Now() 203 }() 204 205 var wg sync.WaitGroup 206 diskMap := s.getDiskMap() 207 for _, endpoint := range s.endpoints.Endpoints { 208 cdisk := diskMap[endpoint] 209 if cdisk != nil && cdisk.IsOnline() { 210 if s.lastConnectDisksOpTime.IsZero() { 211 continue 212 } 213 214 // An online-disk means its a valid disk but it may be a re-connected disk 215 // we verify that here based on LastConn(), however we make sure to avoid 216 // putting it back into the s.erasureDisks by re-placing the disk again. 217 _, setIndex, _ := cdisk.GetDiskLoc() 218 if setIndex != -1 { 219 continue 220 } 221 } 222 if cdisk != nil { 223 // Close previous offline disk. 
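	// The endpoint is then re-probed below and, if its format still matches
	// the reference format, placed back at its original (set, disk) position.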
224 cdisk.Close() 225 } 226 227 wg.Add(1) 228 go func(endpoint Endpoint) { 229 defer wg.Done() 230 disk, format, formatData, err := connectEndpoint(endpoint) 231 if err != nil { 232 if endpoint.IsLocal && errors.Is(err, errUnformattedDisk) { 233 globalBackgroundHealState.pushHealLocalDisks(endpoint) 234 } else { 235 printEndpointError(endpoint, err, true) 236 } 237 return 238 } 239 if disk.IsLocal() && disk.Healing() != nil { 240 globalBackgroundHealState.pushHealLocalDisks(disk.Endpoint()) 241 } 242 s.erasureDisksMu.Lock() 243 setIndex, diskIndex, err := findDiskIndex(s.format, format) 244 if err != nil { 245 printEndpointError(endpoint, err, false) 246 disk.Close() 247 s.erasureDisksMu.Unlock() 248 return 249 } 250 251 if currentDisk := s.erasureDisks[setIndex][diskIndex]; currentDisk != nil { 252 if !reflect.DeepEqual(currentDisk.Endpoint(), disk.Endpoint()) { 253 err = fmt.Errorf("Detected unexpected drive ordering refusing to use the drive: expecting %s, found %s, refusing to use the drive", 254 currentDisk.Endpoint(), disk.Endpoint()) 255 printEndpointError(endpoint, err, false) 256 disk.Close() 257 s.erasureDisksMu.Unlock() 258 return 259 } 260 s.erasureDisks[setIndex][diskIndex].Close() 261 } 262 263 disk.SetDiskID(format.Erasure.This) 264 disk.SetDiskLoc(s.poolIndex, setIndex, diskIndex) 265 disk.SetFormatData(formatData) 266 s.erasureDisks[setIndex][diskIndex] = disk 267 268 if disk.IsLocal() { 269 globalLocalDrivesMu.Lock() 270 if globalIsDistErasure { 271 globalLocalSetDrives[s.poolIndex][setIndex][diskIndex] = disk 272 } 273 for i, ldisk := range globalLocalDrives { 274 _, k, l := ldisk.GetDiskLoc() 275 if k == setIndex && l == diskIndex { 276 globalLocalDrives[i] = disk 277 break 278 } 279 } 280 globalLocalDrivesMu.Unlock() 281 } 282 s.erasureDisksMu.Unlock() 283 }(endpoint) 284 } 285 286 wg.Wait() 287 } 288 289 // monitorAndConnectEndpoints this is a monitoring loop to keep track of disconnected 290 // endpoints by reconnecting them and making sure to place them into right position in 291 // the set topology, this monitoring happens at a given monitoring interval. 292 func (s *erasureSets) monitorAndConnectEndpoints(ctx context.Context, monitorInterval time.Duration) { 293 r := rand.New(rand.NewSource(time.Now().UnixNano())) 294 295 time.Sleep(time.Duration(r.Float64() * float64(time.Second))) 296 297 // Pre-emptively connect the disks if possible. 
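	// The random sleep above (up to one second) staggers the initial connect
	// attempt; after that, drives are re-scanned once per monitorInterval.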
298 s.connectDisks() 299 300 monitor := time.NewTimer(monitorInterval) 301 defer monitor.Stop() 302 303 for { 304 select { 305 case <-ctx.Done(): 306 return 307 case <-monitor.C: 308 if serverDebugLog { 309 console.Debugln("running drive monitoring") 310 } 311 312 s.connectDisks() 313 314 // Reset the timer for next interval 315 monitor.Reset(monitorInterval) 316 } 317 } 318 } 319 320 func (s *erasureSets) GetLockers(setIndex int) func() ([]dsync.NetLocker, string) { 321 return func() ([]dsync.NetLocker, string) { 322 lockers := make([]dsync.NetLocker, len(s.erasureLockers[setIndex])) 323 copy(lockers, s.erasureLockers[setIndex]) 324 return lockers, s.erasureLockOwner 325 } 326 } 327 328 func (s *erasureSets) GetEndpointStrings(setIndex int) func() []string { 329 return func() []string { 330 eps := make([]string, s.setDriveCount) 331 copy(eps, s.endpointStrings[setIndex*s.setDriveCount:setIndex*s.setDriveCount+s.setDriveCount]) 332 return eps 333 } 334 } 335 336 func (s *erasureSets) GetEndpoints(setIndex int) func() []Endpoint { 337 return func() []Endpoint { 338 eps := make([]Endpoint, s.setDriveCount) 339 copy(eps, s.endpoints.Endpoints[setIndex*s.setDriveCount:setIndex*s.setDriveCount+s.setDriveCount]) 340 return eps 341 } 342 } 343 344 // GetDisks returns a closure for a given set, which provides list of disks per set. 345 func (s *erasureSets) GetDisks(setIndex int) func() []StorageAPI { 346 return func() []StorageAPI { 347 s.erasureDisksMu.RLock() 348 defer s.erasureDisksMu.RUnlock() 349 disks := make([]StorageAPI, s.setDriveCount) 350 copy(disks, s.erasureDisks[setIndex]) 351 return disks 352 } 353 } 354 355 // defaultMonitorConnectEndpointInterval is the interval to monitor endpoint connections. 356 // Must be bigger than defaultMonitorNewDiskInterval. 357 const defaultMonitorConnectEndpointInterval = defaultMonitorNewDiskInterval + time.Second*5 358 359 // Initialize new set of erasure coded sets. 360 func newErasureSets(ctx context.Context, endpoints PoolEndpoints, storageDisks []StorageAPI, format *formatErasureV3, defaultParityCount, poolIdx int) (*erasureSets, error) { 361 setCount := len(format.Erasure.Sets) 362 setDriveCount := len(format.Erasure.Sets[0]) 363 364 endpointStrings := make([]string, len(endpoints.Endpoints)) 365 for i, endpoint := range endpoints.Endpoints { 366 endpointStrings[i] = endpoint.String() 367 } 368 369 // Initialize the erasure sets instance. 
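	// The deployment ID parsed from format.ID keys the SIPMOD hashing
	// (see hashKey below), so object-to-set placement stays stable across restarts.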
370 s := &erasureSets{ 371 sets: make([]*erasureObjects, setCount), 372 erasureDisks: make([][]StorageAPI, setCount), 373 erasureLockers: make([][]dsync.NetLocker, setCount), 374 erasureLockOwner: globalLocalNodeName, 375 endpoints: endpoints, 376 endpointStrings: endpointStrings, 377 setCount: setCount, 378 setDriveCount: setDriveCount, 379 defaultParityCount: defaultParityCount, 380 format: format, 381 setReconnectEvent: make(chan int), 382 distributionAlgo: format.Erasure.DistributionAlgo, 383 deploymentID: uuid.MustParse(format.ID), 384 poolIndex: poolIdx, 385 } 386 387 mutex := newNSLock(globalIsDistErasure) 388 389 for i := 0; i < setCount; i++ { 390 s.erasureDisks[i] = make([]StorageAPI, setDriveCount) 391 } 392 393 erasureLockers := map[string]dsync.NetLocker{} 394 for _, endpoint := range endpoints.Endpoints { 395 if _, ok := erasureLockers[endpoint.Host]; !ok { 396 erasureLockers[endpoint.Host] = newLockAPI(endpoint) 397 } 398 } 399 400 var wg sync.WaitGroup 401 var lk sync.Mutex 402 for i := 0; i < setCount; i++ { 403 lockerEpSet := set.NewStringSet() 404 for j := 0; j < setDriveCount; j++ { 405 wg.Add(1) 406 go func(i int, endpoint Endpoint) { 407 defer wg.Done() 408 409 lk.Lock() 410 // Only add lockers only one per endpoint and per erasure set. 411 if locker, ok := erasureLockers[endpoint.Host]; ok && !lockerEpSet.Contains(endpoint.Host) { 412 lockerEpSet.Add(endpoint.Host) 413 s.erasureLockers[i] = append(s.erasureLockers[i], locker) 414 } 415 lk.Unlock() 416 }(i, endpoints.Endpoints[i*setDriveCount+j]) 417 } 418 } 419 wg.Wait() 420 421 for i := 0; i < setCount; i++ { 422 wg.Add(1) 423 go func(i int) { 424 defer wg.Done() 425 426 var innerWg sync.WaitGroup 427 for j := 0; j < setDriveCount; j++ { 428 disk := storageDisks[i*setDriveCount+j] 429 if disk == nil { 430 continue 431 } 432 433 if disk.IsLocal() && globalIsDistErasure { 434 globalLocalDrivesMu.RLock() 435 ldisk := globalLocalSetDrives[poolIdx][i][j] 436 if ldisk == nil { 437 globalLocalDrivesMu.RUnlock() 438 continue 439 } 440 disk.Close() 441 disk = ldisk 442 globalLocalDrivesMu.RUnlock() 443 } 444 445 innerWg.Add(1) 446 go func(disk StorageAPI, i, j int) { 447 defer innerWg.Done() 448 diskID, err := disk.GetDiskID() 449 if err != nil { 450 if !errors.Is(err, errUnformattedDisk) { 451 logger.LogIf(ctx, err) 452 } 453 return 454 } 455 if diskID == "" { 456 return 457 } 458 m, n, err := findDiskIndexByDiskID(format, diskID) 459 if err != nil { 460 logger.LogIf(ctx, err) 461 return 462 } 463 if m != i || n != j { 464 logger.LogIf(ctx, fmt.Errorf("Detected unexpected drive ordering refusing to use the drive - poolID: %s, found drive mounted at (set=%s, drive=%s) expected mount at (set=%s, drive=%s): %s(%s)", humanize.Ordinal(poolIdx+1), humanize.Ordinal(m+1), humanize.Ordinal(n+1), humanize.Ordinal(i+1), humanize.Ordinal(j+1), disk, diskID)) 465 s.erasureDisks[i][j] = &unrecognizedDisk{storage: disk} 466 return 467 } 468 disk.SetDiskLoc(s.poolIndex, m, n) 469 s.erasureDisks[m][n] = disk 470 }(disk, i, j) 471 } 472 innerWg.Wait() 473 474 // Initialize erasure objects for a given set. 475 s.sets[i] = &erasureObjects{ 476 setIndex: i, 477 poolIndex: poolIdx, 478 setDriveCount: setDriveCount, 479 defaultParityCount: defaultParityCount, 480 getDisks: s.GetDisks(i), 481 getLockers: s.GetLockers(i), 482 getEndpoints: s.GetEndpoints(i), 483 getEndpointStrings: s.GetEndpointStrings(i), 484 nsMutex: mutex, 485 } 486 }(i) 487 } 488 489 wg.Wait() 490 491 // start cleanup stale uploads go-routine. 
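	// Both cleanup loops below fan out to every erasure set on each timer tick
	// and wait for all sets to finish before resetting the timer.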
492 go s.cleanupStaleUploads(ctx) 493 494 // start cleanup of deleted objects. 495 go s.cleanupDeletedObjects(ctx) 496 497 // Start the disk monitoring and connect routine. 498 if !globalIsTesting { 499 go s.monitorAndConnectEndpoints(ctx, defaultMonitorConnectEndpointInterval) 500 } 501 502 return s, nil 503 } 504 505 // cleanup ".trash/" folder every 5m minutes with sufficient sleep cycles, between each 506 // deletes a dynamic sleeper is used with a factor of 10 ratio with max delay between 507 // deletes to be 2 seconds. 508 func (s *erasureSets) cleanupDeletedObjects(ctx context.Context) { 509 timer := time.NewTimer(globalAPIConfig.getDeleteCleanupInterval()) 510 defer timer.Stop() 511 512 for { 513 select { 514 case <-ctx.Done(): 515 return 516 case <-timer.C: 517 var wg sync.WaitGroup 518 for _, set := range s.sets { 519 wg.Add(1) 520 go func(set *erasureObjects) { 521 defer wg.Done() 522 if set == nil { 523 return 524 } 525 set.cleanupDeletedObjects(ctx) 526 }(set) 527 } 528 wg.Wait() 529 530 // Reset for the next interval 531 timer.Reset(globalAPIConfig.getDeleteCleanupInterval()) 532 } 533 } 534 } 535 536 func (s *erasureSets) cleanupStaleUploads(ctx context.Context) { 537 timer := time.NewTimer(globalAPIConfig.getStaleUploadsCleanupInterval()) 538 defer timer.Stop() 539 540 for { 541 select { 542 case <-ctx.Done(): 543 return 544 case <-timer.C: 545 var wg sync.WaitGroup 546 for _, set := range s.sets { 547 wg.Add(1) 548 go func(set *erasureObjects) { 549 defer wg.Done() 550 if set == nil { 551 return 552 } 553 set.cleanupStaleUploads(ctx, globalAPIConfig.getStaleUploadsExpiry()) 554 }(set) 555 } 556 wg.Wait() 557 558 // Reset for the next interval 559 timer.Reset(globalAPIConfig.getStaleUploadsCleanupInterval()) 560 } 561 } 562 } 563 564 type auditObjectOp struct { 565 Name string `json:"name"` 566 Pool int `json:"poolId"` 567 Set int `json:"setId"` 568 Disks []string `json:"disks"` 569 } 570 571 // Add erasure set information to the current context 572 func auditObjectErasureSet(ctx context.Context, object string, set *erasureObjects) { 573 if len(logger.AuditTargets()) == 0 { 574 return 575 } 576 577 op := auditObjectOp{ 578 Name: decodeDirObject(object), 579 Pool: set.poolIndex + 1, 580 Set: set.setIndex + 1, 581 Disks: set.getEndpointStrings(), 582 } 583 584 logger.GetReqInfo(ctx).AppendTags("objectLocation", op) 585 } 586 587 // NewNSLock - initialize a new namespace RWLocker instance. 588 func (s *erasureSets) NewNSLock(bucket string, objects ...string) RWLocker { 589 if len(objects) == 1 { 590 return s.getHashedSet(objects[0]).NewNSLock(bucket, objects...) 591 } 592 return s.getHashedSet("").NewNSLock(bucket, objects...) 593 } 594 595 // SetDriveCount returns the current drives per set. 596 func (s *erasureSets) SetDriveCount() int { 597 return s.setDriveCount 598 } 599 600 // ParityCount returns the default parity count used while erasure 601 // coding objects 602 func (s *erasureSets) ParityCount() int { 603 return s.defaultParityCount 604 } 605 606 // StorageInfo - combines output of StorageInfo across all erasure coded object sets. 607 func (s *erasureSets) StorageInfo(ctx context.Context) StorageInfo { 608 var storageInfo madmin.StorageInfo 609 610 storageInfos := make([]madmin.StorageInfo, len(s.sets)) 611 612 g := errgroup.WithNErrs(len(s.sets)) 613 for index := range s.sets { 614 index := index 615 g.Go(func() error { 616 storageInfos[index] = s.sets[index].StorageInfo(ctx) 617 return nil 618 }, index) 619 } 620 621 // Wait for the go routines. 
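	// The errgroup is used purely for fan-out here; every goroutine returns nil,
	// so only the collected StorageInfo results are inspected.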
622 g.Wait() 623 624 for _, lstorageInfo := range storageInfos { 625 storageInfo.Disks = append(storageInfo.Disks, lstorageInfo.Disks...) 626 } 627 628 return storageInfo 629 } 630 631 // StorageInfo - combines output of StorageInfo across all erasure coded object sets. 632 func (s *erasureSets) LocalStorageInfo(ctx context.Context, metrics bool) StorageInfo { 633 var storageInfo StorageInfo 634 635 storageInfos := make([]StorageInfo, len(s.sets)) 636 637 g := errgroup.WithNErrs(len(s.sets)) 638 for index := range s.sets { 639 index := index 640 g.Go(func() error { 641 storageInfos[index] = s.sets[index].LocalStorageInfo(ctx, metrics) 642 return nil 643 }, index) 644 } 645 646 // Wait for the go routines. 647 g.Wait() 648 649 for _, lstorageInfo := range storageInfos { 650 storageInfo.Disks = append(storageInfo.Disks, lstorageInfo.Disks...) 651 } 652 653 return storageInfo 654 } 655 656 // Shutdown shutsdown all erasure coded sets in parallel 657 // returns error upon first error. 658 func (s *erasureSets) Shutdown(ctx context.Context) error { 659 g := errgroup.WithNErrs(len(s.sets)) 660 661 for index := range s.sets { 662 index := index 663 g.Go(func() error { 664 return s.sets[index].Shutdown(ctx) 665 }, index) 666 } 667 668 for _, err := range g.Wait() { 669 if err != nil { 670 return err 671 } 672 } 673 select { 674 case _, ok := <-s.setReconnectEvent: 675 if ok { 676 xioutil.SafeClose(s.setReconnectEvent) 677 } 678 default: 679 xioutil.SafeClose(s.setReconnectEvent) 680 } 681 return nil 682 } 683 684 // hashes the key returning an integer based on the input algorithm. 685 // This function currently supports 686 // - CRCMOD 687 // - SIPMOD 688 // - all new algos. 689 func sipHashMod(key string, cardinality int, id [16]byte) int { 690 if cardinality <= 0 { 691 return -1 692 } 693 // use the faster version as per siphash docs 694 // https://github.com/dchest/siphash#usage 695 k0, k1 := binary.LittleEndian.Uint64(id[0:8]), binary.LittleEndian.Uint64(id[8:16]) 696 sum64 := siphash.Hash(k0, k1, []byte(key)) 697 return int(sum64 % uint64(cardinality)) 698 } 699 700 func crcHashMod(key string, cardinality int) int { 701 if cardinality <= 0 { 702 return -1 703 } 704 keyCrc := crc32.Checksum([]byte(key), crc32.IEEETable) 705 return int(keyCrc % uint32(cardinality)) 706 } 707 708 func hashKey(algo string, key string, cardinality int, id [16]byte) int { 709 switch algo { 710 case formatErasureVersionV2DistributionAlgoV1: 711 return crcHashMod(key, cardinality) 712 case formatErasureVersionV3DistributionAlgoV2, formatErasureVersionV3DistributionAlgoV3: 713 return sipHashMod(key, cardinality, id) 714 default: 715 // Unknown algorithm returns -1, also if cardinality is lesser than 0. 716 return -1 717 } 718 } 719 720 // Returns always a same erasure coded set for a given input. 721 func (s *erasureSets) getHashedSetIndex(input string) int { 722 return hashKey(s.distributionAlgo, input, len(s.sets), s.deploymentID) 723 } 724 725 // Returns always a same erasure coded set for a given input. 726 func (s *erasureSets) getHashedSet(input string) (set *erasureObjects) { 727 return s.sets[s.getHashedSetIndex(input)] 728 } 729 730 // listDeletedBuckets lists deleted buckets from all disks. 
731 func listDeletedBuckets(ctx context.Context, storageDisks []StorageAPI, delBuckets map[string]VolInfo, readQuorum int) error { 732 g := errgroup.WithNErrs(len(storageDisks)) 733 var mu sync.Mutex 734 for index := range storageDisks { 735 index := index 736 g.Go(func() error { 737 if storageDisks[index] == nil { 738 // we ignore disk not found errors 739 return nil 740 } 741 volsInfo, err := storageDisks[index].ListDir(ctx, "", minioMetaBucket, pathJoin(bucketMetaPrefix, deletedBucketsPrefix), -1) 742 if err != nil { 743 if errors.Is(err, errFileNotFound) { 744 return nil 745 } 746 return err 747 } 748 for _, volName := range volsInfo { 749 vi, err := storageDisks[index].StatVol(ctx, pathJoin(minioMetaBucket, bucketMetaPrefix, deletedBucketsPrefix, volName)) 750 if err == nil { 751 vi.Name = strings.TrimSuffix(volName, SlashSeparator) 752 mu.Lock() 753 if _, ok := delBuckets[volName]; !ok { 754 delBuckets[volName] = vi 755 } 756 mu.Unlock() 757 } 758 } 759 return nil 760 }, index) 761 } 762 return reduceReadQuorumErrs(ctx, g.Wait(), bucketMetadataOpIgnoredErrs, readQuorum) 763 } 764 765 // --- Object Operations --- 766 767 // GetObjectNInfo - returns object info and locked object ReadCloser 768 func (s *erasureSets) GetObjectNInfo(ctx context.Context, bucket, object string, rs *HTTPRangeSpec, h http.Header, opts ObjectOptions) (gr *GetObjectReader, err error) { 769 set := s.getHashedSet(object) 770 return set.GetObjectNInfo(ctx, bucket, object, rs, h, opts) 771 } 772 773 // PutObject - writes an object to hashedSet based on the object name. 774 func (s *erasureSets) PutObject(ctx context.Context, bucket string, object string, data *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) { 775 set := s.getHashedSet(object) 776 return set.PutObject(ctx, bucket, object, data, opts) 777 } 778 779 // GetObjectInfo - reads object metadata from the hashedSet based on the object name. 780 func (s *erasureSets) GetObjectInfo(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) { 781 set := s.getHashedSet(object) 782 return set.GetObjectInfo(ctx, bucket, object, opts) 783 } 784 785 func (s *erasureSets) deletePrefix(ctx context.Context, bucket string, prefix string) error { 786 var wg sync.WaitGroup 787 wg.Add(len(s.sets)) 788 for _, s := range s.sets { 789 go func(s *erasureObjects) { 790 defer wg.Done() 791 // This is a force delete, no reason to throw errors. 792 s.DeleteObject(ctx, bucket, prefix, ObjectOptions{DeletePrefix: true}) 793 }(s) 794 } 795 wg.Wait() 796 return nil 797 } 798 799 // DeleteObject - deletes an object from the hashedSet based on the object name. 800 func (s *erasureSets) DeleteObject(ctx context.Context, bucket string, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) { 801 if opts.DeletePrefix && !opts.DeletePrefixObject { 802 err := s.deletePrefix(ctx, bucket, object) 803 return ObjectInfo{}, err 804 } 805 set := s.getHashedSet(object) 806 return set.DeleteObject(ctx, bucket, object, opts) 807 } 808 809 // DeleteObjects - bulk delete of objects 810 // Bulk delete is only possible within one set. 
For that purpose 811 // objects are group by set first, and then bulk delete is invoked 812 // for each set, the error response of each delete will be returned 813 func (s *erasureSets) DeleteObjects(ctx context.Context, bucket string, objects []ObjectToDelete, opts ObjectOptions) ([]DeletedObject, []error) { 814 type delObj struct { 815 // Set index associated to this object 816 setIndex int 817 // Original index from the list of arguments 818 // where this object is passed 819 origIndex int 820 // object to delete 821 object ObjectToDelete 822 } 823 824 // Transform []delObj to the list of object names 825 toNames := func(delObjs []delObj) []ObjectToDelete { 826 objs := make([]ObjectToDelete, len(delObjs)) 827 for i, obj := range delObjs { 828 objs[i] = obj.object 829 } 830 return objs 831 } 832 833 // The result of delete operation on all passed objects 834 delErrs := make([]error, len(objects)) 835 836 // The result of delete objects 837 delObjects := make([]DeletedObject, len(objects)) 838 839 // A map between a set and its associated objects 840 objSetMap := make(map[int][]delObj) 841 842 // Group objects by set index 843 for i, object := range objects { 844 index := s.getHashedSetIndex(object.ObjectName) 845 objSetMap[index] = append(objSetMap[index], delObj{setIndex: index, origIndex: i, object: object}) 846 } 847 848 // Invoke bulk delete on objects per set and save 849 // the result of the delete operation 850 var wg sync.WaitGroup 851 var mu sync.Mutex 852 wg.Add(len(objSetMap)) 853 for setIdx, objsGroup := range objSetMap { 854 go func(set *erasureObjects, group []delObj) { 855 defer wg.Done() 856 dobjects, errs := set.DeleteObjects(ctx, bucket, toNames(group), opts) 857 mu.Lock() 858 defer mu.Unlock() 859 for i, obj := range group { 860 delErrs[obj.origIndex] = errs[i] 861 delObjects[obj.origIndex] = dobjects[i] 862 } 863 }(s.sets[setIdx], objsGroup) 864 } 865 wg.Wait() 866 867 return delObjects, delErrs 868 } 869 870 // CopyObject - copies objects from one hashedSet to another hashedSet, on server side. 871 func (s *erasureSets) CopyObject(ctx context.Context, srcBucket, srcObject, dstBucket, dstObject string, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (objInfo ObjectInfo, err error) { 872 srcSet := s.getHashedSet(srcObject) 873 dstSet := s.getHashedSet(dstObject) 874 875 cpSrcDstSame := srcSet == dstSet 876 // Check if this request is only metadata update. 877 if cpSrcDstSame && srcInfo.metadataOnly { 878 // Version ID is set for the destination and source == destination version ID. 879 // perform an in-place update. 880 if dstOpts.VersionID != "" && srcOpts.VersionID == dstOpts.VersionID { 881 srcInfo.Reader.Close() // We are not interested in the reader stream at this point close it. 882 return srcSet.CopyObject(ctx, srcBucket, srcObject, dstBucket, dstObject, srcInfo, srcOpts, dstOpts) 883 } 884 // Destination is not versioned and source version ID is empty 885 // perform an in-place update. 886 if !dstOpts.Versioned && srcOpts.VersionID == "" { 887 srcInfo.Reader.Close() // We are not interested in the reader stream at this point close it. 888 return srcSet.CopyObject(ctx, srcBucket, srcObject, dstBucket, dstObject, srcInfo, srcOpts, dstOpts) 889 } 890 // CopyObject optimization where we don't create an entire copy 891 // of the content, instead we add a reference, we disallow legacy 892 // objects to be self referenced in this manner so make sure 893 // that we actually create a new dataDir for legacy objects. 
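	// When that branch is taken only a new version reference is written
	// (versionOnly is set), leaving the existing object data in place.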
894 if dstOpts.Versioned && srcOpts.VersionID != dstOpts.VersionID && !srcInfo.Legacy { 895 srcInfo.versionOnly = true 896 srcInfo.Reader.Close() // We are not interested in the reader stream at this point close it. 897 return srcSet.CopyObject(ctx, srcBucket, srcObject, dstBucket, dstObject, srcInfo, srcOpts, dstOpts) 898 } 899 } 900 901 putOpts := ObjectOptions{ 902 ServerSideEncryption: dstOpts.ServerSideEncryption, 903 UserDefined: srcInfo.UserDefined, 904 Versioned: dstOpts.Versioned, 905 VersionID: dstOpts.VersionID, 906 MTime: dstOpts.MTime, 907 } 908 909 return dstSet.putObject(ctx, dstBucket, dstObject, srcInfo.PutObjReader, putOpts) 910 } 911 912 func (s *erasureSets) ListMultipartUploads(ctx context.Context, bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (result ListMultipartsInfo, err error) { 913 // In list multipart uploads we are going to treat input prefix as the object, 914 // this means that we are not supporting directory navigation. 915 set := s.getHashedSet(prefix) 916 return set.ListMultipartUploads(ctx, bucket, prefix, keyMarker, uploadIDMarker, delimiter, maxUploads) 917 } 918 919 // Initiate a new multipart upload on a hashedSet based on object name. 920 func (s *erasureSets) NewMultipartUpload(ctx context.Context, bucket, object string, opts ObjectOptions) (res *NewMultipartUploadResult, err error) { 921 set := s.getHashedSet(object) 922 return set.NewMultipartUpload(ctx, bucket, object, opts) 923 } 924 925 // PutObjectPart - writes part of an object to hashedSet based on the object name. 926 func (s *erasureSets) PutObjectPart(ctx context.Context, bucket, object, uploadID string, partID int, data *PutObjReader, opts ObjectOptions) (info PartInfo, err error) { 927 set := s.getHashedSet(object) 928 return set.PutObjectPart(ctx, bucket, object, uploadID, partID, data, opts) 929 } 930 931 // GetMultipartInfo - return multipart metadata info uploaded at hashedSet. 932 func (s *erasureSets) GetMultipartInfo(ctx context.Context, bucket, object, uploadID string, opts ObjectOptions) (result MultipartInfo, err error) { 933 set := s.getHashedSet(object) 934 return set.GetMultipartInfo(ctx, bucket, object, uploadID, opts) 935 } 936 937 // ListObjectParts - lists all uploaded parts to an object in hashedSet. 938 func (s *erasureSets) ListObjectParts(ctx context.Context, bucket, object, uploadID string, partNumberMarker int, maxParts int, opts ObjectOptions) (result ListPartsInfo, err error) { 939 set := s.getHashedSet(object) 940 return set.ListObjectParts(ctx, bucket, object, uploadID, partNumberMarker, maxParts, opts) 941 } 942 943 // Aborts an in-progress multipart operation on hashedSet based on the object name. 944 func (s *erasureSets) AbortMultipartUpload(ctx context.Context, bucket, object, uploadID string, opts ObjectOptions) error { 945 set := s.getHashedSet(object) 946 return set.AbortMultipartUpload(ctx, bucket, object, uploadID, opts) 947 } 948 949 // CompleteMultipartUpload - completes a pending multipart transaction, on hashedSet based on object name. 950 func (s *erasureSets) CompleteMultipartUpload(ctx context.Context, bucket, object, uploadID string, uploadedParts []CompletePart, opts ObjectOptions) (objInfo ObjectInfo, err error) { 951 set := s.getHashedSet(object) 952 return set.CompleteMultipartUpload(ctx, bucket, object, uploadID, uploadedParts, opts) 953 } 954 955 /* 956 957 All disks online 958 ----------------- 959 - All Unformatted - format all and return success. 960 - Some Unformatted - format all and return success. 
961 - Any JBOD inconsistent - return failure 962 - Some are corrupt (missing format.json) - return failure 963 - Any unrecognized disks - return failure 964 965 Some disks are offline and we have quorum. 966 ----------------- 967 - Some unformatted - format all and return success, 968 treat disks offline as corrupted. 969 - Any JBOD inconsistent - return failure 970 - Some are corrupt (missing format.json) 971 - Any unrecognized disks - return failure 972 973 No read quorum 974 ----------------- 975 failure for all cases. 976 977 // Pseudo code for managing `format.json`. 978 979 // Generic checks. 980 if (no quorum) return error 981 if (any disk is corrupt) return error // Always error 982 if (jbod inconsistent) return error // Always error. 983 if (disks not recognized) // Always error. 984 985 // Specific checks. 986 if (all disks online) 987 if (all disks return format.json) 988 if (jbod consistent) 989 if (all disks recognized) 990 return 991 else 992 if (all disks return format.json not found) 993 return error 994 else (some disks return format.json not found) 995 (heal format) 996 return 997 fi 998 fi 999 else 1000 if (some disks return format.json not found) 1001 // Offline disks are marked as dead. 1002 (heal format) // Offline disks should be marked as dead. 1003 return success 1004 fi 1005 fi 1006 */ 1007 1008 func formatsToDrivesInfo(endpoints Endpoints, formats []*formatErasureV3, sErrs []error) (beforeDrives []madmin.HealDriveInfo) { 1009 beforeDrives = make([]madmin.HealDriveInfo, len(endpoints)) 1010 // Existing formats are available (i.e. ok), so save it in 1011 // result, also populate disks to be healed. 1012 for i, format := range formats { 1013 drive := endpoints.GetString(i) 1014 state := madmin.DriveStateCorrupt 1015 switch { 1016 case format != nil: 1017 state = madmin.DriveStateOk 1018 case sErrs[i] == errUnformattedDisk: 1019 state = madmin.DriveStateMissing 1020 case sErrs[i] == errDiskNotFound: 1021 state = madmin.DriveStateOffline 1022 } 1023 beforeDrives[i] = madmin.HealDriveInfo{ 1024 UUID: func() string { 1025 if format != nil { 1026 return format.Erasure.This 1027 } 1028 return "" 1029 }(), 1030 Endpoint: drive, 1031 State: state, 1032 } 1033 } 1034 1035 return beforeDrives 1036 } 1037 1038 // HealFormat - heals missing `format.json` on fresh unformatted disks. 1039 func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.HealResultItem, err error) { 1040 storageDisks, _ := initStorageDisksWithErrors(s.endpoints.Endpoints, storageOpts{ 1041 cleanUp: false, 1042 healthCheck: false, 1043 }) 1044 1045 defer func(storageDisks []StorageAPI) { 1046 if err != nil { 1047 closeStorageDisks(storageDisks...) 1048 } 1049 }(storageDisks) 1050 1051 formats, sErrs := loadFormatErasureAll(storageDisks, true) 1052 if err = checkFormatErasureValues(formats, storageDisks, s.setDriveCount); err != nil { 1053 return madmin.HealResultItem{}, err 1054 } 1055 1056 refFormat, err := getFormatErasureInQuorum(formats) 1057 if err != nil { 1058 return res, err 1059 } 1060 1061 // Prepare heal-result 1062 res = madmin.HealResultItem{ 1063 Type: madmin.HealItemMetadata, 1064 Detail: "disk-format", 1065 DiskCount: s.setCount * s.setDriveCount, 1066 SetCount: s.setCount, 1067 } 1068 1069 // Fetch all the drive info status. 
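	// The After list below starts as a copy of Before; entries are flipped to
	// DriveStateOk further down only for drives assigned a new format.json.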
1070 beforeDrives := formatsToDrivesInfo(s.endpoints.Endpoints, formats, sErrs) 1071 1072 res.After.Drives = make([]madmin.HealDriveInfo, len(beforeDrives)) 1073 res.Before.Drives = make([]madmin.HealDriveInfo, len(beforeDrives)) 1074 // Copy "after" drive state too from before. 1075 for k, v := range beforeDrives { 1076 res.Before.Drives[k] = v 1077 res.After.Drives[k] = v 1078 } 1079 1080 if countErrs(sErrs, errUnformattedDisk) == 0 { 1081 return res, errNoHealRequired 1082 } 1083 1084 if !reflect.DeepEqual(s.format, refFormat) { 1085 // Format is corrupted and unrecognized by the running instance. 1086 logger.LogIf(ctx, fmt.Errorf("Unable to heal the newly replaced drives due to format.json inconsistencies, please engage MinIO support for further assistance: %w", 1087 errCorruptedFormat)) 1088 return res, errCorruptedFormat 1089 } 1090 1091 formatOpID := mustGetUUID() 1092 1093 // Initialize a new set of set formats which will be written to disk. 1094 newFormatSets, currentDisksInfo := newHealFormatSets(refFormat, s.setCount, s.setDriveCount, formats, sErrs) 1095 1096 if !dryRun { 1097 tmpNewFormats := make([]*formatErasureV3, s.setCount*s.setDriveCount) 1098 for i := range newFormatSets { 1099 for j := range newFormatSets[i] { 1100 if newFormatSets[i][j] == nil { 1101 continue 1102 } 1103 res.After.Drives[i*s.setDriveCount+j].UUID = newFormatSets[i][j].Erasure.This 1104 res.After.Drives[i*s.setDriveCount+j].State = madmin.DriveStateOk 1105 tmpNewFormats[i*s.setDriveCount+j] = newFormatSets[i][j] 1106 } 1107 } 1108 1109 // Save new formats `format.json` on unformatted disks. 1110 for index, format := range tmpNewFormats { 1111 if storageDisks[index] == nil || format == nil { 1112 continue 1113 } 1114 if err := saveFormatErasure(storageDisks[index], format, formatOpID); err != nil { 1115 logger.LogIf(ctx, fmt.Errorf("Drive %s failed to write updated 'format.json': %v", storageDisks[index], err)) 1116 storageDisks[index].Close() 1117 tmpNewFormats[index] = nil // this disk failed to write new format 1118 } 1119 } 1120 1121 s.erasureDisksMu.Lock() 1122 1123 for index, format := range tmpNewFormats { 1124 if format == nil { 1125 continue 1126 } 1127 1128 m, n, err := findDiskIndexByDiskID(refFormat, format.Erasure.This) 1129 if err != nil { 1130 logger.LogIf(ctx, err) 1131 continue 1132 } 1133 1134 if s.erasureDisks[m][n] != nil { 1135 s.erasureDisks[m][n].Close() 1136 } 1137 1138 if disk := storageDisks[index]; disk != nil { 1139 if disk.IsLocal() { 1140 disk.SetDiskLoc(s.poolIndex, m, n) 1141 1142 xldisk, ok := disk.(*xlStorageDiskIDCheck) 1143 if ok { 1144 _, commonDeletes := calcCommonWritesDeletes(currentDisksInfo[m], (s.setDriveCount+1)/2) 1145 xldisk.totalDeletes.Store(commonDeletes) 1146 xldisk.storage.setDeleteAttribute(commonDeletes) 1147 1148 if globalDriveMonitoring { 1149 go xldisk.monitorDiskWritable(xldisk.diskCtx) 1150 } 1151 } 1152 } else { 1153 disk.Close() // Close the remote storage client, re-initialize with healthchecks. 
1154 disk, err = newStorageRESTClient(disk.Endpoint(), true, globalGrid.Load()) 1155 if err != nil { 1156 continue 1157 } 1158 disk.SetDiskLoc(s.poolIndex, m, n) 1159 } 1160 1161 s.erasureDisks[m][n] = disk 1162 1163 if disk.IsLocal() { 1164 globalLocalDrivesMu.Lock() 1165 if globalIsDistErasure { 1166 globalLocalSetDrives[s.poolIndex][m][n] = disk 1167 } 1168 for i, ldisk := range globalLocalDrives { 1169 _, k, l := ldisk.GetDiskLoc() 1170 if k == m && l == n { 1171 globalLocalDrives[i] = disk 1172 break 1173 } 1174 } 1175 globalLocalDrivesMu.Unlock() 1176 } 1177 } 1178 } 1179 1180 s.erasureDisksMu.Unlock() 1181 } 1182 1183 return res, nil 1184 } 1185 1186 // HealObject - heals inconsistent object on a hashedSet based on object name. 1187 func (s *erasureSets) HealObject(ctx context.Context, bucket, object, versionID string, opts madmin.HealOpts) (madmin.HealResultItem, error) { 1188 return s.getHashedSet(object).HealObject(ctx, bucket, object, versionID, opts) 1189 } 1190 1191 // PutObjectMetadata - replace or add metadata to an existing object/version 1192 func (s *erasureSets) PutObjectMetadata(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) { 1193 er := s.getHashedSet(object) 1194 return er.PutObjectMetadata(ctx, bucket, object, opts) 1195 } 1196 1197 // DecomTieredObject - moves tiered object to another pool during decommissioning. 1198 func (s *erasureSets) DecomTieredObject(ctx context.Context, bucket, object string, fi FileInfo, opts ObjectOptions) error { 1199 er := s.getHashedSet(object) 1200 return er.DecomTieredObject(ctx, bucket, object, fi, opts) 1201 } 1202 1203 // PutObjectTags - replace or add tags to an existing object 1204 func (s *erasureSets) PutObjectTags(ctx context.Context, bucket, object string, tags string, opts ObjectOptions) (ObjectInfo, error) { 1205 er := s.getHashedSet(object) 1206 return er.PutObjectTags(ctx, bucket, object, tags, opts) 1207 } 1208 1209 // DeleteObjectTags - delete object tags from an existing object 1210 func (s *erasureSets) DeleteObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) { 1211 er := s.getHashedSet(object) 1212 return er.DeleteObjectTags(ctx, bucket, object, opts) 1213 } 1214 1215 // GetObjectTags - get object tags from an existing object 1216 func (s *erasureSets) GetObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (*tags.Tags, error) { 1217 er := s.getHashedSet(object) 1218 return er.GetObjectTags(ctx, bucket, object, opts) 1219 } 1220 1221 // TransitionObject - transition object content to target tier. 1222 func (s *erasureSets) TransitionObject(ctx context.Context, bucket, object string, opts ObjectOptions) error { 1223 return s.getHashedSet(object).TransitionObject(ctx, bucket, object, opts) 1224 } 1225 1226 // RestoreTransitionedObject - restore transitioned object content locally on this cluster. 1227 func (s *erasureSets) RestoreTransitionedObject(ctx context.Context, bucket, object string, opts ObjectOptions) error { 1228 return s.getHashedSet(object).RestoreTransitionedObject(ctx, bucket, object, opts) 1229 } 1230 1231 // CheckAbandonedParts - check object for abandoned parts. 1232 func (s *erasureSets) CheckAbandonedParts(ctx context.Context, bucket, object string, opts madmin.HealOpts) error { 1233 return s.getHashedSet(object).checkAbandonedParts(ctx, bucket, object, opts) 1234 }
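// exampleObjectToSetMapping is an illustrative sketch, not part of the server
// code path: it shows how an object name is mapped to an erasure set. The
// deployment ID and set count below are made-up placeholders; in a running
// cluster they come from the reference format (format.ID and
// len(format.Erasure.Sets)). Because hashKey is keyed by the deployment ID,
// the same object name always resolves to the same set for a given
// deployment, which is what lets getHashedSet locate objects without any
// central index.
func exampleObjectToSetMapping() {
	deploymentID := uuid.MustParse("11111111-2222-3333-4444-555555555555") // placeholder deployment ID
	const setCount = 4                                                     // placeholder number of erasure sets

	for _, object := range []string{"photos/a.jpg", "photos/b.jpg", "logs/2024/app.log"} {
		// Use one of the SIPMOD-based algorithms handled by hashKey above.
		idx := hashKey(formatErasureVersionV3DistributionAlgoV3, object, setCount, deploymentID)
		console.Debugln("object", object, "maps to set", idx+1, "of", setCount)
	}
}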