storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/cmd/erasure-sets.go

/*
 * MinIO Cloud Storage, (C) 2018-2019 MinIO, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cmd

import (
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"hash/crc32"
	"math/rand"
	"net/http"
	"sort"
	"sync"
	"time"

	"github.com/dchest/siphash"
	"github.com/dustin/go-humanize"
	"github.com/google/uuid"

	"github.com/minio/minio-go/v7/pkg/set"
	"github.com/minio/minio-go/v7/pkg/tags"

	"storj.io/minio/cmd/logger"
	"storj.io/minio/pkg/bpool"
	"storj.io/minio/pkg/console"
	"storj.io/minio/pkg/dsync"
	"storj.io/minio/pkg/env"
	"storj.io/minio/pkg/madmin"
	"storj.io/minio/pkg/sync/errgroup"
)

// setsDsyncLockers is encapsulated type for Close()
type setsDsyncLockers [][]dsync.NetLocker

const envMinioDeleteCleanupInterval = "MINIO_DELETE_CLEANUP_INTERVAL"

// erasureSets implements ObjectLayer combining a static list of erasure coded
// object sets. NOTE: There is no dynamic scaling allowed or intended in
// current design.
type erasureSets struct {
	GatewayUnsupported

	sets []*erasureObjects

	// Reference format.
	format *formatErasureV3

	// erasureDisks mutex to lock erasureDisks.
	erasureDisksMu sync.RWMutex

	// Re-ordered list of disks per set.
	erasureDisks [][]StorageAPI

	// Distributed locker clients.
	erasureLockers setsDsyncLockers

	// Distributed lock owner (constant per running instance).
	erasureLockOwner string

	// List of endpoints provided on the command line.
	endpoints Endpoints

	// String version of all the endpoints, an optimization
	// to avoid url.String() conversion taking CPU on
	// large disk setups.
	endpointStrings []string

	// Total number of sets and the number of disks per set.
	setCount, setDriveCount int
	defaultParityCount      int

	poolIndex int

	// A channel to send the set index to the MRF when
	// any disk belonging to that set is connected
	setReconnectEvent chan int

	// Distribution algorithm of choice.
	distributionAlgo string
	deploymentID     [16]byte

	disksStorageInfoCache timedValue

	mrfMU         sync.Mutex
	mrfOperations map[healSource]int
}

func isEndpointConnected(diskMap map[string]StorageAPI, endpoint string) bool {
	disk := diskMap[endpoint]
	if disk == nil {
		return false
	}
	return disk.IsOnline()
}

func (s *erasureSets) getDiskMap() map[string]StorageAPI {
	diskMap := make(map[string]StorageAPI)

	s.erasureDisksMu.RLock()
	defer s.erasureDisksMu.RUnlock()

	for i := 0; i < s.setCount; i++ {
		for j := 0; j < s.setDriveCount; j++ {
			disk := s.erasureDisks[i][j]
			if disk == OfflineDisk {
				continue
			}
			if !disk.IsOnline() {
				continue
			}
			diskMap[disk.String()] = disk
		}
	}
	return diskMap
}

// Initializes a new StorageAPI from the endpoint argument, returns the
// StorageAPI and also the `format` which exists on the disk.
func connectEndpoint(endpoint Endpoint) (StorageAPI, *formatErasureV3, error) {
	disk, err := newStorageAPIWithoutHealthCheck(endpoint)
	if err != nil {
		return nil, nil, err
	}

	format, err := loadFormatErasure(disk)
	if err != nil {
		if errors.Is(err, errUnformattedDisk) {
			info, derr := disk.DiskInfo(context.TODO())
			if derr != nil && info.RootDisk {
				return nil, nil, fmt.Errorf("Disk: %s returned %w", disk, derr) // make sure to use '%w' to wrap the error
			}
		}
		return nil, nil, fmt.Errorf("Disk: %s returned %w", disk, err) // make sure to use '%w' to wrap the error
	}

	return disk, format, nil
}

// findDiskIndexByDiskID - returns the i,j'th position of the input `diskID` against the reference
// format, after successful validation.
// - i'th position is the set index
// - j'th position is the disk index in the current set
func findDiskIndexByDiskID(refFormat *formatErasureV3, diskID string) (int, int, error) {
	if diskID == offlineDiskUUID {
		return -1, -1, fmt.Errorf("diskID: %s is offline", diskID)
	}
	for i := 0; i < len(refFormat.Erasure.Sets); i++ {
		for j := 0; j < len(refFormat.Erasure.Sets[0]); j++ {
			if refFormat.Erasure.Sets[i][j] == diskID {
				return i, j, nil
			}
		}
	}

	return -1, -1, fmt.Errorf("diskID: %s not found", diskID)
}

// findDiskIndex - returns the i,j'th position of the input `format` against the reference
// format, after successful validation.
// - i'th position is the set index
// - j'th position is the disk index in the current set
func findDiskIndex(refFormat, format *formatErasureV3) (int, int, error) {
	if err := formatErasureV3Check(refFormat, format); err != nil {
		return 0, 0, err
	}

	if format.Erasure.This == offlineDiskUUID {
		return -1, -1, fmt.Errorf("diskID: %s is offline", format.Erasure.This)
	}

	for i := 0; i < len(refFormat.Erasure.Sets); i++ {
		for j := 0; j < len(refFormat.Erasure.Sets[0]); j++ {
			if refFormat.Erasure.Sets[i][j] == format.Erasure.This {
				return i, j, nil
			}
		}
	}

	return -1, -1, fmt.Errorf("diskID: %s not found", format.Erasure.This)
}
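
// exampleLocateDisk is an illustrative sketch added for this write-up; it is not
// part of the original source. It shows how a diskID read from a drive's
// format.json is resolved against the reference format to the (set, disk)
// position it must occupy in s.erasureDisks[set][disk]. The function name and
// the blank assignments exist only for illustration.
func exampleLocateDisk(refFormat *formatErasureV3, diskID string) {
	setIndex, diskIndex, err := findDiskIndexByDiskID(refFormat, diskID)
	if err != nil {
		// Offline or unknown diskID: the drive is not part of this deployment.
		return
	}
	_ = setIndex  // i'th erasure set in the reference format
	_ = diskIndex // j'th drive within that set
}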

// connectDisks - attempt to connect all the endpoints, loads format
// and re-arranges the disks in proper position.
func (s *erasureSets) connectDisks() {
	var wg sync.WaitGroup
	var setsJustConnected = make([]bool, s.setCount)
	diskMap := s.getDiskMap()
	for _, endpoint := range s.endpoints {
		diskPath := endpoint.String()
		if endpoint.IsLocal {
			diskPath = endpoint.Path
		}
		if isEndpointConnected(diskMap, diskPath) {
			continue
		}
		wg.Add(1)
		go func(endpoint Endpoint) {
			defer wg.Done()
			disk, format, err := connectEndpoint(endpoint)
			if err != nil {
				if endpoint.IsLocal && errors.Is(err, errUnformattedDisk) {
					globalBackgroundHealState.pushHealLocalDisks(endpoint)
					logger.Info(fmt.Sprintf("Found unformatted drive %s, attempting to heal...", endpoint))
				} else {
					printEndpointError(endpoint, err, true)
				}
				return
			}
			if disk.IsLocal() && disk.Healing() != nil {
				globalBackgroundHealState.pushHealLocalDisks(disk.Endpoint())
				logger.Info(fmt.Sprintf("Found the drive %s that needs healing, attempting to heal...", disk))
			}
			s.erasureDisksMu.RLock()
			setIndex, diskIndex, err := findDiskIndex(s.format, format)
			s.erasureDisksMu.RUnlock()
			if err != nil {
				printEndpointError(endpoint, err, false)
				return
			}

			s.erasureDisksMu.Lock()
			if s.erasureDisks[setIndex][diskIndex] != nil {
				s.erasureDisks[setIndex][diskIndex].Close()
			}
			if disk.IsLocal() {
				disk.SetDiskID(format.Erasure.This)
				s.erasureDisks[setIndex][diskIndex] = disk
			} else {
				// Enable healthcheck disk for remote endpoint.
				disk, err = newStorageAPI(endpoint)
				if err != nil {
					printEndpointError(endpoint, err, false)
					return
				}
				disk.SetDiskID(format.Erasure.This)
				s.erasureDisks[setIndex][diskIndex] = disk
			}
			disk.SetDiskLoc(s.poolIndex, setIndex, diskIndex)
			s.endpointStrings[setIndex*s.setDriveCount+diskIndex] = disk.String()
			setsJustConnected[setIndex] = true
			s.erasureDisksMu.Unlock()
		}(endpoint)
	}

	wg.Wait()

	go func() {
		idler := time.NewTimer(100 * time.Millisecond)
		defer idler.Stop()

		for setIndex, justConnected := range setsJustConnected {
			if !justConnected {
				continue
			}

			// Send a new set connect event with a timeout
			idler.Reset(100 * time.Millisecond)
			select {
			case s.setReconnectEvent <- setIndex:
			case <-idler.C:
			}
		}
	}()
}

// monitorAndConnectEndpoints is a monitoring loop that keeps track of disconnected
// endpoints by reconnecting them and making sure to place them into the right position
// in the set topology; this monitoring happens at a given monitoring interval.
func (s *erasureSets) monitorAndConnectEndpoints(ctx context.Context, monitorInterval time.Duration) {
	r := rand.New(rand.NewSource(time.Now().UnixNano()))

	time.Sleep(time.Duration(r.Float64() * float64(time.Second)))

	// Pre-emptively connect the disks if possible.
	s.connectDisks()

	monitor := time.NewTimer(monitorInterval)
	defer monitor.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-monitor.C:
			// Reset the timer once fired for the required interval.
			monitor.Reset(monitorInterval)

			if serverDebugLog {
				console.Debugln("running disk monitoring")
			}

			s.connectDisks()
		}
	}
}

func (s *erasureSets) GetLockers(setIndex int) func() ([]dsync.NetLocker, string) {
	return func() ([]dsync.NetLocker, string) {
		lockers := make([]dsync.NetLocker, len(s.erasureLockers[setIndex]))
		copy(lockers, s.erasureLockers[setIndex])
		return lockers, s.erasureLockOwner
	}
}

func (s *erasureSets) GetEndpoints(setIndex int) func() []string {
	return func() []string {
		s.erasureDisksMu.RLock()
		defer s.erasureDisksMu.RUnlock()

		eps := make([]string, s.setDriveCount)
		for i := 0; i < s.setDriveCount; i++ {
			eps[i] = s.endpointStrings[setIndex*s.setDriveCount+i]
		}
		return eps
	}
}

// GetDisks returns a closure for a given set, which provides the list of disks per set.
func (s *erasureSets) GetDisks(setIndex int) func() []StorageAPI {
	return func() []StorageAPI {
		s.erasureDisksMu.RLock()
		defer s.erasureDisksMu.RUnlock()
		disks := make([]StorageAPI, s.setDriveCount)
		copy(disks, s.erasureDisks[setIndex])
		return disks
	}
}

// defaultMonitorConnectEndpointInterval is the interval to monitor endpoint connections.
// Must be bigger than defaultMonitorNewDiskInterval.
const defaultMonitorConnectEndpointInterval = defaultMonitorNewDiskInterval + time.Second*5

// Initialize new set of erasure coded sets.
func newErasureSets(ctx context.Context, endpoints Endpoints, storageDisks []StorageAPI, format *formatErasureV3, defaultParityCount, poolIdx int) (*erasureSets, error) {
	setCount := len(format.Erasure.Sets)
	setDriveCount := len(format.Erasure.Sets[0])

	endpointStrings := make([]string, len(endpoints))

	// Initialize the erasure sets instance.
	s := &erasureSets{
		sets:               make([]*erasureObjects, setCount),
		erasureDisks:       make([][]StorageAPI, setCount),
		erasureLockers:     make([][]dsync.NetLocker, setCount),
		erasureLockOwner:   globalLocalNodeName,
		endpoints:          endpoints,
		endpointStrings:    endpointStrings,
		setCount:           setCount,
		setDriveCount:      setDriveCount,
		defaultParityCount: defaultParityCount,
		format:             format,
		setReconnectEvent:  make(chan int),
		distributionAlgo:   format.Erasure.DistributionAlgo,
		deploymentID:       uuid.MustParse(format.ID),
		mrfOperations:      make(map[healSource]int),
		poolIndex:          poolIdx,
	}

	mutex := newNSLock(globalIsDistErasure)

	// Number of buffers, max 2GB
	n := (2 * humanize.GiByte) / (blockSizeV2 * 2)

	// Initialize byte pool once for all sets, bpool size is set to
	// setCount * setDriveCount with each memory up to blockSizeV2.
	bp := bpool.NewBytePoolCap(n, blockSizeV2, blockSizeV2*2)

	for i := 0; i < setCount; i++ {
		s.erasureDisks[i] = make([]StorageAPI, setDriveCount)
	}

	var erasureLockers = map[string]dsync.NetLocker{}
	for _, endpoint := range endpoints {
		if _, ok := erasureLockers[endpoint.Host]; !ok {
			erasureLockers[endpoint.Host] = newLockAPI(endpoint)
		}
	}

	for i := 0; i < setCount; i++ {
		var lockerEpSet = set.NewStringSet()
		for j := 0; j < setDriveCount; j++ {
			endpoint := endpoints[i*setDriveCount+j]
			// Only add one locker per endpoint host and per erasure set.
			if locker, ok := erasureLockers[endpoint.Host]; ok && !lockerEpSet.Contains(endpoint.Host) {
				lockerEpSet.Add(endpoint.Host)
				s.erasureLockers[i] = append(s.erasureLockers[i], locker)
			}
			disk := storageDisks[i*setDriveCount+j]
			if disk == nil {
				continue
			}
			diskID, derr := disk.GetDiskID()
			if derr != nil {
				continue
			}
			m, n, err := findDiskIndexByDiskID(format, diskID)
			if err != nil {
				continue
			}
			disk.SetDiskLoc(s.poolIndex, m, n)
			s.endpointStrings[m*setDriveCount+n] = disk.String()
			s.erasureDisks[m][n] = disk
		}

		// Initialize erasure objects for a given set.
		s.sets[i] = &erasureObjects{
			setIndex:              i,
			poolIndex:             poolIdx,
			setDriveCount:         setDriveCount,
			defaultParityCount:    defaultParityCount,
			getDisks:              s.GetDisks(i),
			getLockers:            s.GetLockers(i),
			getEndpoints:          s.GetEndpoints(i),
			deletedCleanupSleeper: newDynamicSleeper(10, 2*time.Second),
			nsMutex:               mutex,
			bp:                    bp,
			mrfOpCh:               make(chan partialOperation, 10000),
		}
	}

	// Cleanup the ".trash/" folder every 5 minutes with sufficient sleep cycles; between each
	// delete a dynamic sleeper is used with a factor of 10 ratio and a max delay between
	// deletes of 2 seconds.
	deletedObjectsCleanupInterval, err := time.ParseDuration(env.Get(envMinioDeleteCleanupInterval, "5m"))
	if err != nil {
		return nil, err
	}

	// Start the cleanup of stale uploads go-routine.
	go s.cleanupStaleUploads(ctx, GlobalStaleUploadsCleanupInterval, GlobalStaleUploadsExpiry)

	// Start the cleanup of deleted objects.
	go s.cleanupDeletedObjects(ctx, deletedObjectsCleanupInterval)

	// Start the disk monitoring and connect routine.
	go s.monitorAndConnectEndpoints(ctx, defaultMonitorConnectEndpointInterval)
	go s.maintainMRFList()
	go s.healMRFRoutine()

	return s, nil
}

func (s *erasureSets) cleanupDeletedObjects(ctx context.Context, cleanupInterval time.Duration) {
	timer := time.NewTimer(cleanupInterval)
	defer timer.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-timer.C:
			// Reset for the next interval
			timer.Reset(cleanupInterval)

			for _, set := range s.sets {
				set.cleanupDeletedObjects(ctx)
			}
		}
	}
}

func (s *erasureSets) cleanupStaleUploads(ctx context.Context, cleanupInterval, expiry time.Duration) {
	timer := time.NewTimer(cleanupInterval)
	defer timer.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-timer.C:
			// Reset for the next interval
			timer.Reset(cleanupInterval)

			for _, set := range s.sets {
				set.cleanupStaleUploads(ctx, expiry)
			}
		}
	}
}

const objectErasureMapKey = "objectErasureMap"

type auditObjectOp struct {
	Pool  int      `json:"poolId"`
	Set   int      `json:"setId"`
	Disks []string `json:"disks"`
}

func auditObjectErasureSet(ctx context.Context, object string, set *erasureObjects) {
	if len(logger.AuditTargets) == 0 {
		return
	}

	object = decodeDirObject(object)

	op := auditObjectOp{
		Pool:  set.poolIndex + 1,
		Set:   set.setIndex + 1,
		Disks: set.getEndpoints(),
	}

	var objectErasureSetTag map[string]auditObjectOp
	reqInfo := logger.GetReqInfo(ctx)
	for _, kv := range reqInfo.GetTags() {
		if kv.Key == objectErasureMapKey {
			objectErasureSetTag = kv.Val.(map[string]auditObjectOp)
			break
		}
	}

	if objectErasureSetTag == nil {
		objectErasureSetTag = make(map[string]auditObjectOp)
	}

	objectErasureSetTag[object] = op
	reqInfo.SetTags(objectErasureMapKey, objectErasureSetTag)
}

// NewNSLock - initialize a new namespace RWLocker instance.
func (s *erasureSets) NewNSLock(bucket string, objects ...string) RWLocker {
	if len(objects) == 1 {
		return s.getHashedSet(objects[0]).NewNSLock(bucket, objects...)
	}
	return s.getHashedSet("").NewNSLock(bucket, objects...)
}

// SetDriveCount returns the current drives per set.
func (s *erasureSets) SetDriveCount() int {
	return s.setDriveCount
}

// ParityCount returns the default parity count used while erasure
// coding objects.
func (s *erasureSets) ParityCount() int {
	return s.defaultParityCount
}

// StorageUsageInfo - combines output of StorageInfo across all erasure coded object sets.
// This only returns disk usage info for ServerPools to perform placement decisions; this
// call is not implemented in the Object interface and is not meant to be used by other
// object layer implementations.
func (s *erasureSets) StorageUsageInfo(ctx context.Context) StorageInfo {
	storageUsageInfo := func() StorageInfo {
		var storageInfo StorageInfo
		storageInfos := make([]StorageInfo, len(s.sets))
		storageInfo.Backend.Type = madmin.Erasure

		g := errgroup.WithNErrs(len(s.sets))
		for index := range s.sets {
			index := index
			g.Go(func() error {
				// ignoring errors on purpose
				storageInfos[index], _ = s.sets[index].StorageInfo(ctx)
				return nil
			}, index)
		}

		// Wait for the go routines.
		g.Wait()

		for _, lstorageInfo := range storageInfos {
			storageInfo.Disks = append(storageInfo.Disks, lstorageInfo.Disks...)
		}

		return storageInfo
	}

	s.disksStorageInfoCache.Once.Do(func() {
		s.disksStorageInfoCache.TTL = time.Second
		s.disksStorageInfoCache.Update = func() (interface{}, error) {
			return storageUsageInfo(), nil
		}
	})

	v, _ := s.disksStorageInfoCache.Get()
	return v.(StorageInfo)
}

// StorageInfo - combines output of StorageInfo across all erasure coded object sets.
func (s *erasureSets) StorageInfo(ctx context.Context) (StorageInfo, []error) {
	var storageInfo madmin.StorageInfo

	storageInfos := make([]madmin.StorageInfo, len(s.sets))
	storageInfoErrs := make([][]error, len(s.sets))

	g := errgroup.WithNErrs(len(s.sets))
	for index := range s.sets {
		index := index
		g.Go(func() error {
			storageInfos[index], storageInfoErrs[index] = s.sets[index].StorageInfo(ctx)
			return nil
		}, index)
	}

	// Wait for the go routines.
	g.Wait()

	for _, lstorageInfo := range storageInfos {
		storageInfo.Disks = append(storageInfo.Disks, lstorageInfo.Disks...)
	}

	errs := make([]error, 0, len(s.sets)*s.setDriveCount)
	for i := range s.sets {
		errs = append(errs, storageInfoErrs[i]...)
	}

	return storageInfo, errs
}
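
// exampleCachedStorageUsage is an illustrative sketch added for this write-up; it
// is not part of the original source. It isolates the timedValue pattern used by
// StorageUsageInfo above: the expensive per-set fan-out is installed once as the
// Update callback, and Get() then serves a cached snapshot, which is expected to
// be refreshed only once the one-second TTL lapses. The function name and the
// compute parameter are assumptions made for the example.
func exampleCachedStorageUsage(compute func() StorageInfo) func() StorageInfo {
	var cache timedValue
	cache.Once.Do(func() {
		cache.TTL = time.Second
		cache.Update = func() (interface{}, error) {
			return compute(), nil
		}
	})
	return func() StorageInfo {
		// Repeated calls within the TTL observe the same cached snapshot.
		v, _ := cache.Get()
		return v.(StorageInfo)
	}
}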

// LocalStorageInfo - combines output of local StorageInfo across all erasure coded object sets.
func (s *erasureSets) LocalStorageInfo(ctx context.Context) (StorageInfo, []error) {
	var storageInfo StorageInfo

	storageInfos := make([]StorageInfo, len(s.sets))
	storageInfoErrs := make([][]error, len(s.sets))

	g := errgroup.WithNErrs(len(s.sets))
	for index := range s.sets {
		index := index
		g.Go(func() error {
			storageInfos[index], storageInfoErrs[index] = s.sets[index].LocalStorageInfo(ctx)
			return nil
		}, index)
	}

	// Wait for the go routines.
	g.Wait()

	for _, lstorageInfo := range storageInfos {
		storageInfo.Disks = append(storageInfo.Disks, lstorageInfo.Disks...)
	}

	var errs []error
	for i := range s.sets {
		errs = append(errs, storageInfoErrs[i]...)
	}

	return storageInfo, errs
}

// Shutdown shuts down all erasure coded sets in parallel
// and returns an error upon the first error.
func (s *erasureSets) Shutdown(ctx context.Context) error {
	g := errgroup.WithNErrs(len(s.sets))

	for index := range s.sets {
		index := index
		g.Go(func() error {
			return s.sets[index].Shutdown(ctx)
		}, index)
	}

	for _, err := range g.Wait() {
		if err != nil {
			return err
		}
	}
	select {
	case _, ok := <-s.setReconnectEvent:
		if ok {
			close(s.setReconnectEvent)
		}
	default:
		close(s.setReconnectEvent)
	}
	return nil
}

// MakeBucketWithLocation - creates a new bucket across all sets simultaneously,
// then returns the first encountered error.
func (s *erasureSets) MakeBucketWithLocation(ctx context.Context, bucket string, opts BucketOptions) error {
	g := errgroup.WithNErrs(len(s.sets))

	// Create buckets in parallel across all sets.
	for index := range s.sets {
		index := index
		g.Go(func() error {
			return s.sets[index].MakeBucketWithLocation(ctx, bucket, opts)
		}, index)
	}

	errs := g.Wait()

	// Return the first encountered error
	for _, err := range errs {
		if err != nil {
			return err
		}
	}

	// Success.
	return nil
}

// hashKey (below) hashes the key, returning an integer based on the input algorithm.
// The following algorithms are currently supported:
// - CRCMOD
// - SIPMOD
// - all new algos.
func sipHashMod(key string, cardinality int, id [16]byte) int {
	if cardinality <= 0 {
		return -1
	}
	// use the faster version as per siphash docs
	// https://github.com/dchest/siphash#usage
	k0, k1 := binary.LittleEndian.Uint64(id[0:8]), binary.LittleEndian.Uint64(id[8:16])
	sum64 := siphash.Hash(k0, k1, []byte(key))
	return int(sum64 % uint64(cardinality))
}

func crcHashMod(key string, cardinality int) int {
	if cardinality <= 0 {
		return -1
	}
	keyCrc := crc32.Checksum([]byte(key), crc32.IEEETable)
	return int(keyCrc % uint32(cardinality))
}

func hashKey(algo string, key string, cardinality int, id [16]byte) int {
	switch algo {
	case formatErasureVersionV2DistributionAlgoV1:
		return crcHashMod(key, cardinality)
	case formatErasureVersionV3DistributionAlgoV2, formatErasureVersionV3DistributionAlgoV3:
		return sipHashMod(key, cardinality, id)
	default:
		// An unknown algorithm returns -1, as does a cardinality less than or equal to 0.
		return -1
	}
}
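
// exampleObjectPlacement is an illustrative sketch added for this write-up; it is
// not part of the original source. With the SIPMOD distribution algorithms the
// deployment ID is used as the SipHash key, so a given object name always maps to
// the same set index for a deployment, no matter which node computes it. The
// function name and the sample key are assumptions made for the example.
func exampleObjectPlacement(deploymentID [16]byte, setCount int) int {
	// Returns a stable value in [0, setCount) for this key and deployment ID
	// (or -1 when setCount <= 0).
	return sipHashMod("mybucket/photos/2021/01/01.jpg", setCount, deploymentID)
}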

// Always returns the same erasure coded set for a given input.
func (s *erasureSets) getHashedSetIndex(input string) int {
	return hashKey(s.distributionAlgo, input, len(s.sets), s.deploymentID)
}

// Always returns the same erasure coded set for a given input.
func (s *erasureSets) getHashedSet(input string) (set *erasureObjects) {
	return s.sets[s.getHashedSetIndex(input)]
}

// GetBucketInfo - returns bucket info from one of the erasure coded sets.
func (s *erasureSets) GetBucketInfo(ctx context.Context, bucket string) (bucketInfo BucketInfo, err error) {
	return s.getHashedSet("").GetBucketInfo(ctx, bucket)
}

// IsNotificationSupported returns whether bucket notification is applicable for this layer.
func (s *erasureSets) IsNotificationSupported() bool {
	return s.getHashedSet("").IsNotificationSupported()
}

// IsListenSupported returns whether listen bucket notification is applicable for this layer.
func (s *erasureSets) IsListenSupported() bool {
	return true
}

// IsEncryptionSupported returns whether server side encryption is implemented for this layer.
func (s *erasureSets) IsEncryptionSupported() bool {
	return s.getHashedSet("").IsEncryptionSupported()
}

// IsCompressionSupported returns whether compression is applicable for this layer.
func (s *erasureSets) IsCompressionSupported() bool {
	return s.getHashedSet("").IsCompressionSupported()
}

func (s *erasureSets) IsTaggingSupported() bool {
	return true
}

// DeleteBucket - deletes a bucket on all sets simultaneously;
// even if one of the sets fails to delete the bucket, we proceed to
// undo the successful operations.
func (s *erasureSets) DeleteBucket(ctx context.Context, bucket string, forceDelete bool) error {
	g := errgroup.WithNErrs(len(s.sets))

	// Delete buckets in parallel across all sets.
	for index := range s.sets {
		index := index
		g.Go(func() error {
			return s.sets[index].DeleteBucket(ctx, bucket, forceDelete)
		}, index)
	}

	errs := g.Wait()
	// For any failure, we attempt to undo all the delete bucket operations
	// by creating the bucket again on all sets from which it was successfully deleted.
	for _, err := range errs {
		if err != nil {
			undoDeleteBucketSets(ctx, bucket, s.sets, errs)
			return err
		}
	}

	// Delete all bucket metadata.
	deleteBucketMetadata(ctx, s, bucket)

	// Success.
	return nil
}

// This function is used to undo a successful DeleteBucket operation.
func undoDeleteBucketSets(ctx context.Context, bucket string, sets []*erasureObjects, errs []error) {
	g := errgroup.WithNErrs(len(sets))

	// Undo previous delete bucket on all underlying sets.
	for index := range sets {
		index := index
		g.Go(func() error {
			if errs[index] == nil {
				return sets[index].MakeBucketWithLocation(ctx, bucket, BucketOptions{})
			}
			return nil
		}, index)
	}

	g.Wait()
}

// List all buckets from one of the sets; we are not doing a merge
// sort here for simplicity. As per design it is assumed
// that all buckets are present on all sets.
func (s *erasureSets) ListBuckets(ctx context.Context) (buckets []BucketInfo, err error) {
	var listBuckets []BucketInfo
	var healBuckets = map[string]VolInfo{}
	for _, set := range s.sets {
		// lists all unique buckets across drives.
		if err := listAllBuckets(ctx, set.getDisks(), healBuckets); err != nil {
			return nil, err
		}
	}

	for _, v := range healBuckets {
		listBuckets = append(listBuckets, BucketInfo(v))
	}

	sort.Slice(listBuckets, func(i, j int) bool {
		return listBuckets[i].Name < listBuckets[j].Name
	})

	return listBuckets, nil
}

// --- Object Operations ---

// GetObjectNInfo - returns object info and locked object ReadCloser
func (s *erasureSets) GetObjectNInfo(ctx context.Context, bucket, object string, rs *HTTPRangeSpec, h http.Header, lockType LockType, opts ObjectOptions) (gr *GetObjectReader, err error) {
	set := s.getHashedSet(object)
	auditObjectErasureSet(ctx, object, set)
	return set.GetObjectNInfo(ctx, bucket, object, rs, h, lockType, opts)
}

func (s *erasureSets) parentDirIsObject(ctx context.Context, bucket, parent string) bool {
	if parent == "." {
		return false
	}
	return s.getHashedSet(parent).parentDirIsObject(ctx, bucket, parent)
}

// PutObject - writes an object to hashedSet based on the object name.
func (s *erasureSets) PutObject(ctx context.Context, bucket string, object string, data *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) {
	set := s.getHashedSet(object)
	auditObjectErasureSet(ctx, object, set)
	opts.ParentIsObject = s.parentDirIsObject
	return set.PutObject(ctx, bucket, object, data, opts)
}

// GetObjectInfo - reads object metadata from the hashedSet based on the object name.
func (s *erasureSets) GetObjectInfo(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) {
	set := s.getHashedSet(object)
	auditObjectErasureSet(ctx, object, set)
	return set.GetObjectInfo(ctx, bucket, object, opts)
}

// DeleteObject - deletes an object from the hashedSet based on the object name.
func (s *erasureSets) DeleteObject(ctx context.Context, bucket string, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) {
	set := s.getHashedSet(object)
	auditObjectErasureSet(ctx, object, set)
	return set.DeleteObject(ctx, bucket, object, opts)
}

// DeleteObjects - bulk delete of objects.
// Bulk delete is only possible within one set. For that purpose
// objects are grouped by set first, and then bulk delete is invoked
// for each set; the error response of each delete is returned.
func (s *erasureSets) DeleteObjects(ctx context.Context, bucket string, objects []ObjectToDelete, opts ObjectOptions) ([]DeletedObject, []error) {
	type delObj struct {
		// Set index associated to this object
		setIndex int
		// Original index from the list of arguments
		// where this object is passed
		origIndex int
		// object to delete
		object ObjectToDelete
	}

	// Transform []delObj to the list of object names
	toNames := func(delObjs []delObj) []ObjectToDelete {
		objs := make([]ObjectToDelete, len(delObjs))
		for i, obj := range delObjs {
			objs[i] = obj.object
		}
		return objs
	}

	// The result of the delete operation on all passed objects
	var delErrs = make([]error, len(objects))

	// The result of the delete objects
	var delObjects = make([]DeletedObject, len(objects))

	// A map between a set and its associated objects
	var objSetMap = make(map[int][]delObj)

	// Group objects by set index
	for i, object := range objects {
		index := s.getHashedSetIndex(object.ObjectName)
		objSetMap[index] = append(objSetMap[index], delObj{setIndex: index, origIndex: i, object: object})
	}

	// Invoke bulk delete on objects per set and save
	// the result of the delete operation
	for _, objsGroup := range objSetMap {
		set := s.getHashedSet(objsGroup[0].object.ObjectName)
		dobjects, errs := set.DeleteObjects(ctx, bucket, toNames(objsGroup), opts)
		for i, obj := range objsGroup {
			delErrs[obj.origIndex] = errs[i]
			delObjects[obj.origIndex] = dobjects[i]
			if errs[i] == nil {
				auditObjectErasureSet(ctx, obj.object.ObjectName, set)
			}
		}
	}

	return delObjects, delErrs
}

// CopyObject - copies objects from one hashedSet to another hashedSet, on the server side.
func (s *erasureSets) CopyObject(ctx context.Context, srcBucket, srcObject, dstBucket, dstObject string, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (objInfo ObjectInfo, err error) {
	srcSet := s.getHashedSet(srcObject)
	dstSet := s.getHashedSet(dstObject)

	auditObjectErasureSet(ctx, dstObject, dstSet)

	cpSrcDstSame := srcSet == dstSet
	// Check if this request is only a metadata update.
	if cpSrcDstSame && srcInfo.metadataOnly {
		// Version ID is set for the destination and source == destination version ID:
		// perform an in-place update.
		if dstOpts.VersionID != "" && srcOpts.VersionID == dstOpts.VersionID {
			return srcSet.CopyObject(ctx, srcBucket, srcObject, dstBucket, dstObject, srcInfo, srcOpts, dstOpts)
		}
		// Destination is not versioned and source version ID is empty:
		// perform an in-place update.
		if !dstOpts.Versioned && srcOpts.VersionID == "" {
			return srcSet.CopyObject(ctx, srcBucket, srcObject, dstBucket, dstObject, srcInfo, srcOpts, dstOpts)
		}
		// CopyObject optimization where we don't create an entire copy
		// of the content; instead we add a reference. We disallow legacy
		// objects from being self-referenced in this manner, so make sure
		// that we actually create a new dataDir for legacy objects.
		if dstOpts.Versioned && srcOpts.VersionID != dstOpts.VersionID && !srcInfo.Legacy {
			srcInfo.versionOnly = true
			return srcSet.CopyObject(ctx, srcBucket, srcObject, dstBucket, dstObject, srcInfo, srcOpts, dstOpts)
		}
	}

	putOpts := ObjectOptions{
		ServerSideEncryption: dstOpts.ServerSideEncryption,
		UserDefined:          srcInfo.UserDefined,
		Versioned:            dstOpts.Versioned,
		VersionID:            dstOpts.VersionID,
		MTime:                dstOpts.MTime,
	}

	return dstSet.putObject(ctx, dstBucket, dstObject, srcInfo.PutObjReader, putOpts)
}

func (s *erasureSets) ListMultipartUploads(ctx context.Context, bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (result ListMultipartsInfo, err error) {
	// In list multipart uploads we are going to treat the input prefix as the object,
	// which means that we are not supporting directory navigation.
	set := s.getHashedSet(prefix)
	auditObjectErasureSet(ctx, prefix, set)
	return set.ListMultipartUploads(ctx, bucket, prefix, keyMarker, uploadIDMarker, delimiter, maxUploads)
}

// Initiate a new multipart upload on a hashedSet based on the object name.
func (s *erasureSets) NewMultipartUpload(ctx context.Context, bucket, object string, opts ObjectOptions) (uploadID string, err error) {
	set := s.getHashedSet(object)
	auditObjectErasureSet(ctx, object, set)
	return set.NewMultipartUpload(ctx, bucket, object, opts)
}

// Copies a part of an object from a source hashedSet to a destination hashedSet.
func (s *erasureSets) CopyObjectPart(ctx context.Context, srcBucket, srcObject, destBucket, destObject string, uploadID string, partID int,
	startOffset int64, length int64, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (partInfo PartInfo, err error) {
	destSet := s.getHashedSet(destObject)
	auditObjectErasureSet(ctx, destObject, destSet)
	return destSet.PutObjectPart(ctx, destBucket, destObject, uploadID, partID, NewPutObjReader(srcInfo.Reader), dstOpts)
}

// PutObjectPart - writes a part of an object to the hashedSet based on the object name.
func (s *erasureSets) PutObjectPart(ctx context.Context, bucket, object, uploadID string, partID int, data *PutObjReader, opts ObjectOptions) (info PartInfo, err error) {
	set := s.getHashedSet(object)
	auditObjectErasureSet(ctx, object, set)
	return set.PutObjectPart(ctx, bucket, object, uploadID, partID, data, opts)
}

// GetMultipartInfo - returns multipart metadata info uploaded at the hashedSet.
func (s *erasureSets) GetMultipartInfo(ctx context.Context, bucket, object, uploadID string, opts ObjectOptions) (result MultipartInfo, err error) {
	set := s.getHashedSet(object)
	auditObjectErasureSet(ctx, object, set)
	return set.GetMultipartInfo(ctx, bucket, object, uploadID, opts)
}

// ListObjectParts - lists all uploaded parts of an object in the hashedSet.
func (s *erasureSets) ListObjectParts(ctx context.Context, bucket, object, uploadID string, partNumberMarker int, maxParts int, opts ObjectOptions) (result ListPartsInfo, err error) {
	set := s.getHashedSet(object)
	auditObjectErasureSet(ctx, object, set)
	return set.ListObjectParts(ctx, bucket, object, uploadID, partNumberMarker, maxParts, opts)
}
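
// exampleMultipartSameSet is an illustrative sketch added for this write-up; it is
// not part of the original source. It highlights why the multipart handlers above
// work: the owning set is derived from the object name (and deployment ID) alone,
// never from the uploadID, so upload initiation, every part upload and the final
// completion all resolve to the same erasure set. The method name is an assumption
// made for the example.
func (s *erasureSets) exampleMultipartSameSet(object string) int {
	// Stable for the lifetime of the upload, regardless of which API call asks.
	return s.getHashedSetIndex(object)
}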

// Aborts an in-progress multipart operation on hashedSet based on the object name.
func (s *erasureSets) AbortMultipartUpload(ctx context.Context, bucket, object, uploadID string, opts ObjectOptions) error {
	set := s.getHashedSet(object)
	auditObjectErasureSet(ctx, object, set)
	return set.AbortMultipartUpload(ctx, bucket, object, uploadID, opts)
}

// CompleteMultipartUpload - completes a pending multipart transaction, on hashedSet based on object name.
func (s *erasureSets) CompleteMultipartUpload(ctx context.Context, bucket, object, uploadID string, uploadedParts []CompletePart, opts ObjectOptions) (objInfo ObjectInfo, err error) {
	set := s.getHashedSet(object)
	auditObjectErasureSet(ctx, object, set)
	opts.ParentIsObject = s.parentDirIsObject
	return set.CompleteMultipartUpload(ctx, bucket, object, uploadID, uploadedParts, opts)
}

/*

All disks online
-----------------
- All Unformatted - format all and return success.
- Some Unformatted - format all and return success.
- Any JBOD inconsistent - return failure
- Some are corrupt (missing format.json) - return failure
- Any unrecognized disks - return failure

Some disks are offline and we have quorum.
-----------------
- Some unformatted - format all and return success,
  treat disks offline as corrupted.
- Any JBOD inconsistent - return failure
- Some are corrupt (missing format.json)
- Any unrecognized disks - return failure

No read quorum
-----------------
failure for all cases.

// Pseudo code for managing `format.json`.

// Generic checks.
if (no quorum) return error
if (any disk is corrupt) return error // Always error
if (jbod inconsistent) return error // Always error.
if (disks not recognized) // Always error.

// Specific checks.
if (all disks online)
   if (all disks return format.json)
      if (jbod consistent)
         if (all disks recognized)
            return
   else
      if (all disks return format.json not found)
         return error
      else (some disks return format.json not found)
         (heal format)
         return
      fi
   fi
else
   if (some disks return format.json not found)
      // Offline disks are marked as dead.
      (heal format) // Offline disks should be marked as dead.
      return success
   fi
fi
*/

func formatsToDrivesInfo(endpoints Endpoints, formats []*formatErasureV3, sErrs []error) (beforeDrives []madmin.HealDriveInfo) {
	beforeDrives = make([]madmin.HealDriveInfo, len(endpoints))
	// Existing formats are available (i.e. ok), so save it in
	// result, also populate disks to be healed.
	for i, format := range formats {
		drive := endpoints.GetString(i)
		var state = madmin.DriveStateCorrupt
		switch {
		case format != nil:
			state = madmin.DriveStateOk
		case sErrs[i] == errUnformattedDisk:
			state = madmin.DriveStateMissing
		case sErrs[i] == errDiskNotFound:
			state = madmin.DriveStateOffline
		}
		beforeDrives[i] = madmin.HealDriveInfo{
			UUID: func() string {
				if format != nil {
					return format.Erasure.This
				}
				return ""
			}(),
			Endpoint: drive,
			State:    state,
		}
	}

	return beforeDrives
}
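
// exampleFormatReadQuorum is an illustrative, simplified sketch added for this
// write-up; it is not part of the original source and is not how
// getFormatErasureInQuorum is implemented. It only restates the read-quorum rule
// from the pseudo code above: healing of `format.json` can proceed only when more
// than half of the drives report a usable format. The function name is an
// assumption made for the example.
func exampleFormatReadQuorum(formats []*formatErasureV3) bool {
	usable := 0
	for _, f := range formats {
		if f != nil {
			usable++
		}
	}
	return usable >= len(formats)/2+1
}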

// If it is a single node Erasure setup and all disks are root disks, it is most likely
// a test setup; else it is a production setup.
// On a test setup we allow creation of format.json on root disks to help with dev/testing.
func isTestSetup(infos []DiskInfo, errs []error) bool {
	rootDiskCount := 0
	for i := range errs {
		if errs[i] == nil || errs[i] == errUnformattedDisk {
			if infos[i].RootDisk {
				rootDiskCount++
			}
		}
	}
	// It is a test setup if all disks are root disks in quorum.
	return rootDiskCount >= len(infos)/2+1
}

func getHealDiskInfos(storageDisks []StorageAPI, errs []error) ([]DiskInfo, []error) {
	infos := make([]DiskInfo, len(storageDisks))
	g := errgroup.WithNErrs(len(storageDisks))
	for index := range storageDisks {
		index := index
		g.Go(func() error {
			if errs[index] != nil && errs[index] != errUnformattedDisk {
				return errs[index]
			}
			if storageDisks[index] == nil {
				return errDiskNotFound
			}
			var err error
			infos[index], err = storageDisks[index].DiskInfo(context.TODO())
			return err
		}, index)
	}
	return infos, g.Wait()
}

// Mark root disks as down so as not to heal them.
func markRootDisksAsDown(storageDisks []StorageAPI, errs []error) {
	var infos []DiskInfo
	infos, errs = getHealDiskInfos(storageDisks, errs)
	if !isTestSetup(infos, errs) {
		for i := range storageDisks {
			if storageDisks[i] != nil && infos[i].RootDisk {
				// We should not heal on a root disk, i.e. in a situation where the minio administrator
				// has unmounted a defective drive we should not heal a path on the root disk.
				logger.Info("Disk `%s` is the same as the system root disk.\n"+
					"Disk will not be used. Please supply a separate disk and restart the server.",
					storageDisks[i].String())
				storageDisks[i] = nil
			}
		}
	}
}

// HealFormat - heals missing `format.json` on fresh unformatted disks.
func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.HealResultItem, err error) {
	storageDisks, errs := initStorageDisksWithErrorsWithoutHealthCheck(s.endpoints)
	for i, derr := range errs {
		if derr != nil && derr != errDiskNotFound {
			return madmin.HealResultItem{}, fmt.Errorf("Disk %s: %w", s.endpoints[i], derr)
		}
	}

	defer func(storageDisks []StorageAPI) {
		if err != nil {
			closeStorageDisks(storageDisks)
		}
	}(storageDisks)

	formats, sErrs := loadFormatErasureAll(storageDisks, true)
	if err = checkFormatErasureValues(formats, storageDisks, s.setDriveCount); err != nil {
		return madmin.HealResultItem{}, err
	}

	// Mark all root disks down
	markRootDisksAsDown(storageDisks, sErrs)

	refFormat, err := getFormatErasureInQuorum(formats)
	if err != nil {
		return res, err
	}

	// Prepare heal-result
	res = madmin.HealResultItem{
		Type:      madmin.HealItemMetadata,
		Detail:    "disk-format",
		DiskCount: s.setCount * s.setDriveCount,
		SetCount:  s.setCount,
	}

	// Fetch all the drive info status.
	beforeDrives := formatsToDrivesInfo(s.endpoints, formats, sErrs)

	res.After.Drives = make([]madmin.HealDriveInfo, len(beforeDrives))
	res.Before.Drives = make([]madmin.HealDriveInfo, len(beforeDrives))
	// Copy the "after" drive state from "before" as well.
	for k, v := range beforeDrives {
		res.Before.Drives[k] = v
		res.After.Drives[k] = v
	}

	if countErrs(sErrs, errUnformattedDisk) == 0 {
		return res, errNoHealRequired
	}

	// Initialize a new set of set formats which will be written to disk.
	newFormatSets := newHealFormatSets(refFormat, s.setCount, s.setDriveCount, formats, sErrs)

	if !dryRun {
		var tmpNewFormats = make([]*formatErasureV3, s.setCount*s.setDriveCount)
		for i := range newFormatSets {
			for j := range newFormatSets[i] {
				if newFormatSets[i][j] == nil {
					continue
				}
				res.After.Drives[i*s.setDriveCount+j].UUID = newFormatSets[i][j].Erasure.This
				res.After.Drives[i*s.setDriveCount+j].State = madmin.DriveStateOk
				tmpNewFormats[i*s.setDriveCount+j] = newFormatSets[i][j]
			}
		}

		// Save new formats `format.json` on unformatted disks.
		if err = saveUnformattedFormat(ctx, storageDisks, tmpNewFormats); err != nil {
			return madmin.HealResultItem{}, err
		}

		s.erasureDisksMu.Lock()

		for index, format := range tmpNewFormats {
			if format == nil {
				continue
			}

			m, n, err := findDiskIndexByDiskID(refFormat, format.Erasure.This)
			if err != nil {
				continue
			}

			if s.erasureDisks[m][n] != nil {
				s.erasureDisks[m][n].Close()
			}
			storageDisks[index].SetDiskLoc(s.poolIndex, m, n)
			s.erasureDisks[m][n] = storageDisks[index]
			s.endpointStrings[m*s.setDriveCount+n] = storageDisks[index].String()
		}

		// Replace reference format with what was loaded from disks.
		s.format = refFormat

		s.erasureDisksMu.Unlock()
	}

	return res, nil
}

// HealBucket - heals inconsistent buckets and bucket metadata on all sets.
func (s *erasureSets) HealBucket(ctx context.Context, bucket string, opts madmin.HealOpts) (result madmin.HealResultItem, err error) {
	// Initialize heal result info
	result = madmin.HealResultItem{
		Type:      madmin.HealItemBucket,
		Bucket:    bucket,
		DiskCount: s.setCount * s.setDriveCount,
		SetCount:  s.setCount,
	}

	for _, set := range s.sets {
		var healResult madmin.HealResultItem
		healResult, err = set.HealBucket(ctx, bucket, opts)
		if err != nil {
			return result, toObjectErr(err, bucket)
		}
		result.Before.Drives = append(result.Before.Drives, healResult.Before.Drives...)
		result.After.Drives = append(result.After.Drives, healResult.After.Drives...)
	}

	// Check if we had quorum to write, if not return an appropriate error.
	_, afterDriveOnline := result.GetOnlineCounts()
	if afterDriveOnline < ((s.setCount*s.setDriveCount)/2)+1 {
		return result, toObjectErr(errErasureWriteQuorum, bucket)
	}

	return result, nil
}
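
// exampleWriteQuorum is an illustrative sketch added for this write-up; it is not
// part of the original source. It restates the write-quorum check at the end of
// HealBucket above: after healing, strictly more than half of all drives in the
// deployment (setCount * setDriveCount) must be online, otherwise
// errErasureWriteQuorum is returned. The function name is an assumption made for
// the example.
func exampleWriteQuorum(setCount, setDriveCount, onlineDrives int) bool {
	return onlineDrives >= (setCount*setDriveCount)/2+1
}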

// HealObject - heals an inconsistent object on a hashedSet based on the object name.
func (s *erasureSets) HealObject(ctx context.Context, bucket, object, versionID string, opts madmin.HealOpts) (madmin.HealResultItem, error) {
	return s.getHashedSet(object).HealObject(ctx, bucket, object, versionID, opts)
}

// PutObjectMetadata - replace or add metadata to an existing object/version
func (s *erasureSets) PutObjectMetadata(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) {
	er := s.getHashedSet(object)
	return er.PutObjectMetadata(ctx, bucket, object, opts)
}

// PutObjectTags - replace or add tags to an existing object
func (s *erasureSets) PutObjectTags(ctx context.Context, bucket, object string, tags string, opts ObjectOptions) (ObjectInfo, error) {
	er := s.getHashedSet(object)
	return er.PutObjectTags(ctx, bucket, object, tags, opts)
}

// DeleteObjectTags - delete object tags from an existing object
func (s *erasureSets) DeleteObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) {
	er := s.getHashedSet(object)
	return er.DeleteObjectTags(ctx, bucket, object, opts)
}

// GetObjectTags - get object tags from an existing object
func (s *erasureSets) GetObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (*tags.Tags, error) {
	er := s.getHashedSet(object)
	return er.GetObjectTags(ctx, bucket, object, opts)
}

// maintainMRFList gathers the list of successful partial uploads
// from all underlying er.sets and puts them in a global map which
// should not have more than 10000 entries.
func (s *erasureSets) maintainMRFList() {
	var agg = make(chan partialOperation, 10000)
	for i, er := range s.sets {
		go func(c <-chan partialOperation, setIndex int) {
			for msg := range c {
				msg.failedSet = setIndex
				select {
				case agg <- msg:
				default:
				}
			}
		}(er.mrfOpCh, i)
	}

	for fOp := range agg {
		s.mrfMU.Lock()
		if len(s.mrfOperations) > 10000 {
			s.mrfMU.Unlock()
			continue
		}
		s.mrfOperations[healSource{
			bucket:    fOp.bucket,
			object:    fOp.object,
			versionID: fOp.versionID,
			opts:      &madmin.HealOpts{Remove: true},
		}] = fOp.failedSet
		s.mrfMU.Unlock()
	}
}

// healMRFRoutine monitors new disk connections and sweeps the MRF list
// to find objects related to the newly connected disk that need to be healed.
func (s *erasureSets) healMRFRoutine() {
	// Wait until background heal state is initialized
	bgSeq := mustGetHealSequence(GlobalContext)

	for setIndex := range s.setReconnectEvent {
		// Get the list of objects related to the er.set
		// to which the connected disk belongs.
		var mrfOperations []healSource
		s.mrfMU.Lock()
		for k, v := range s.mrfOperations {
			if v == setIndex {
				mrfOperations = append(mrfOperations, k)
			}
		}
		s.mrfMU.Unlock()

		// Heal objects
		for _, u := range mrfOperations {
			waitForLowHTTPReq(globalHealConfig.IOCount, globalHealConfig.Sleep)

			// Send an object to background heal
			bgSeq.sourceCh <- u

			s.mrfMU.Lock()
			delete(s.mrfOperations, u)
			s.mrfMU.Unlock()
		}
	}
}