github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure-object.go (about) 1 // Copyright (c) 2015-2021 MinIO, Inc. 2 // 3 // This file is part of MinIO Object Storage stack 4 // 5 // This program is free software: you can redistribute it and/or modify 6 // it under the terms of the GNU Affero General Public License as published by 7 // the Free Software Foundation, either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU Affero General Public License for more details. 14 // 15 // You should have received a copy of the GNU Affero General Public License 16 // along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 package cmd 19 20 import ( 21 "bytes" 22 "context" 23 "errors" 24 "fmt" 25 "io" 26 "net/http" 27 "path" 28 "runtime" 29 "strconv" 30 "strings" 31 "sync" 32 "time" 33 34 "github.com/dustin/go-humanize" 35 "github.com/klauspost/readahead" 36 "github.com/minio/madmin-go/v3" 37 "github.com/minio/minio-go/v7/pkg/tags" 38 "github.com/minio/minio/internal/bucket/lifecycle" 39 "github.com/minio/minio/internal/bucket/object/lock" 40 "github.com/minio/minio/internal/bucket/replication" 41 "github.com/minio/minio/internal/config/storageclass" 42 "github.com/minio/minio/internal/crypto" 43 "github.com/minio/minio/internal/event" 44 "github.com/minio/minio/internal/hash" 45 xhttp "github.com/minio/minio/internal/http" 46 xioutil "github.com/minio/minio/internal/ioutil" 47 "github.com/minio/minio/internal/logger" 48 "github.com/minio/pkg/v2/mimedb" 49 "github.com/minio/pkg/v2/sync/errgroup" 50 "github.com/minio/pkg/v2/wildcard" 51 ) 52 53 // list all errors which can be ignored in object operations. 54 var objectOpIgnoredErrs = append(baseIgnoredErrs, errDiskAccessDenied, errUnformattedDisk, errDiskOngoingReq) 55 56 // Object Operations 57 58 func countOnlineDisks(onlineDisks []StorageAPI) (online int) { 59 for _, onlineDisk := range onlineDisks { 60 if onlineDisk != nil && onlineDisk.IsOnline() { 61 online++ 62 } 63 } 64 return online 65 } 66 67 // CopyObject - copy object source object to destination object. 68 // if source object and destination object are same we only 69 // update metadata. 70 func (er erasureObjects) CopyObject(ctx context.Context, srcBucket, srcObject, dstBucket, dstObject string, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (oi ObjectInfo, err error) { 71 if !dstOpts.NoAuditLog { 72 auditObjectErasureSet(ctx, dstObject, &er) 73 } 74 75 // This call shouldn't be used for anything other than metadata updates or adding self referential versions. 76 if !srcInfo.metadataOnly { 77 return oi, NotImplemented{} 78 } 79 80 if !dstOpts.NoLock { 81 lk := er.NewNSLock(dstBucket, dstObject) 82 lkctx, err := lk.GetLock(ctx, globalOperationTimeout) 83 if err != nil { 84 return oi, err 85 } 86 ctx = lkctx.Context() 87 defer lk.Unlock(lkctx) 88 } 89 // Read metadata associated with the object from all disks. 90 storageDisks := er.getDisks() 91 92 var metaArr []FileInfo 93 var errs []error 94 95 // Read metadata associated with the object from all disks. 96 if srcOpts.VersionID != "" { 97 metaArr, errs = readAllFileInfo(ctx, storageDisks, "", srcBucket, srcObject, srcOpts.VersionID, true, false) 98 } else { 99 metaArr, errs = readAllXL(ctx, storageDisks, srcBucket, srcObject, true, false, true) 100 } 101 102 readQuorum, writeQuorum, err := objectQuorumFromMeta(ctx, metaArr, errs, er.defaultParityCount) 103 if err != nil { 104 if errors.Is(err, errErasureReadQuorum) && !strings.HasPrefix(srcBucket, minioMetaBucket) { 105 _, derr := er.deleteIfDangling(context.Background(), srcBucket, srcObject, metaArr, errs, nil, srcOpts) 106 if derr != nil { 107 err = derr 108 } 109 } 110 return ObjectInfo{}, toObjectErr(err, srcBucket, srcObject) 111 } 112 113 // List all online disks. 114 onlineDisks, modTime, etag := listOnlineDisks(storageDisks, metaArr, errs, readQuorum) 115 116 // Pick latest valid metadata. 117 fi, err := pickValidFileInfo(ctx, metaArr, modTime, etag, readQuorum) 118 if err != nil { 119 return oi, toObjectErr(err, srcBucket, srcObject) 120 } 121 if fi.Deleted { 122 if srcOpts.VersionID == "" { 123 return oi, toObjectErr(errFileNotFound, srcBucket, srcObject) 124 } 125 return fi.ToObjectInfo(srcBucket, srcObject, srcOpts.Versioned || srcOpts.VersionSuspended), toObjectErr(errMethodNotAllowed, srcBucket, srcObject) 126 } 127 128 filterOnlineDisksInplace(fi, metaArr, onlineDisks) 129 130 versionID := srcInfo.VersionID 131 if srcInfo.versionOnly { 132 versionID = dstOpts.VersionID 133 // preserve destination versionId if specified. 134 if versionID == "" { 135 versionID = mustGetUUID() 136 fi.IsLatest = true // we are creating a new version so this is latest. 137 } 138 } 139 140 modTime = UTCNow() // We only preserve modTime if dstOpts.MTime is true. 141 // in all other cases mtime is latest. 142 143 fi.VersionID = versionID // set any new versionID we might have created 144 fi.ModTime = modTime // set modTime for the new versionID 145 if !dstOpts.MTime.IsZero() { 146 modTime = dstOpts.MTime 147 fi.ModTime = dstOpts.MTime 148 } 149 150 fi.Metadata = srcInfo.UserDefined 151 srcInfo.UserDefined["etag"] = srcInfo.ETag 152 153 inlineData := fi.InlineData() 154 freeVersionID := fi.TierFreeVersionID() 155 freeVersionMarker := fi.TierFreeVersion() 156 157 // Update `xl.meta` content on each disks. 158 for index := range metaArr { 159 if metaArr[index].IsValid() { 160 metaArr[index].ModTime = modTime 161 metaArr[index].VersionID = versionID 162 if !metaArr[index].InlineData() { 163 // If the data is not inlined, we may end up incorrectly 164 // inlining the data here, that leads to an inconsistent 165 // situation where some objects are were not inlined 166 // were now inlined, make sure to `nil` the Data such 167 // that xl.meta is written as expected. 168 metaArr[index].Data = nil 169 } 170 metaArr[index].Metadata = srcInfo.UserDefined 171 // Preserve existing values 172 if inlineData { 173 metaArr[index].SetInlineData() 174 } 175 if freeVersionID != "" { 176 metaArr[index].SetTierFreeVersionID(freeVersionID) 177 } 178 if freeVersionMarker { 179 metaArr[index].SetTierFreeVersion() 180 } 181 } 182 } 183 184 // Write unique `xl.meta` for each disk. 185 if _, err = writeUniqueFileInfo(ctx, onlineDisks, "", srcBucket, srcObject, metaArr, writeQuorum); err != nil { 186 return oi, toObjectErr(err, srcBucket, srcObject) 187 } 188 189 return fi.ToObjectInfo(srcBucket, srcObject, srcOpts.Versioned || srcOpts.VersionSuspended), nil 190 } 191 192 // GetObjectNInfo - returns object info and an object 193 // Read(Closer). When err != nil, the returned reader is always nil. 194 func (er erasureObjects) GetObjectNInfo(ctx context.Context, bucket, object string, rs *HTTPRangeSpec, h http.Header, opts ObjectOptions) (gr *GetObjectReader, err error) { 195 if !opts.NoAuditLog { 196 auditObjectErasureSet(ctx, object, &er) 197 } 198 199 var unlockOnDefer bool 200 nsUnlocker := func() {} 201 defer func() { 202 if unlockOnDefer { 203 nsUnlocker() 204 } 205 }() 206 207 // Acquire lock 208 if !opts.NoLock { 209 lock := er.NewNSLock(bucket, object) 210 lkctx, err := lock.GetRLock(ctx, globalOperationTimeout) 211 if err != nil { 212 return nil, err 213 } 214 ctx = lkctx.Context() 215 216 // Release lock when the metadata is verified, and reader 217 // is ready to be read. 218 // 219 // This is possible to be lock free because 220 // - xl.meta for inlined objects has already read the data 221 // into memory, any mutation on xl.meta subsequently is 222 // inconsequential to the overall read operation. 223 // - xl.meta metadata is still verified for quorum under lock() 224 // however writing the response doesn't need to serialize 225 // concurrent writers 226 unlockOnDefer = true 227 nsUnlocker = func() { lock.RUnlock(lkctx) } 228 } 229 230 fi, metaArr, onlineDisks, err := er.getObjectFileInfo(ctx, bucket, object, opts, true) 231 if err != nil { 232 return nil, toObjectErr(err, bucket, object) 233 } 234 235 objInfo := fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended) 236 if objInfo.DeleteMarker { 237 if opts.VersionID == "" { 238 return &GetObjectReader{ 239 ObjInfo: objInfo, 240 }, toObjectErr(errFileNotFound, bucket, object) 241 } 242 // Make sure to return object info to provide extra information. 243 return &GetObjectReader{ 244 ObjInfo: objInfo, 245 }, toObjectErr(errMethodNotAllowed, bucket, object) 246 } 247 248 // Set NoDecryption for SSE-C objects and if replication request 249 if crypto.SSEC.IsEncrypted(objInfo.UserDefined) && opts.ReplicationRequest { 250 opts.NoDecryption = true 251 } 252 253 if objInfo.IsRemote() { 254 gr, err := getTransitionedObjectReader(ctx, bucket, object, rs, h, objInfo, opts) 255 if err != nil { 256 return nil, err 257 } 258 unlockOnDefer = false 259 return gr.WithCleanupFuncs(nsUnlocker), nil 260 } 261 262 if objInfo.Size == 0 { 263 // Zero byte objects don't even need to further initialize pipes etc. 264 return NewGetObjectReaderFromReader(bytes.NewReader(nil), objInfo, opts) 265 } 266 267 fn, off, length, err := NewGetObjectReader(rs, objInfo, opts) 268 if err != nil { 269 return nil, err 270 } 271 272 if unlockOnDefer { 273 unlockOnDefer = fi.InlineData() 274 } 275 276 pr, pw := xioutil.WaitPipe() 277 go func() { 278 pw.CloseWithError(er.getObjectWithFileInfo(ctx, bucket, object, off, length, pw, fi, metaArr, onlineDisks)) 279 }() 280 281 // Cleanup function to cause the go routine above to exit, in 282 // case of incomplete read. 283 pipeCloser := func() { 284 pr.CloseWithError(nil) 285 } 286 287 if !unlockOnDefer { 288 return fn(pr, h, pipeCloser, nsUnlocker) 289 } 290 291 return fn(pr, h, pipeCloser) 292 } 293 294 func (er erasureObjects) getObjectWithFileInfo(ctx context.Context, bucket, object string, startOffset int64, length int64, writer io.Writer, fi FileInfo, metaArr []FileInfo, onlineDisks []StorageAPI) error { 295 // Reorder online disks based on erasure distribution order. 296 // Reorder parts metadata based on erasure distribution order. 297 onlineDisks, metaArr = shuffleDisksAndPartsMetadataByIndex(onlineDisks, metaArr, fi) 298 299 // For negative length read everything. 300 if length < 0 { 301 length = fi.Size - startOffset 302 } 303 304 // Reply back invalid range if the input offset and length fall out of range. 305 if startOffset > fi.Size || startOffset+length > fi.Size { 306 return InvalidRange{startOffset, length, fi.Size} 307 } 308 309 // Get start part index and offset. 310 partIndex, partOffset, err := fi.ObjectToPartOffset(ctx, startOffset) 311 if err != nil { 312 return InvalidRange{startOffset, length, fi.Size} 313 } 314 315 // Calculate endOffset according to length 316 endOffset := startOffset 317 if length > 0 { 318 endOffset += length - 1 319 } 320 321 // Get last part index to read given length. 322 lastPartIndex, _, err := fi.ObjectToPartOffset(ctx, endOffset) 323 if err != nil { 324 return InvalidRange{startOffset, length, fi.Size} 325 } 326 327 var totalBytesRead int64 328 erasure, err := NewErasure(ctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize) 329 if err != nil { 330 return toObjectErr(err, bucket, object) 331 } 332 333 var healOnce sync.Once 334 335 for ; partIndex <= lastPartIndex; partIndex++ { 336 if length == totalBytesRead { 337 break 338 } 339 340 partNumber := fi.Parts[partIndex].Number 341 342 // Save the current part name and size. 343 partSize := fi.Parts[partIndex].Size 344 345 partLength := partSize - partOffset 346 // partLength should be adjusted so that we don't write more data than what was requested. 347 if partLength > (length - totalBytesRead) { 348 partLength = length - totalBytesRead 349 } 350 351 tillOffset := erasure.ShardFileOffset(partOffset, partLength, partSize) 352 // Get the checksums of the current part. 353 readers := make([]io.ReaderAt, len(onlineDisks)) 354 prefer := make([]bool, len(onlineDisks)) 355 for index, disk := range onlineDisks { 356 if disk == OfflineDisk { 357 continue 358 } 359 if !metaArr[index].IsValid() { 360 continue 361 } 362 if !metaArr[index].Erasure.Equal(fi.Erasure) { 363 continue 364 } 365 checksumInfo := metaArr[index].Erasure.GetChecksumInfo(partNumber) 366 partPath := pathJoin(object, metaArr[index].DataDir, fmt.Sprintf("part.%d", partNumber)) 367 readers[index] = newBitrotReader(disk, metaArr[index].Data, bucket, partPath, tillOffset, 368 checksumInfo.Algorithm, checksumInfo.Hash, erasure.ShardSize()) 369 370 // Prefer local disks 371 prefer[index] = disk.Hostname() == "" 372 } 373 374 written, err := erasure.Decode(ctx, writer, readers, partOffset, partLength, partSize, prefer) 375 // Note: we should not be defer'ing the following closeBitrotReaders() call as 376 // we are inside a for loop i.e if we use defer, we would accumulate a lot of open files by the time 377 // we return from this function. 378 closeBitrotReaders(readers) 379 if err != nil { 380 // If we have successfully written all the content that was asked 381 // by the client, but we still see an error - this would mean 382 // that we have some parts or data blocks missing or corrupted 383 // - attempt a heal to successfully heal them for future calls. 384 if written == partLength { 385 var scan madmin.HealScanMode 386 switch { 387 case errors.Is(err, errFileNotFound): 388 scan = madmin.HealNormalScan 389 case errors.Is(err, errFileCorrupt): 390 scan = madmin.HealDeepScan 391 } 392 switch scan { 393 case madmin.HealNormalScan, madmin.HealDeepScan: 394 healOnce.Do(func() { 395 globalMRFState.addPartialOp(partialOperation{ 396 bucket: bucket, 397 object: object, 398 versionID: fi.VersionID, 399 queued: time.Now(), 400 setIndex: er.setIndex, 401 poolIndex: er.poolIndex, 402 scanMode: scan, 403 }) 404 }) 405 // Healing is triggered and we have written 406 // successfully the content to client for 407 // the specific part, we should `nil` this error 408 // and proceed forward, instead of throwing errors. 409 err = nil 410 } 411 } 412 if err != nil { 413 return toObjectErr(err, bucket, object) 414 } 415 } 416 for i, r := range readers { 417 if r == nil { 418 onlineDisks[i] = OfflineDisk 419 } 420 } 421 // Track total bytes read from disk and written to the client. 422 totalBytesRead += partLength 423 // partOffset will be valid only for the first part, hence reset it to 0 for 424 // the remaining parts. 425 partOffset = 0 426 } // End of read all parts loop. 427 // Return success. 428 return nil 429 } 430 431 // GetObjectInfo - reads object metadata and replies back ObjectInfo. 432 func (er erasureObjects) GetObjectInfo(ctx context.Context, bucket, object string, opts ObjectOptions) (info ObjectInfo, err error) { 433 if !opts.NoAuditLog { 434 auditObjectErasureSet(ctx, object, &er) 435 } 436 437 if !opts.NoLock { 438 // Lock the object before reading. 439 lk := er.NewNSLock(bucket, object) 440 lkctx, err := lk.GetRLock(ctx, globalOperationTimeout) 441 if err != nil { 442 return ObjectInfo{}, err 443 } 444 ctx = lkctx.Context() 445 defer lk.RUnlock(lkctx) 446 } 447 448 return er.getObjectInfo(ctx, bucket, object, opts) 449 } 450 451 func auditDanglingObjectDeletion(ctx context.Context, bucket, object, versionID string, tags map[string]interface{}) { 452 if len(logger.AuditTargets()) == 0 { 453 return 454 } 455 456 opts := AuditLogOptions{ 457 Event: "DeleteDanglingObject", 458 Bucket: bucket, 459 Object: object, 460 VersionID: versionID, 461 Tags: tags, 462 } 463 464 auditLogInternal(ctx, opts) 465 } 466 467 func joinErrs(errs []error) []string { 468 s := make([]string, len(errs)) 469 for i := range s { 470 if errs[i] == nil { 471 s[i] = "<nil>" 472 } else { 473 s[i] = errs[i].Error() 474 } 475 } 476 return s 477 } 478 479 func (er erasureObjects) deleteIfDangling(ctx context.Context, bucket, object string, metaArr []FileInfo, errs []error, dataErrs []error, opts ObjectOptions) (FileInfo, error) { 480 var err error 481 m, ok := isObjectDangling(metaArr, errs, dataErrs) 482 if ok { 483 tags := make(map[string]interface{}, 4) 484 tags["set"] = er.setIndex 485 tags["pool"] = er.poolIndex 486 tags["merrs"] = joinErrs(errs) 487 tags["derrs"] = joinErrs(dataErrs) 488 if m.IsValid() { 489 tags["size"] = m.Size 490 tags["mtime"] = m.ModTime.Format(http.TimeFormat) 491 tags["data"] = m.Erasure.DataBlocks 492 tags["parity"] = m.Erasure.ParityBlocks 493 } else { 494 tags["invalid-meta"] = true 495 tags["data"] = er.setDriveCount - er.defaultParityCount 496 tags["parity"] = er.defaultParityCount 497 } 498 499 // count the number of offline disks 500 offline := 0 501 for i := 0; i < max(len(errs), len(dataErrs)); i++ { 502 if i < len(errs) && errors.Is(errs[i], errDiskNotFound) || i < len(dataErrs) && errors.Is(dataErrs[i], errDiskNotFound) { 503 offline++ 504 } 505 } 506 if offline > 0 { 507 tags["offline"] = offline 508 } 509 510 _, file, line, cok := runtime.Caller(1) 511 if cok { 512 tags["caller"] = fmt.Sprintf("%s:%d", file, line) 513 } 514 515 defer auditDanglingObjectDeletion(ctx, bucket, object, m.VersionID, tags) 516 517 err = errFileNotFound 518 if opts.VersionID != "" { 519 err = errFileVersionNotFound 520 } 521 522 fi := FileInfo{ 523 VersionID: m.VersionID, 524 } 525 if opts.VersionID != "" { 526 fi.VersionID = opts.VersionID 527 } 528 fi.SetTierFreeVersionID(mustGetUUID()) 529 disks := er.getDisks() 530 g := errgroup.WithNErrs(len(disks)) 531 for index := range disks { 532 index := index 533 g.Go(func() error { 534 if disks[index] == nil { 535 return errDiskNotFound 536 } 537 return disks[index].DeleteVersion(ctx, bucket, object, fi, false, DeleteOptions{}) 538 }, index) 539 } 540 541 rmDisks := make(map[string]string, len(disks)) 542 for index, err := range g.Wait() { 543 var errStr, diskName string 544 if err != nil { 545 errStr = err.Error() 546 } else { 547 errStr = "<nil>" 548 } 549 if disks[index] != nil { 550 diskName = disks[index].String() 551 } else { 552 diskName = fmt.Sprintf("disk-%d", index) 553 } 554 rmDisks[diskName] = errStr 555 } 556 tags["cleanupResult"] = rmDisks 557 } 558 return m, err 559 } 560 561 func fileInfoFromRaw(ri RawFileInfo, bucket, object string, readData, inclFreeVers, allParts bool) (FileInfo, error) { 562 var xl xlMetaV2 563 if err := xl.LoadOrConvert(ri.Buf); err != nil { 564 return FileInfo{}, err 565 } 566 567 fi, err := xl.ToFileInfo(bucket, object, "", inclFreeVers, allParts) 568 if err != nil { 569 return FileInfo{}, err 570 } 571 572 if !fi.IsValid() { 573 return FileInfo{}, errCorruptedFormat 574 } 575 576 versionID := fi.VersionID 577 if versionID == "" { 578 versionID = nullVersionID 579 } 580 581 fileInfo, err := xl.ToFileInfo(bucket, object, versionID, inclFreeVers, allParts) 582 if err != nil { 583 return FileInfo{}, err 584 } 585 586 if readData { 587 fileInfo.Data = xl.data.find(versionID) 588 } 589 590 return fileInfo, nil 591 } 592 593 func readAllRawFileInfo(ctx context.Context, disks []StorageAPI, bucket, object string, readData bool) ([]RawFileInfo, []error) { 594 rawFileInfos := make([]RawFileInfo, len(disks)) 595 g := errgroup.WithNErrs(len(disks)) 596 for index := range disks { 597 index := index 598 g.Go(func() (err error) { 599 if disks[index] == nil { 600 return errDiskNotFound 601 } 602 rf, err := disks[index].ReadXL(ctx, bucket, object, readData) 603 if err != nil { 604 return err 605 } 606 rawFileInfos[index] = rf 607 return nil 608 }, index) 609 } 610 611 return rawFileInfos, g.Wait() 612 } 613 614 func pickLatestQuorumFilesInfo(ctx context.Context, rawFileInfos []RawFileInfo, errs []error, bucket, object string, readData, inclFreeVers, allParts bool) ([]FileInfo, []error) { 615 metadataArray := make([]*xlMetaV2, len(rawFileInfos)) 616 metaFileInfos := make([]FileInfo, len(rawFileInfos)) 617 metadataShallowVersions := make([][]xlMetaV2ShallowVersion, len(rawFileInfos)) 618 var v2bufs [][]byte 619 if !readData { 620 v2bufs = make([][]byte, len(rawFileInfos)) 621 } 622 623 // Read `xl.meta` in parallel across disks. 624 for index := range rawFileInfos { 625 rf := rawFileInfos[index] 626 if rf.Buf == nil { 627 continue 628 } 629 if !readData { 630 // Save the buffer so we can reuse it. 631 v2bufs[index] = rf.Buf 632 } 633 634 var xl xlMetaV2 635 if err := xl.LoadOrConvert(rf.Buf); err != nil { 636 errs[index] = err 637 continue 638 } 639 metadataArray[index] = &xl 640 metaFileInfos[index] = FileInfo{} 641 } 642 643 for index := range metadataArray { 644 if metadataArray[index] != nil { 645 metadataShallowVersions[index] = metadataArray[index].versions 646 } 647 } 648 649 readQuorum := (len(rawFileInfos) + 1) / 2 650 meta := &xlMetaV2{versions: mergeXLV2Versions(readQuorum, false, 1, metadataShallowVersions...)} 651 lfi, err := meta.ToFileInfo(bucket, object, "", inclFreeVers, allParts) 652 if err != nil { 653 for i := range errs { 654 if errs[i] == nil { 655 errs[i] = err 656 } 657 } 658 return metaFileInfos, errs 659 } 660 if !lfi.IsValid() { 661 for i := range errs { 662 if errs[i] == nil { 663 errs[i] = errCorruptedFormat 664 } 665 } 666 return metaFileInfos, errs 667 } 668 669 versionID := lfi.VersionID 670 if versionID == "" { 671 versionID = nullVersionID 672 } 673 674 for index := range metadataArray { 675 if metadataArray[index] == nil { 676 continue 677 } 678 679 // make sure to preserve this for diskmtime based healing bugfix. 680 metaFileInfos[index], errs[index] = metadataArray[index].ToFileInfo(bucket, object, versionID, inclFreeVers, allParts) 681 if errs[index] != nil { 682 continue 683 } 684 685 if readData { 686 metaFileInfos[index].Data = metadataArray[index].data.find(versionID) 687 } 688 } 689 if !readData { 690 for i := range v2bufs { 691 metaDataPoolPut(v2bufs[i]) 692 } 693 } 694 695 // Return all the metadata. 696 return metaFileInfos, errs 697 } 698 699 // Checking if an object is dangling costs some IOPS; hence implementing this function 700 // which decides which condition it is useful to check if an object is dangling 701 // 702 // errs: errors from reading xl.meta in all disks 703 // err: reduced errs 704 // bucket: the object name in question 705 func shouldCheckForDangling(err error, errs []error, bucket string) bool { 706 // Avoid data in .minio.sys for now 707 if bucket == minioMetaBucket { 708 return false 709 } 710 switch { 711 // Check if we have a read quorum issue 712 case errors.Is(err, errErasureReadQuorum): 713 return true 714 // Check if the object is inexistent in most disks but not all of them 715 case errors.Is(err, errFileNotFound) || errors.Is(err, errFileVersionNotFound): 716 for i := range errs { 717 if errs[i] == nil { 718 return true 719 } 720 } 721 } 722 return false 723 } 724 725 func readAllXL(ctx context.Context, disks []StorageAPI, bucket, object string, readData, inclFreeVers, allParts bool) ([]FileInfo, []error) { 726 rawFileInfos, errs := readAllRawFileInfo(ctx, disks, bucket, object, readData) 727 return pickLatestQuorumFilesInfo(ctx, rawFileInfos, errs, bucket, object, readData, inclFreeVers, allParts) 728 } 729 730 func (er erasureObjects) getObjectFileInfo(ctx context.Context, bucket, object string, opts ObjectOptions, readData bool) (FileInfo, []FileInfo, []StorageAPI, error) { 731 rawArr := make([]RawFileInfo, er.setDriveCount) 732 metaArr := make([]FileInfo, er.setDriveCount) 733 errs := make([]error, er.setDriveCount) 734 for i := range errs { 735 errs[i] = errDiskOngoingReq 736 } 737 738 done := make(chan bool, er.setDriveCount) 739 disks := er.getDisks() 740 741 ropts := ReadOptions{ 742 ReadData: readData, 743 Healing: false, 744 } 745 746 mrfCheck := make(chan FileInfo) 747 defer xioutil.SafeClose(mrfCheck) 748 749 var rw sync.Mutex 750 751 // Ask for all disks first; 752 go func() { 753 ctx, cancel := context.WithCancel(ctx) 754 defer cancel() 755 756 wg := sync.WaitGroup{} 757 for i, disk := range disks { 758 if disk == nil { 759 done <- false 760 continue 761 } 762 if !disk.IsOnline() { 763 done <- false 764 continue 765 } 766 wg.Add(1) 767 go func(i int, disk StorageAPI) { 768 defer wg.Done() 769 770 var ( 771 fi FileInfo 772 rfi RawFileInfo 773 err error 774 ) 775 776 if opts.VersionID != "" { 777 // Read a specific version ID 778 fi, err = disk.ReadVersion(ctx, "", bucket, object, opts.VersionID, ropts) 779 } else { 780 // Read the latest version 781 rfi, err = disk.ReadXL(ctx, bucket, object, readData) 782 if err == nil { 783 fi, err = fileInfoFromRaw(rfi, bucket, object, readData, opts.InclFreeVersions, true) 784 } 785 } 786 787 rw.Lock() 788 rawArr[i] = rfi 789 metaArr[i], errs[i] = fi, err 790 rw.Unlock() 791 792 done <- err == nil 793 }(i, disk) 794 } 795 796 wg.Wait() 797 xioutil.SafeClose(done) 798 799 fi, ok := <-mrfCheck 800 if !ok { 801 return 802 } 803 804 if fi.Deleted { 805 return 806 } 807 808 // if one of the disk is offline, return right here no need 809 // to attempt a heal on the object. 810 if countErrs(errs, errDiskNotFound) > 0 { 811 return 812 } 813 814 var missingBlocks int 815 for i := range errs { 816 if IsErr(errs[i], 817 errFileNotFound, 818 errFileVersionNotFound, 819 errFileCorrupt, 820 ) { 821 missingBlocks++ 822 } 823 } 824 825 // if missing metadata can be reconstructed, attempt to reconstruct. 826 // additionally do not heal delete markers inline, let them be 827 // healed upon regular heal process. 828 if missingBlocks > 0 && missingBlocks < fi.Erasure.DataBlocks { 829 globalMRFState.addPartialOp(partialOperation{ 830 bucket: fi.Volume, 831 object: fi.Name, 832 versionID: fi.VersionID, 833 queued: time.Now(), 834 setIndex: er.setIndex, 835 poolIndex: er.poolIndex, 836 }) 837 } 838 839 return 840 }() 841 842 validResp := 0 843 totalResp := 0 844 845 // minDisks value is only to reduce the number of calls 846 // to the disks; this value is not accurate because we do 847 // not know the storage class of the object yet 848 minDisks := 0 849 if p := globalStorageClass.GetParityForSC(""); p > -1 { 850 minDisks = er.setDriveCount - p 851 } else { 852 minDisks = er.setDriveCount - er.defaultParityCount 853 } 854 855 calcQuorum := func(metaArr []FileInfo, errs []error) (FileInfo, []FileInfo, []StorageAPI, time.Time, string, error) { 856 readQuorum, _, err := objectQuorumFromMeta(ctx, metaArr, errs, er.defaultParityCount) 857 if err != nil { 858 return FileInfo{}, nil, nil, time.Time{}, "", err 859 } 860 if err := reduceReadQuorumErrs(ctx, errs, objectOpIgnoredErrs, readQuorum); err != nil { 861 return FileInfo{}, nil, nil, time.Time{}, "", err 862 } 863 onlineDisks, modTime, etag := listOnlineDisks(disks, metaArr, errs, readQuorum) 864 fi, err := pickValidFileInfo(ctx, metaArr, modTime, etag, readQuorum) 865 if err != nil { 866 return FileInfo{}, nil, nil, time.Time{}, "", err 867 } 868 869 onlineMeta := make([]FileInfo, len(metaArr)) 870 for i, disk := range onlineDisks { 871 if disk != nil { 872 onlineMeta[i] = metaArr[i] 873 } 874 } 875 876 return fi, onlineMeta, onlineDisks, modTime, etag, nil 877 } 878 879 var ( 880 modTime time.Time 881 etag string 882 fi FileInfo 883 onlineMeta []FileInfo 884 onlineDisks []StorageAPI 885 err error 886 ) 887 888 for success := range done { 889 totalResp++ 890 if success { 891 validResp++ 892 } 893 if totalResp < er.setDriveCount { 894 if !opts.FastGetObjInfo { 895 continue 896 } 897 if validResp < minDisks { 898 continue 899 } 900 } 901 902 rw.Lock() 903 if opts.VersionID == "" && totalResp == er.setDriveCount { 904 fi, onlineMeta, onlineDisks, modTime, etag, err = calcQuorum(pickLatestQuorumFilesInfo(ctx, 905 rawArr, errs, bucket, object, readData, opts.InclFreeVersions, true)) 906 } else { 907 fi, onlineMeta, onlineDisks, modTime, etag, err = calcQuorum(metaArr, errs) 908 } 909 rw.Unlock() 910 if err == nil && fi.InlineData() { 911 break 912 } 913 } 914 915 if err != nil { 916 // We can only look for dangling if we received all the responses, if we did 917 // not we simply ignore it, since we can't tell for sure if its dangling object. 918 if totalResp == er.setDriveCount && shouldCheckForDangling(err, errs, bucket) { 919 _, derr := er.deleteIfDangling(context.Background(), bucket, object, metaArr, errs, nil, opts) 920 if derr != nil { 921 err = derr 922 } 923 } 924 return fi, nil, nil, toObjectErr(err, bucket, object) 925 } 926 927 if !fi.Deleted && len(fi.Erasure.Distribution) != len(onlineDisks) { 928 err := fmt.Errorf("unexpected file distribution (%v) from online disks (%v), looks like backend disks have been manually modified refusing to heal %s/%s(%s)", 929 fi.Erasure.Distribution, onlineDisks, bucket, object, opts.VersionID) 930 logger.LogOnceIf(ctx, err, "get-object-file-info-manually-modified") 931 return fi, nil, nil, toObjectErr(err, bucket, object, opts.VersionID) 932 } 933 934 filterOnlineDisksInplace(fi, onlineMeta, onlineDisks) 935 for i := range onlineMeta { 936 // verify metadata is valid, it has similar erasure info 937 // as well as common modtime, if modtime is not possible 938 // verify if it has common "etag" at least. 939 if onlineMeta[i].IsValid() && onlineMeta[i].Erasure.Equal(fi.Erasure) { 940 ok := onlineMeta[i].ModTime.Equal(modTime) 941 if modTime.IsZero() || modTime.Equal(timeSentinel) { 942 ok = etag != "" && etag == fi.Metadata["etag"] 943 } 944 if ok { 945 continue 946 } 947 } // in all other cases metadata is corrupt, do not read from it. 948 949 onlineMeta[i] = FileInfo{} 950 onlineDisks[i] = nil 951 } 952 953 select { 954 case mrfCheck <- fi.ShallowCopy(): 955 case <-ctx.Done(): 956 return fi, onlineMeta, onlineDisks, toObjectErr(ctx.Err(), bucket, object) 957 } 958 959 return fi, onlineMeta, onlineDisks, nil 960 } 961 962 // getObjectInfo - wrapper for reading object metadata and constructs ObjectInfo. 963 func (er erasureObjects) getObjectInfo(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) { 964 fi, _, _, err := er.getObjectFileInfo(ctx, bucket, object, opts, false) 965 if err != nil { 966 return objInfo, toObjectErr(err, bucket, object) 967 } 968 objInfo = fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended) 969 if fi.Deleted { 970 if opts.VersionID == "" || opts.DeleteMarker { 971 return objInfo, toObjectErr(errFileNotFound, bucket, object) 972 } 973 // Make sure to return object info to provide extra information. 974 return objInfo, toObjectErr(errMethodNotAllowed, bucket, object) 975 } 976 977 return objInfo, nil 978 } 979 980 // getObjectInfoAndQuorum - wrapper for reading object metadata and constructs ObjectInfo, additionally returns write quorum for the object. 981 func (er erasureObjects) getObjectInfoAndQuorum(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, wquorum int, err error) { 982 fi, _, _, err := er.getObjectFileInfo(ctx, bucket, object, opts, false) 983 if err != nil { 984 return objInfo, er.defaultWQuorum(), toObjectErr(err, bucket, object) 985 } 986 987 wquorum = fi.WriteQuorum(er.defaultWQuorum()) 988 989 objInfo = fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended) 990 if !fi.VersionPurgeStatus().Empty() && opts.VersionID != "" { 991 // Make sure to return object info to provide extra information. 992 return objInfo, wquorum, toObjectErr(errMethodNotAllowed, bucket, object) 993 } 994 995 if fi.Deleted { 996 if opts.VersionID == "" || opts.DeleteMarker { 997 return objInfo, wquorum, toObjectErr(errFileNotFound, bucket, object) 998 } 999 // Make sure to return object info to provide extra information. 1000 return objInfo, wquorum, toObjectErr(errMethodNotAllowed, bucket, object) 1001 } 1002 1003 return objInfo, wquorum, nil 1004 } 1005 1006 // Similar to rename but renames data from srcEntry to dstEntry at dataDir 1007 func renameData(ctx context.Context, disks []StorageAPI, srcBucket, srcEntry string, metadata []FileInfo, dstBucket, dstEntry string, writeQuorum int) ([]StorageAPI, bool, error) { 1008 g := errgroup.WithNErrs(len(disks)) 1009 1010 fvID := mustGetUUID() 1011 for index := range disks { 1012 metadata[index].SetTierFreeVersionID(fvID) 1013 } 1014 1015 diskVersions := make([]uint64, len(disks)) 1016 // Rename file on all underlying storage disks. 1017 for index := range disks { 1018 index := index 1019 g.Go(func() error { 1020 if disks[index] == nil { 1021 return errDiskNotFound 1022 } 1023 1024 // Pick one FileInfo for a disk at index. 1025 fi := metadata[index] 1026 // Assign index when index is initialized 1027 if fi.Erasure.Index == 0 { 1028 fi.Erasure.Index = index + 1 1029 } 1030 1031 if !fi.IsValid() { 1032 return errFileCorrupt 1033 } 1034 sign, err := disks[index].RenameData(ctx, srcBucket, srcEntry, fi, dstBucket, dstEntry, RenameOptions{}) 1035 if err != nil { 1036 return err 1037 } 1038 diskVersions[index] = sign 1039 return nil 1040 }, index) 1041 } 1042 1043 // Wait for all renames to finish. 1044 errs := g.Wait() 1045 1046 var versionsDisparity bool 1047 1048 err := reduceWriteQuorumErrs(ctx, errs, objectOpIgnoredErrs, writeQuorum) 1049 if err != nil { 1050 dg := errgroup.WithNErrs(len(disks)) 1051 for index, nerr := range errs { 1052 if nerr != nil { 1053 continue 1054 } 1055 index := index 1056 // When we are going to return error, attempt to delete success 1057 // on some of the drives, if we cannot we do not have to notify 1058 // caller this dangling object will be now scheduled to be removed 1059 // via active healing. 1060 dg.Go(func() error { 1061 return disks[index].DeleteVersion(context.Background(), dstBucket, dstEntry, metadata[index], false, DeleteOptions{UndoWrite: true}) 1062 }, index) 1063 } 1064 dg.Wait() 1065 } 1066 if err == nil { 1067 versions := reduceCommonVersions(diskVersions, writeQuorum) 1068 for index, dversions := range diskVersions { 1069 if errs[index] != nil { 1070 continue 1071 } 1072 if versions != dversions { 1073 versionsDisparity = true 1074 break 1075 } 1076 } 1077 } 1078 1079 // We can safely allow RenameData errors up to len(er.getDisks()) - writeQuorum 1080 // otherwise return failure. 1081 return evalDisks(disks, errs), versionsDisparity, err 1082 } 1083 1084 func (er erasureObjects) putMetacacheObject(ctx context.Context, key string, r *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) { 1085 data := r.Reader 1086 1087 // No metadata is set, allocate a new one. 1088 if opts.UserDefined == nil { 1089 opts.UserDefined = make(map[string]string) 1090 } 1091 1092 storageDisks := er.getDisks() 1093 // Get parity and data drive count based on storage class metadata 1094 parityDrives := globalStorageClass.GetParityForSC(opts.UserDefined[xhttp.AmzStorageClass]) 1095 if parityDrives < 0 { 1096 parityDrives = er.defaultParityCount 1097 } 1098 dataDrives := len(storageDisks) - parityDrives 1099 1100 // we now know the number of blocks this object needs for data and parity. 1101 // writeQuorum is dataBlocks + 1 1102 writeQuorum := dataDrives 1103 if dataDrives == parityDrives { 1104 writeQuorum++ 1105 } 1106 1107 // Validate input data size and it can never be less than zero. 1108 if data.Size() < -1 { 1109 logger.LogIf(ctx, errInvalidArgument, logger.ErrorKind) 1110 return ObjectInfo{}, toObjectErr(errInvalidArgument) 1111 } 1112 1113 // Initialize parts metadata 1114 partsMetadata := make([]FileInfo, len(storageDisks)) 1115 1116 fi := newFileInfo(pathJoin(minioMetaBucket, key), dataDrives, parityDrives) 1117 fi.DataDir = mustGetUUID() 1118 1119 // Initialize erasure metadata. 1120 for index := range partsMetadata { 1121 partsMetadata[index] = fi 1122 } 1123 1124 // Order disks according to erasure distribution 1125 var onlineDisks []StorageAPI 1126 onlineDisks, partsMetadata = shuffleDisksAndPartsMetadata(storageDisks, partsMetadata, fi) 1127 1128 erasure, err := NewErasure(ctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize) 1129 if err != nil { 1130 return ObjectInfo{}, toObjectErr(err, minioMetaBucket, key) 1131 } 1132 1133 // Fetch buffer for I/O, returns from the pool if not allocates a new one and returns. 1134 var buffer []byte 1135 switch size := data.Size(); { 1136 case size == 0: 1137 buffer = make([]byte, 1) // Allocate at least a byte to reach EOF 1138 case size >= fi.Erasure.BlockSize: 1139 buffer = globalBytePoolCap.Get() 1140 defer globalBytePoolCap.Put(buffer) 1141 case size < fi.Erasure.BlockSize: 1142 // No need to allocate fully blockSizeV1 buffer if the incoming data is smaller. 1143 buffer = make([]byte, size, 2*size+int64(fi.Erasure.ParityBlocks+fi.Erasure.DataBlocks-1)) 1144 } 1145 1146 if len(buffer) > int(fi.Erasure.BlockSize) { 1147 buffer = buffer[:fi.Erasure.BlockSize] 1148 } 1149 1150 shardFileSize := erasure.ShardFileSize(data.Size()) 1151 writers := make([]io.Writer, len(onlineDisks)) 1152 inlineBuffers := make([]*bytes.Buffer, len(onlineDisks)) 1153 for i, disk := range onlineDisks { 1154 if disk == nil { 1155 continue 1156 } 1157 if disk.IsOnline() { 1158 inlineBuffers[i] = bytes.NewBuffer(make([]byte, 0, shardFileSize)) 1159 writers[i] = newStreamingBitrotWriterBuffer(inlineBuffers[i], DefaultBitrotAlgorithm, erasure.ShardSize()) 1160 } 1161 } 1162 1163 n, erasureErr := erasure.Encode(ctx, data, writers, buffer, writeQuorum) 1164 closeBitrotWriters(writers) 1165 if erasureErr != nil { 1166 return ObjectInfo{}, toObjectErr(erasureErr, minioMetaBucket, key) 1167 } 1168 1169 // Should return IncompleteBody{} error when reader has fewer bytes 1170 // than specified in request header. 1171 if n < data.Size() { 1172 return ObjectInfo{}, IncompleteBody{Bucket: minioMetaBucket, Object: key} 1173 } 1174 var index []byte 1175 if opts.IndexCB != nil { 1176 index = opts.IndexCB() 1177 } 1178 1179 modTime := UTCNow() 1180 1181 for i, w := range writers { 1182 if w == nil { 1183 // Make sure to avoid writing to disks which we couldn't complete in erasure.Encode() 1184 onlineDisks[i] = nil 1185 continue 1186 } 1187 partsMetadata[i].Data = inlineBuffers[i].Bytes() 1188 partsMetadata[i].AddObjectPart(1, "", n, data.ActualSize(), modTime, index, nil) 1189 } 1190 1191 // Fill all the necessary metadata. 1192 // Update `xl.meta` content on each disks. 1193 for index := range partsMetadata { 1194 partsMetadata[index].Size = n 1195 partsMetadata[index].Fresh = true 1196 partsMetadata[index].ModTime = modTime 1197 partsMetadata[index].Metadata = opts.UserDefined 1198 } 1199 1200 // Set an additional header when data is inlined. 1201 for index := range partsMetadata { 1202 partsMetadata[index].SetInlineData() 1203 } 1204 1205 for i := 0; i < len(onlineDisks); i++ { 1206 if onlineDisks[i] != nil && onlineDisks[i].IsOnline() { 1207 // Object info is the same in all disks, so we can pick 1208 // the first meta from online disk 1209 fi = partsMetadata[i] 1210 break 1211 } 1212 } 1213 1214 if _, err = writeUniqueFileInfo(ctx, onlineDisks, "", minioMetaBucket, key, partsMetadata, writeQuorum); err != nil { 1215 return ObjectInfo{}, toObjectErr(err, minioMetaBucket, key) 1216 } 1217 1218 return fi.ToObjectInfo(minioMetaBucket, key, opts.Versioned || opts.VersionSuspended), nil 1219 } 1220 1221 // PutObject - creates an object upon reading from the input stream 1222 // until EOF, erasure codes the data across all disk and additionally 1223 // writes `xl.meta` which carries the necessary metadata for future 1224 // object operations. 1225 func (er erasureObjects) PutObject(ctx context.Context, bucket string, object string, data *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) { 1226 return er.putObject(ctx, bucket, object, data, opts) 1227 } 1228 1229 // Heal up to two versions of one object when there is disparity between disks 1230 func healObjectVersionsDisparity(bucket string, entry metaCacheEntry, scanMode madmin.HealScanMode) error { 1231 if entry.isDir() { 1232 return nil 1233 } 1234 // We might land at .metacache, .trash, .multipart 1235 // no need to heal them skip, only when bucket 1236 // is '.minio.sys' 1237 if bucket == minioMetaBucket { 1238 if wildcard.Match("buckets/*/.metacache/*", entry.name) { 1239 return nil 1240 } 1241 if wildcard.Match("tmp/*", entry.name) { 1242 return nil 1243 } 1244 if wildcard.Match("multipart/*", entry.name) { 1245 return nil 1246 } 1247 if wildcard.Match("tmp-old/*", entry.name) { 1248 return nil 1249 } 1250 } 1251 1252 fivs, err := entry.fileInfoVersions(bucket) 1253 if err != nil { 1254 healObject(bucket, entry.name, "", madmin.HealDeepScan) 1255 return err 1256 } 1257 1258 if len(fivs.Versions) <= 2 { 1259 for _, version := range fivs.Versions { 1260 healObject(bucket, entry.name, version.VersionID, scanMode) 1261 } 1262 } 1263 1264 return nil 1265 } 1266 1267 // putObject wrapper for erasureObjects PutObject 1268 func (er erasureObjects) putObject(ctx context.Context, bucket string, object string, r *PutObjReader, opts ObjectOptions) (objInfo ObjectInfo, err error) { 1269 if !opts.NoAuditLog { 1270 auditObjectErasureSet(ctx, object, &er) 1271 } 1272 1273 data := r.Reader 1274 1275 if opts.CheckPrecondFn != nil { 1276 if !opts.NoLock { 1277 ns := er.NewNSLock(bucket, object) 1278 lkctx, err := ns.GetLock(ctx, globalOperationTimeout) 1279 if err != nil { 1280 return ObjectInfo{}, err 1281 } 1282 ctx = lkctx.Context() 1283 defer ns.Unlock(lkctx) 1284 opts.NoLock = true 1285 } 1286 1287 obj, err := er.getObjectInfo(ctx, bucket, object, opts) 1288 if err == nil && opts.CheckPrecondFn(obj) { 1289 return objInfo, PreConditionFailed{} 1290 } 1291 if err != nil && !isErrVersionNotFound(err) && !isErrObjectNotFound(err) && !isErrReadQuorum(err) { 1292 return objInfo, err 1293 } 1294 } 1295 1296 // Validate input data size and it can never be less than -1. 1297 if data.Size() < -1 { 1298 logger.LogIf(ctx, errInvalidArgument, logger.ErrorKind) 1299 return ObjectInfo{}, toObjectErr(errInvalidArgument) 1300 } 1301 1302 userDefined := cloneMSS(opts.UserDefined) 1303 1304 storageDisks := er.getDisks() 1305 1306 // Get parity and data drive count based on storage class metadata 1307 parityDrives := globalStorageClass.GetParityForSC(userDefined[xhttp.AmzStorageClass]) 1308 if parityDrives < 0 { 1309 parityDrives = er.defaultParityCount 1310 } 1311 if opts.MaxParity { 1312 parityDrives = len(storageDisks) / 2 1313 } 1314 if !opts.MaxParity && globalStorageClass.AvailabilityOptimized() { 1315 // If we have offline disks upgrade the number of erasure codes for this object. 1316 parityOrig := parityDrives 1317 1318 var offlineDrives int 1319 for _, disk := range storageDisks { 1320 if disk == nil || !disk.IsOnline() { 1321 parityDrives++ 1322 offlineDrives++ 1323 continue 1324 } 1325 } 1326 1327 if offlineDrives >= (len(storageDisks)+1)/2 { 1328 // if offline drives are more than 50% of the drives 1329 // we have no quorum, we shouldn't proceed just 1330 // fail at that point. 1331 return ObjectInfo{}, toObjectErr(errErasureWriteQuorum, bucket, object) 1332 } 1333 1334 if parityDrives >= len(storageDisks)/2 { 1335 parityDrives = len(storageDisks) / 2 1336 } 1337 1338 if parityOrig != parityDrives { 1339 userDefined[minIOErasureUpgraded] = strconv.Itoa(parityOrig) + "->" + strconv.Itoa(parityDrives) 1340 } 1341 } 1342 dataDrives := len(storageDisks) - parityDrives 1343 1344 // we now know the number of blocks this object needs for data and parity. 1345 // writeQuorum is dataBlocks + 1 1346 writeQuorum := dataDrives 1347 if dataDrives == parityDrives { 1348 writeQuorum++ 1349 } 1350 1351 // Initialize parts metadata 1352 partsMetadata := make([]FileInfo, len(storageDisks)) 1353 1354 fi := newFileInfo(pathJoin(bucket, object), dataDrives, parityDrives) 1355 fi.VersionID = opts.VersionID 1356 if opts.Versioned && fi.VersionID == "" { 1357 fi.VersionID = mustGetUUID() 1358 } 1359 1360 fi.DataDir = mustGetUUID() 1361 fi.Checksum = opts.WantChecksum.AppendTo(nil, nil) 1362 if opts.EncryptFn != nil { 1363 fi.Checksum = opts.EncryptFn("object-checksum", fi.Checksum) 1364 } 1365 uniqueID := mustGetUUID() 1366 tempObj := uniqueID 1367 1368 // Initialize erasure metadata. 1369 for index := range partsMetadata { 1370 partsMetadata[index] = fi 1371 } 1372 1373 // Order disks according to erasure distribution 1374 var onlineDisks []StorageAPI 1375 onlineDisks, partsMetadata = shuffleDisksAndPartsMetadata(storageDisks, partsMetadata, fi) 1376 1377 erasure, err := NewErasure(ctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize) 1378 if err != nil { 1379 return ObjectInfo{}, toObjectErr(err, bucket, object) 1380 } 1381 1382 // Fetch buffer for I/O, returns from the pool if not allocates a new one and returns. 1383 var buffer []byte 1384 switch size := data.Size(); { 1385 case size == 0: 1386 buffer = make([]byte, 1) // Allocate at least a byte to reach EOF 1387 case size >= fi.Erasure.BlockSize || size == -1: 1388 buffer = globalBytePoolCap.Get() 1389 defer globalBytePoolCap.Put(buffer) 1390 case size < fi.Erasure.BlockSize: 1391 // No need to allocate fully blockSizeV1 buffer if the incoming data is smaller. 1392 buffer = make([]byte, size, 2*size+int64(fi.Erasure.ParityBlocks+fi.Erasure.DataBlocks-1)) 1393 } 1394 1395 if len(buffer) > int(fi.Erasure.BlockSize) { 1396 buffer = buffer[:fi.Erasure.BlockSize] 1397 } 1398 1399 partName := "part.1" 1400 tempErasureObj := pathJoin(uniqueID, fi.DataDir, partName) 1401 1402 defer er.deleteAll(context.Background(), minioMetaTmpBucket, tempObj) 1403 1404 shardFileSize := erasure.ShardFileSize(data.Size()) 1405 inlineBlock := globalStorageClass.InlineBlock() 1406 if inlineBlock <= 0 { 1407 inlineBlock = 128 * humanize.KiByte 1408 } 1409 1410 writers := make([]io.Writer, len(onlineDisks)) 1411 var inlineBuffers []*bytes.Buffer 1412 if shardFileSize >= 0 { 1413 if !opts.Versioned && shardFileSize < inlineBlock { 1414 inlineBuffers = make([]*bytes.Buffer, len(onlineDisks)) 1415 } else if shardFileSize < inlineBlock/8 { 1416 inlineBuffers = make([]*bytes.Buffer, len(onlineDisks)) 1417 } 1418 } else { 1419 // If compressed, use actual size to determine. 1420 if sz := erasure.ShardFileSize(data.ActualSize()); sz > 0 { 1421 if !opts.Versioned && sz < inlineBlock { 1422 inlineBuffers = make([]*bytes.Buffer, len(onlineDisks)) 1423 } else if sz < inlineBlock/8 { 1424 inlineBuffers = make([]*bytes.Buffer, len(onlineDisks)) 1425 } 1426 } 1427 } 1428 for i, disk := range onlineDisks { 1429 if disk == nil { 1430 continue 1431 } 1432 1433 if !disk.IsOnline() { 1434 continue 1435 } 1436 1437 if len(inlineBuffers) > 0 { 1438 sz := shardFileSize 1439 if sz < 0 { 1440 sz = data.ActualSize() 1441 } 1442 inlineBuffers[i] = bytes.NewBuffer(make([]byte, 0, sz)) 1443 writers[i] = newStreamingBitrotWriterBuffer(inlineBuffers[i], DefaultBitrotAlgorithm, erasure.ShardSize()) 1444 continue 1445 } 1446 1447 writers[i] = newBitrotWriter(disk, bucket, minioMetaTmpBucket, tempErasureObj, shardFileSize, DefaultBitrotAlgorithm, erasure.ShardSize()) 1448 } 1449 1450 toEncode := io.Reader(data) 1451 if data.Size() > bigFileThreshold { 1452 // We use 2 buffers, so we always have a full buffer of input. 1453 bufA := globalBytePoolCap.Get() 1454 bufB := globalBytePoolCap.Get() 1455 defer globalBytePoolCap.Put(bufA) 1456 defer globalBytePoolCap.Put(bufB) 1457 ra, err := readahead.NewReaderBuffer(data, [][]byte{bufA[:fi.Erasure.BlockSize], bufB[:fi.Erasure.BlockSize]}) 1458 if err == nil { 1459 toEncode = ra 1460 defer ra.Close() 1461 } 1462 logger.LogIf(ctx, err) 1463 } 1464 n, erasureErr := erasure.Encode(ctx, toEncode, writers, buffer, writeQuorum) 1465 closeBitrotWriters(writers) 1466 if erasureErr != nil { 1467 return ObjectInfo{}, toObjectErr(erasureErr, bucket, object) 1468 } 1469 1470 // Should return IncompleteBody{} error when reader has fewer bytes 1471 // than specified in request header. 1472 if n < data.Size() { 1473 return ObjectInfo{}, IncompleteBody{Bucket: bucket, Object: object} 1474 } 1475 1476 var compIndex []byte 1477 if opts.IndexCB != nil { 1478 compIndex = opts.IndexCB() 1479 } 1480 if !opts.NoLock { 1481 lk := er.NewNSLock(bucket, object) 1482 lkctx, err := lk.GetLock(ctx, globalOperationTimeout) 1483 if err != nil { 1484 return ObjectInfo{}, err 1485 } 1486 ctx = lkctx.Context() 1487 defer lk.Unlock(lkctx) 1488 } 1489 1490 modTime := opts.MTime 1491 if opts.MTime.IsZero() { 1492 modTime = UTCNow() 1493 } 1494 1495 for i, w := range writers { 1496 if w == nil { 1497 onlineDisks[i] = nil 1498 continue 1499 } 1500 if len(inlineBuffers) > 0 && inlineBuffers[i] != nil { 1501 partsMetadata[i].Data = inlineBuffers[i].Bytes() 1502 } else { 1503 partsMetadata[i].Data = nil 1504 } 1505 // No need to add checksum to part. We already have it on the object. 1506 partsMetadata[i].AddObjectPart(1, "", n, data.ActualSize(), modTime, compIndex, nil) 1507 partsMetadata[i].Versioned = opts.Versioned || opts.VersionSuspended 1508 } 1509 1510 userDefined["etag"] = r.MD5CurrentHexString() 1511 kind, _ := crypto.IsEncrypted(userDefined) 1512 if opts.PreserveETag != "" { 1513 if !opts.ReplicationRequest { 1514 userDefined["etag"] = opts.PreserveETag 1515 } else if kind != crypto.S3 { 1516 // if we have a replication request 1517 // and SSE-S3 is specified do not preserve 1518 // the incoming etag. 1519 userDefined["etag"] = opts.PreserveETag 1520 } 1521 } 1522 1523 // Guess content-type from the extension if possible. 1524 if userDefined["content-type"] == "" { 1525 userDefined["content-type"] = mimedb.TypeByExtension(path.Ext(object)) 1526 } 1527 1528 // if storageClass is standard no need to save it as part of metadata. 1529 if userDefined[xhttp.AmzStorageClass] == storageclass.STANDARD { 1530 delete(userDefined, xhttp.AmzStorageClass) 1531 } 1532 1533 // Fill all the necessary metadata. 1534 // Update `xl.meta` content on each disks. 1535 for index := range partsMetadata { 1536 partsMetadata[index].Metadata = userDefined 1537 partsMetadata[index].Size = n 1538 partsMetadata[index].ModTime = modTime 1539 if len(inlineBuffers) > 0 { 1540 partsMetadata[index].SetInlineData() 1541 } 1542 if opts.DataMovement { 1543 partsMetadata[index].SetDataMov() 1544 } 1545 } 1546 1547 // Rename the successfully written temporary object to final location. 1548 onlineDisks, versionsDisparity, err := renameData(ctx, onlineDisks, minioMetaTmpBucket, tempObj, partsMetadata, bucket, object, writeQuorum) 1549 if err != nil { 1550 if errors.Is(err, errFileNotFound) { 1551 return ObjectInfo{}, toObjectErr(errErasureWriteQuorum, bucket, object) 1552 } 1553 return ObjectInfo{}, toObjectErr(err, bucket, object) 1554 } 1555 1556 for i := 0; i < len(onlineDisks); i++ { 1557 if onlineDisks[i] != nil && onlineDisks[i].IsOnline() { 1558 // Object info is the same in all disks, so we can pick 1559 // the first meta from online disk 1560 fi = partsMetadata[i] 1561 break 1562 } 1563 } 1564 1565 // For speedtest objects do not attempt to heal them. 1566 if !opts.Speedtest { 1567 // When there is versions disparity we are healing 1568 // the content implicitly for all versions, we can 1569 // avoid triggering another MRF heal for offline drives. 1570 if !versionsDisparity { 1571 // Whether a disk was initially or becomes offline 1572 // during this upload, send it to the MRF list. 1573 for i := 0; i < len(onlineDisks); i++ { 1574 if onlineDisks[i] != nil && onlineDisks[i].IsOnline() { 1575 continue 1576 } 1577 1578 er.addPartial(bucket, object, fi.VersionID) 1579 break 1580 } 1581 } else { 1582 globalMRFState.addPartialOp(partialOperation{ 1583 bucket: bucket, 1584 object: object, 1585 queued: time.Now(), 1586 allVersions: true, 1587 setIndex: er.setIndex, 1588 poolIndex: er.poolIndex, 1589 }) 1590 } 1591 } 1592 1593 fi.ReplicationState = opts.PutReplicationState() 1594 1595 // we are adding a new version to this object under the namespace lock, so this is the latest version. 1596 fi.IsLatest = true 1597 1598 return fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended), nil 1599 } 1600 1601 func (er erasureObjects) deleteObjectVersion(ctx context.Context, bucket, object string, fi FileInfo, forceDelMarker bool) error { 1602 disks := er.getDisks() 1603 // Assume (N/2 + 1) quorum for Delete() 1604 // this is a theoretical assumption such that 1605 // for delete's we do not need to honor storage 1606 // class for objects that have reduced quorum 1607 // due to storage class - this only needs to be honored 1608 // for Read() requests alone that we already do. 1609 writeQuorum := len(disks)/2 + 1 1610 1611 g := errgroup.WithNErrs(len(disks)) 1612 for index := range disks { 1613 index := index 1614 g.Go(func() error { 1615 if disks[index] == nil { 1616 return errDiskNotFound 1617 } 1618 return disks[index].DeleteVersion(ctx, bucket, object, fi, forceDelMarker, DeleteOptions{}) 1619 }, index) 1620 } 1621 // return errors if any during deletion 1622 return reduceWriteQuorumErrs(ctx, g.Wait(), objectOpIgnoredErrs, writeQuorum) 1623 } 1624 1625 // DeleteObjects deletes objects/versions in bulk, this function will still automatically split objects list 1626 // into smaller bulks if some object names are found to be duplicated in the delete list, splitting 1627 // into smaller bulks will avoid holding twice the write lock of the duplicated object names. 1628 func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objects []ObjectToDelete, opts ObjectOptions) ([]DeletedObject, []error) { 1629 if !opts.NoAuditLog { 1630 for _, obj := range objects { 1631 auditObjectErasureSet(ctx, obj.ObjectV.ObjectName, &er) 1632 } 1633 } 1634 1635 errs := make([]error, len(objects)) 1636 dobjects := make([]DeletedObject, len(objects)) 1637 writeQuorums := make([]int, len(objects)) 1638 1639 storageDisks := er.getDisks() 1640 1641 for i := range objects { 1642 // Assume (N/2 + 1) quorums for all objects 1643 // this is a theoretical assumption such that 1644 // for delete's we do not need to honor storage 1645 // class for objects which have reduced quorum 1646 // storage class only needs to be honored for 1647 // Read() requests alone which we already do. 1648 writeQuorums[i] = len(storageDisks)/2 + 1 1649 } 1650 1651 versionsMap := make(map[string]FileInfoVersions, len(objects)) 1652 for i := range objects { 1653 // Construct the FileInfo data that needs to be preserved on the disk. 1654 vr := FileInfo{ 1655 Name: objects[i].ObjectName, 1656 VersionID: objects[i].VersionID, 1657 ReplicationState: objects[i].ReplicationState(), 1658 // save the index to set correct error at this index. 1659 Idx: i, 1660 } 1661 vr.SetTierFreeVersionID(mustGetUUID()) 1662 // VersionID is not set means delete is not specific about 1663 // any version, look for if the bucket is versioned or not. 1664 if objects[i].VersionID == "" { 1665 // MinIO extension to bucket version configuration 1666 suspended := opts.VersionSuspended 1667 versioned := opts.Versioned 1668 if opts.PrefixEnabledFn != nil { 1669 versioned = opts.PrefixEnabledFn(objects[i].ObjectName) 1670 } 1671 if versioned || suspended { 1672 // Bucket is versioned and no version was explicitly 1673 // mentioned for deletes, create a delete marker instead. 1674 vr.ModTime = UTCNow() 1675 vr.Deleted = true 1676 // Versioning suspended means that we add a `null` version 1677 // delete marker, if not add a new version for this delete 1678 // marker. 1679 if versioned { 1680 vr.VersionID = mustGetUUID() 1681 } 1682 } 1683 } 1684 // De-dup same object name to collect multiple versions for same object. 1685 v, ok := versionsMap[objects[i].ObjectName] 1686 if ok { 1687 v.Versions = append(v.Versions, vr) 1688 } else { 1689 v = FileInfoVersions{ 1690 Name: vr.Name, 1691 Versions: []FileInfo{vr}, 1692 } 1693 } 1694 if vr.Deleted { 1695 dobjects[i] = DeletedObject{ 1696 DeleteMarker: vr.Deleted, 1697 DeleteMarkerVersionID: vr.VersionID, 1698 DeleteMarkerMTime: DeleteMarkerMTime{vr.ModTime}, 1699 ObjectName: vr.Name, 1700 ReplicationState: vr.ReplicationState, 1701 } 1702 } else { 1703 dobjects[i] = DeletedObject{ 1704 ObjectName: vr.Name, 1705 VersionID: vr.VersionID, 1706 ReplicationState: vr.ReplicationState, 1707 } 1708 } 1709 versionsMap[objects[i].ObjectName] = v 1710 } 1711 1712 dedupVersions := make([]FileInfoVersions, 0, len(versionsMap)) 1713 for _, version := range versionsMap { 1714 dedupVersions = append(dedupVersions, version) 1715 } 1716 1717 // Initialize list of errors. 1718 delObjErrs := make([][]error, len(storageDisks)) 1719 1720 var wg sync.WaitGroup 1721 // Remove versions in bulk for each disk 1722 for index, disk := range storageDisks { 1723 wg.Add(1) 1724 go func(index int, disk StorageAPI) { 1725 defer wg.Done() 1726 delObjErrs[index] = make([]error, len(objects)) 1727 if disk == nil { 1728 for i := range objects { 1729 delObjErrs[index][i] = errDiskNotFound 1730 } 1731 return 1732 } 1733 errs := disk.DeleteVersions(ctx, bucket, dedupVersions, DeleteOptions{}) 1734 for i, err := range errs { 1735 if err == nil { 1736 continue 1737 } 1738 for _, v := range dedupVersions[i].Versions { 1739 if err == errFileNotFound || err == errFileVersionNotFound { 1740 if !dobjects[v.Idx].DeleteMarker { 1741 // Not delete marker, if not found, ok. 1742 continue 1743 } 1744 } 1745 delObjErrs[index][v.Idx] = err 1746 } 1747 } 1748 }(index, disk) 1749 } 1750 wg.Wait() 1751 1752 // Reduce errors for each object 1753 for objIndex := range objects { 1754 diskErrs := make([]error, len(storageDisks)) 1755 // Iterate over disks to fetch the error 1756 // of deleting of the current object 1757 for i := range delObjErrs { 1758 // delObjErrs[i] is not nil when disks[i] is also not nil 1759 if delObjErrs[i] != nil { 1760 diskErrs[i] = delObjErrs[i][objIndex] 1761 } 1762 } 1763 err := reduceWriteQuorumErrs(ctx, diskErrs, objectOpIgnoredErrs, writeQuorums[objIndex]) 1764 if objects[objIndex].VersionID != "" { 1765 errs[objIndex] = toObjectErr(err, bucket, objects[objIndex].ObjectName, objects[objIndex].VersionID) 1766 } else { 1767 errs[objIndex] = toObjectErr(err, bucket, objects[objIndex].ObjectName) 1768 } 1769 } 1770 1771 // Check failed deletes across multiple objects 1772 for i, dobj := range dobjects { 1773 // This object errored, we should attempt a heal just in case. 1774 if errs[i] != nil && !isErrVersionNotFound(errs[i]) && !isErrObjectNotFound(errs[i]) { 1775 // all other direct versionId references we should 1776 // ensure no dangling file is left over. 1777 er.addPartial(bucket, dobj.ObjectName, dobj.VersionID) 1778 continue 1779 } 1780 1781 // Check if there is any offline disk and add it to the MRF list 1782 for _, disk := range storageDisks { 1783 if disk != nil && disk.IsOnline() { 1784 // Skip attempted heal on online disks. 1785 continue 1786 } 1787 1788 // all other direct versionId references we should 1789 // ensure no dangling file is left over. 1790 er.addPartial(bucket, dobj.ObjectName, dobj.VersionID) 1791 break 1792 } 1793 } 1794 1795 return dobjects, errs 1796 } 1797 1798 func (er erasureObjects) deletePrefix(ctx context.Context, bucket, prefix string) error { 1799 disks := er.getDisks() 1800 g := errgroup.WithNErrs(len(disks)) 1801 for index := range disks { 1802 index := index 1803 g.Go(func() error { 1804 if disks[index] == nil { 1805 return nil 1806 } 1807 return disks[index].Delete(ctx, bucket, prefix, DeleteOptions{ 1808 Recursive: true, 1809 Immediate: true, 1810 }) 1811 }, index) 1812 } 1813 for _, err := range g.Wait() { 1814 if err != nil { 1815 return err 1816 } 1817 } 1818 return nil 1819 } 1820 1821 // DeleteObject - deletes an object, this call doesn't necessary reply 1822 // any error as it is not necessary for the handler to reply back a 1823 // response to the client request. 1824 func (er erasureObjects) DeleteObject(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) { 1825 if !opts.NoAuditLog { 1826 auditObjectErasureSet(ctx, object, &er) 1827 } 1828 1829 var lc *lifecycle.Lifecycle 1830 var rcfg lock.Retention 1831 var replcfg *replication.Config 1832 if opts.Expiration.Expire { 1833 // Check if the current bucket has a configured lifecycle policy 1834 lc, _ = globalLifecycleSys.Get(bucket) 1835 rcfg, _ = globalBucketObjectLockSys.Get(bucket) 1836 replcfg, _ = getReplicationConfig(ctx, bucket) 1837 } 1838 1839 // expiration attempted on a bucket with no lifecycle 1840 // rules shall be rejected. 1841 if lc == nil && opts.Expiration.Expire { 1842 if opts.VersionID != "" { 1843 return objInfo, VersionNotFound{ 1844 Bucket: bucket, 1845 Object: object, 1846 VersionID: opts.VersionID, 1847 } 1848 } 1849 return objInfo, ObjectNotFound{ 1850 Bucket: bucket, 1851 Object: object, 1852 } 1853 } 1854 1855 if opts.DeletePrefix { 1856 if opts.Expiration.Expire { 1857 // Expire all versions expiration must still verify the state() on disk 1858 // via a getObjectInfo() call as follows, any read quorum issues we 1859 // must not proceed further for safety reasons. attempt a MRF heal 1860 // while we see such quorum errors. 1861 goi, _, gerr := er.getObjectInfoAndQuorum(ctx, bucket, object, opts) 1862 if gerr != nil && goi.Name == "" { 1863 if _, ok := gerr.(InsufficientReadQuorum); ok { 1864 // Add an MRF heal for next time. 1865 er.addPartial(bucket, object, opts.VersionID) 1866 1867 return objInfo, InsufficientWriteQuorum{} 1868 } 1869 return objInfo, gerr 1870 } 1871 1872 // Add protection and re-verify the ILM rules for qualification 1873 // based on the latest objectInfo and see if the object still 1874 // qualifies for deletion. 1875 if gerr == nil { 1876 evt := evalActionFromLifecycle(ctx, *lc, rcfg, replcfg, goi) 1877 var isErr bool 1878 switch evt.Action { 1879 case lifecycle.NoneAction: 1880 isErr = true 1881 case lifecycle.TransitionAction, lifecycle.TransitionVersionAction: 1882 isErr = true 1883 } 1884 if isErr { 1885 if goi.VersionID != "" { 1886 return goi, VersionNotFound{ 1887 Bucket: bucket, 1888 Object: object, 1889 VersionID: goi.VersionID, 1890 } 1891 } 1892 return goi, ObjectNotFound{ 1893 Bucket: bucket, 1894 Object: object, 1895 } 1896 } 1897 } 1898 } // Delete marker and any latest that qualifies shall be expired permanently. 1899 1900 return ObjectInfo{}, toObjectErr(er.deletePrefix(ctx, bucket, object), bucket, object) 1901 } 1902 1903 storageDisks := er.getDisks() 1904 versionFound := true 1905 objInfo = ObjectInfo{VersionID: opts.VersionID} // version id needed in Delete API response. 1906 goi, _, gerr := er.getObjectInfoAndQuorum(ctx, bucket, object, opts) 1907 if gerr != nil && goi.Name == "" { 1908 if _, ok := gerr.(InsufficientReadQuorum); ok { 1909 // Add an MRF heal for next time. 1910 er.addPartial(bucket, object, opts.VersionID) 1911 1912 return objInfo, InsufficientWriteQuorum{} 1913 } 1914 // For delete marker replication, versionID being replicated will not exist on disk 1915 if opts.DeleteMarker { 1916 versionFound = false 1917 } else { 1918 return objInfo, gerr 1919 } 1920 } 1921 1922 if opts.EvalMetadataFn != nil { 1923 dsc, err := opts.EvalMetadataFn(&goi, err) 1924 if err != nil { 1925 return ObjectInfo{}, err 1926 } 1927 if dsc.ReplicateAny() { 1928 opts.SetDeleteReplicationState(dsc, opts.VersionID) 1929 goi.replicationDecision = opts.DeleteReplication.ReplicateDecisionStr 1930 } 1931 } 1932 1933 if opts.EvalRetentionBypassFn != nil { 1934 if err := opts.EvalRetentionBypassFn(goi, gerr); err != nil { 1935 return ObjectInfo{}, err 1936 } 1937 } 1938 1939 if opts.Expiration.Expire { 1940 if gerr == nil { 1941 evt := evalActionFromLifecycle(ctx, *lc, rcfg, replcfg, goi) 1942 var isErr bool 1943 switch evt.Action { 1944 case lifecycle.NoneAction: 1945 isErr = true 1946 case lifecycle.TransitionAction, lifecycle.TransitionVersionAction: 1947 isErr = true 1948 } 1949 if isErr { 1950 if goi.VersionID != "" { 1951 return goi, VersionNotFound{ 1952 Bucket: bucket, 1953 Object: object, 1954 VersionID: goi.VersionID, 1955 } 1956 } 1957 return goi, ObjectNotFound{ 1958 Bucket: bucket, 1959 Object: object, 1960 } 1961 } 1962 } 1963 } 1964 1965 // Determine whether to mark object deleted for replication 1966 markDelete := goi.VersionID != "" 1967 1968 // Default deleteMarker to true if object is under versioning 1969 deleteMarker := opts.Versioned 1970 1971 if opts.VersionID != "" { 1972 // case where replica version needs to be deleted on target cluster 1973 if versionFound && opts.DeleteMarkerReplicationStatus() == replication.Replica { 1974 markDelete = false 1975 } 1976 if opts.VersionPurgeStatus().Empty() && opts.DeleteMarkerReplicationStatus().Empty() { 1977 markDelete = false 1978 } 1979 if opts.VersionPurgeStatus() == Complete { 1980 markDelete = false 1981 } 1982 // now, since VersionPurgeStatus() is already set, we can let the 1983 // lower layers decide this. This fixes a regression that was introduced 1984 // in PR #14555 where !VersionPurgeStatus.Empty() is automatically 1985 // considered as Delete marker true to avoid listing such objects by 1986 // regular ListObjects() calls. However for delete replication this 1987 // ends up being a problem because "upon" a successful delete this 1988 // ends up creating a new delete marker that is spurious and unnecessary. 1989 // 1990 // Regression introduced by #14555 was reintroduced in #15564 1991 if versionFound { 1992 if !goi.VersionPurgeStatus.Empty() { 1993 deleteMarker = false 1994 } else if !goi.DeleteMarker { // implies a versioned delete of object 1995 deleteMarker = false 1996 } 1997 } 1998 } 1999 2000 modTime := opts.MTime 2001 if opts.MTime.IsZero() { 2002 modTime = UTCNow() 2003 } 2004 fvID := mustGetUUID() 2005 2006 defer func() { 2007 // attempt a heal before returning if there are offline disks 2008 // for both del marker and permanent delete situations. 2009 for _, disk := range storageDisks { 2010 if disk != nil && disk.IsOnline() { 2011 continue 2012 } 2013 er.addPartial(bucket, object, opts.VersionID) 2014 break 2015 } 2016 }() 2017 2018 if markDelete && (opts.Versioned || opts.VersionSuspended) { 2019 if !deleteMarker { 2020 // versioning suspended means we add `null` version as 2021 // delete marker, if its not decided already. 2022 deleteMarker = opts.VersionSuspended && opts.VersionID == "" 2023 } 2024 fi := FileInfo{ 2025 Name: object, 2026 Deleted: deleteMarker, 2027 MarkDeleted: markDelete, 2028 ModTime: modTime, 2029 ReplicationState: opts.DeleteReplication, 2030 TransitionStatus: opts.Transition.Status, 2031 ExpireRestored: opts.Transition.ExpireRestored, 2032 } 2033 fi.SetTierFreeVersionID(fvID) 2034 if opts.SkipFreeVersion { 2035 fi.SetSkipTierFreeVersion() 2036 } 2037 if opts.VersionID != "" { 2038 fi.VersionID = opts.VersionID 2039 } else if opts.Versioned { 2040 fi.VersionID = mustGetUUID() 2041 } 2042 // versioning suspended means we add `null` version as 2043 // delete marker. Add delete marker, since we don't have 2044 // any version specified explicitly. Or if a particular 2045 // version id needs to be replicated. 2046 if err = er.deleteObjectVersion(ctx, bucket, object, fi, opts.DeleteMarker); err != nil { 2047 return objInfo, toObjectErr(err, bucket, object) 2048 } 2049 oi := fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended) 2050 oi.replicationDecision = goi.replicationDecision 2051 return oi, nil 2052 } 2053 2054 // Delete the object version on all disks. 2055 dfi := FileInfo{ 2056 Name: object, 2057 VersionID: opts.VersionID, 2058 MarkDeleted: markDelete, 2059 Deleted: deleteMarker, 2060 ModTime: modTime, 2061 ReplicationState: opts.DeleteReplication, 2062 TransitionStatus: opts.Transition.Status, 2063 ExpireRestored: opts.Transition.ExpireRestored, 2064 } 2065 dfi.SetTierFreeVersionID(fvID) 2066 if opts.SkipFreeVersion { 2067 dfi.SetSkipTierFreeVersion() 2068 } 2069 if err = er.deleteObjectVersion(ctx, bucket, object, dfi, opts.DeleteMarker); err != nil { 2070 return objInfo, toObjectErr(err, bucket, object) 2071 } 2072 2073 return dfi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended), nil 2074 } 2075 2076 // Send the successful but partial upload/delete, however ignore 2077 // if the channel is blocked by other items. 2078 func (er erasureObjects) addPartial(bucket, object, versionID string) { 2079 globalMRFState.addPartialOp(partialOperation{ 2080 bucket: bucket, 2081 object: object, 2082 versionID: versionID, 2083 queued: time.Now(), 2084 }) 2085 } 2086 2087 func (er erasureObjects) PutObjectMetadata(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) { 2088 if !opts.NoLock { 2089 // Lock the object before updating metadata. 2090 lk := er.NewNSLock(bucket, object) 2091 lkctx, err := lk.GetLock(ctx, globalOperationTimeout) 2092 if err != nil { 2093 return ObjectInfo{}, err 2094 } 2095 ctx = lkctx.Context() 2096 defer lk.Unlock(lkctx) 2097 } 2098 2099 disks := er.getDisks() 2100 2101 var metaArr []FileInfo 2102 var errs []error 2103 2104 // Read metadata associated with the object from all disks. 2105 if opts.VersionID != "" { 2106 metaArr, errs = readAllFileInfo(ctx, disks, "", bucket, object, opts.VersionID, false, false) 2107 } else { 2108 metaArr, errs = readAllXL(ctx, disks, bucket, object, false, false, true) 2109 } 2110 2111 readQuorum, _, err := objectQuorumFromMeta(ctx, metaArr, errs, er.defaultParityCount) 2112 if err != nil { 2113 if errors.Is(err, errErasureReadQuorum) && !strings.HasPrefix(bucket, minioMetaBucket) { 2114 _, derr := er.deleteIfDangling(context.Background(), bucket, object, metaArr, errs, nil, opts) 2115 if derr != nil { 2116 err = derr 2117 } 2118 } 2119 return ObjectInfo{}, toObjectErr(err, bucket, object) 2120 } 2121 2122 // List all online disks. 2123 onlineDisks, modTime, etag := listOnlineDisks(disks, metaArr, errs, readQuorum) 2124 2125 // Pick latest valid metadata. 2126 fi, err := pickValidFileInfo(ctx, metaArr, modTime, etag, readQuorum) 2127 if err != nil { 2128 return ObjectInfo{}, toObjectErr(err, bucket, object) 2129 } 2130 2131 if fi.Deleted { 2132 return ObjectInfo{}, toObjectErr(errMethodNotAllowed, bucket, object) 2133 } 2134 2135 filterOnlineDisksInplace(fi, metaArr, onlineDisks) 2136 2137 // if version-id is not specified retention is supposed to be set on the latest object. 2138 if opts.VersionID == "" { 2139 opts.VersionID = fi.VersionID 2140 } 2141 2142 objInfo := fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended) 2143 if opts.EvalMetadataFn != nil { 2144 if _, err := opts.EvalMetadataFn(&objInfo, err); err != nil { 2145 return ObjectInfo{}, err 2146 } 2147 } 2148 for k, v := range objInfo.UserDefined { 2149 fi.Metadata[k] = v 2150 } 2151 fi.ModTime = opts.MTime 2152 fi.VersionID = opts.VersionID 2153 2154 if err = er.updateObjectMeta(ctx, bucket, object, fi, onlineDisks); err != nil { 2155 return ObjectInfo{}, toObjectErr(err, bucket, object) 2156 } 2157 2158 return fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended), nil 2159 } 2160 2161 // PutObjectTags - replace or add tags to an existing object 2162 func (er erasureObjects) PutObjectTags(ctx context.Context, bucket, object string, tags string, opts ObjectOptions) (ObjectInfo, error) { 2163 // Lock the object before updating tags. 2164 lk := er.NewNSLock(bucket, object) 2165 lkctx, err := lk.GetLock(ctx, globalOperationTimeout) 2166 if err != nil { 2167 return ObjectInfo{}, err 2168 } 2169 ctx = lkctx.Context() 2170 defer lk.Unlock(lkctx) 2171 2172 disks := er.getDisks() 2173 2174 var metaArr []FileInfo 2175 var errs []error 2176 2177 // Read metadata associated with the object from all disks. 2178 if opts.VersionID != "" { 2179 metaArr, errs = readAllFileInfo(ctx, disks, "", bucket, object, opts.VersionID, false, false) 2180 } else { 2181 metaArr, errs = readAllXL(ctx, disks, bucket, object, false, false, true) 2182 } 2183 2184 readQuorum, _, err := objectQuorumFromMeta(ctx, metaArr, errs, er.defaultParityCount) 2185 if err != nil { 2186 if errors.Is(err, errErasureReadQuorum) && !strings.HasPrefix(bucket, minioMetaBucket) { 2187 _, derr := er.deleteIfDangling(context.Background(), bucket, object, metaArr, errs, nil, opts) 2188 if derr != nil { 2189 err = derr 2190 } 2191 } 2192 return ObjectInfo{}, toObjectErr(err, bucket, object) 2193 } 2194 2195 // List all online disks. 2196 onlineDisks, modTime, etag := listOnlineDisks(disks, metaArr, errs, readQuorum) 2197 2198 // Pick latest valid metadata. 2199 fi, err := pickValidFileInfo(ctx, metaArr, modTime, etag, readQuorum) 2200 if err != nil { 2201 return ObjectInfo{}, toObjectErr(err, bucket, object) 2202 } 2203 if fi.Deleted { 2204 if opts.VersionID == "" { 2205 return ObjectInfo{}, toObjectErr(errFileNotFound, bucket, object) 2206 } 2207 return ObjectInfo{}, toObjectErr(errMethodNotAllowed, bucket, object) 2208 } 2209 2210 filterOnlineDisksInplace(fi, metaArr, onlineDisks) 2211 2212 fi.Metadata[xhttp.AmzObjectTagging] = tags 2213 fi.ReplicationState = opts.PutReplicationState() 2214 for k, v := range opts.UserDefined { 2215 fi.Metadata[k] = v 2216 } 2217 2218 if err = er.updateObjectMeta(ctx, bucket, object, fi, onlineDisks); err != nil { 2219 return ObjectInfo{}, toObjectErr(err, bucket, object) 2220 } 2221 2222 return fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended), nil 2223 } 2224 2225 func (er erasureObjects) updateObjectMetaWithOpts(ctx context.Context, bucket, object string, fi FileInfo, onlineDisks []StorageAPI, opts UpdateMetadataOpts) error { 2226 if len(fi.Metadata) == 0 { 2227 return nil 2228 } 2229 2230 g := errgroup.WithNErrs(len(onlineDisks)) 2231 2232 // Start writing `xl.meta` to all disks in parallel. 2233 for index := range onlineDisks { 2234 index := index 2235 g.Go(func() error { 2236 if onlineDisks[index] == nil { 2237 return errDiskNotFound 2238 } 2239 return onlineDisks[index].UpdateMetadata(ctx, bucket, object, fi, opts) 2240 }, index) 2241 } 2242 2243 // Wait for all the routines. 2244 mErrs := g.Wait() 2245 2246 return reduceWriteQuorumErrs(ctx, mErrs, objectOpIgnoredErrs, fi.WriteQuorum(er.defaultWQuorum())) 2247 } 2248 2249 // updateObjectMeta will update the metadata of a file. 2250 func (er erasureObjects) updateObjectMeta(ctx context.Context, bucket, object string, fi FileInfo, onlineDisks []StorageAPI) error { 2251 return er.updateObjectMetaWithOpts(ctx, bucket, object, fi, onlineDisks, UpdateMetadataOpts{}) 2252 } 2253 2254 // DeleteObjectTags - delete object tags from an existing object 2255 func (er erasureObjects) DeleteObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) { 2256 return er.PutObjectTags(ctx, bucket, object, "", opts) 2257 } 2258 2259 // GetObjectTags - get object tags from an existing object 2260 func (er erasureObjects) GetObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (*tags.Tags, error) { 2261 // GetObjectInfo will return tag value as well 2262 oi, err := er.GetObjectInfo(ctx, bucket, object, opts) 2263 if err != nil { 2264 return nil, err 2265 } 2266 2267 return tags.ParseObjectTags(oi.UserTags) 2268 } 2269 2270 // TransitionObject - transition object content to target tier. 2271 func (er erasureObjects) TransitionObject(ctx context.Context, bucket, object string, opts ObjectOptions) error { 2272 tgtClient, err := globalTierConfigMgr.getDriver(opts.Transition.Tier) 2273 if err != nil { 2274 return err 2275 } 2276 2277 // Acquire write lock before starting to transition the object. 2278 lk := er.NewNSLock(bucket, object) 2279 lkctx, err := lk.GetLock(ctx, globalDeleteOperationTimeout) 2280 if err != nil { 2281 return err 2282 } 2283 ctx = lkctx.Context() 2284 defer lk.Unlock(lkctx) 2285 2286 fi, metaArr, onlineDisks, err := er.getObjectFileInfo(ctx, bucket, object, opts, true) 2287 if err != nil { 2288 return toObjectErr(err, bucket, object) 2289 } 2290 if fi.Deleted { 2291 if opts.VersionID == "" { 2292 return toObjectErr(errFileNotFound, bucket, object) 2293 } 2294 // Make sure to return object info to provide extra information. 2295 return toObjectErr(errMethodNotAllowed, bucket, object) 2296 } 2297 // verify that the object queued for transition is identical to that on disk. 2298 if !opts.MTime.Equal(fi.ModTime) || !strings.EqualFold(opts.Transition.ETag, extractETag(fi.Metadata)) { 2299 return toObjectErr(errFileNotFound, bucket, object) 2300 } 2301 // if object already transitioned, return 2302 if fi.TransitionStatus == lifecycle.TransitionComplete { 2303 return nil 2304 } 2305 2306 if fi.XLV1 { 2307 if _, err = er.HealObject(ctx, bucket, object, "", madmin.HealOpts{NoLock: true}); err != nil { 2308 return err 2309 } 2310 // Fetch FileInfo again. HealObject migrates object the latest 2311 // format. Among other things this changes fi.DataDir and 2312 // possibly fi.Data (if data is inlined). 2313 fi, metaArr, onlineDisks, err = er.getObjectFileInfo(ctx, bucket, object, opts, true) 2314 if err != nil { 2315 return toObjectErr(err, bucket, object) 2316 } 2317 } 2318 traceFn := globalLifecycleSys.trace(fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended)) 2319 2320 destObj, err := genTransitionObjName(bucket) 2321 if err != nil { 2322 return err 2323 } 2324 2325 pr, pw := xioutil.WaitPipe() 2326 go func() { 2327 err := er.getObjectWithFileInfo(ctx, bucket, object, 0, fi.Size, pw, fi, metaArr, onlineDisks) 2328 pw.CloseWithError(err) 2329 }() 2330 2331 var rv remoteVersionID 2332 rv, err = tgtClient.Put(ctx, destObj, pr, fi.Size) 2333 pr.CloseWithError(err) 2334 if err != nil { 2335 return err 2336 } 2337 fi.TransitionStatus = lifecycle.TransitionComplete 2338 fi.TransitionedObjName = destObj 2339 fi.TransitionTier = opts.Transition.Tier 2340 fi.TransitionVersionID = string(rv) 2341 eventName := event.ObjectTransitionComplete 2342 2343 storageDisks := er.getDisks() 2344 2345 if err = er.deleteObjectVersion(ctx, bucket, object, fi, false); err != nil { 2346 eventName = event.ObjectTransitionFailed 2347 } 2348 2349 for _, disk := range storageDisks { 2350 if disk != nil && disk.IsOnline() { 2351 continue 2352 } 2353 er.addPartial(bucket, object, opts.VersionID) 2354 break 2355 } 2356 2357 objInfo := fi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended) 2358 sendEvent(eventArgs{ 2359 EventName: eventName, 2360 BucketName: bucket, 2361 Object: objInfo, 2362 UserAgent: "Internal: [ILM-Transition]", 2363 Host: globalLocalNodeName, 2364 }) 2365 tags := opts.LifecycleAuditEvent.Tags() 2366 auditLogLifecycle(ctx, objInfo, ILMTransition, tags, traceFn) 2367 return err 2368 } 2369 2370 // RestoreTransitionedObject - restore transitioned object content locally on this cluster. 2371 // This is similar to PostObjectRestore from AWS GLACIER 2372 // storage class. When PostObjectRestore API is called, a temporary copy of the object 2373 // is restored locally to the bucket on source cluster until the restore expiry date. 2374 // The copy that was transitioned continues to reside in the transitioned tier. 2375 func (er erasureObjects) RestoreTransitionedObject(ctx context.Context, bucket, object string, opts ObjectOptions) error { 2376 return er.restoreTransitionedObject(ctx, bucket, object, opts) 2377 } 2378 2379 // update restore status header in the metadata 2380 func (er erasureObjects) updateRestoreMetadata(ctx context.Context, bucket, object string, objInfo ObjectInfo, opts ObjectOptions) error { 2381 oi := objInfo.Clone() 2382 oi.metadataOnly = true // Perform only metadata updates. 2383 2384 // allow retry in the case of failure to restore 2385 delete(oi.UserDefined, xhttp.AmzRestore) 2386 2387 if _, err := er.CopyObject(ctx, bucket, object, bucket, object, oi, ObjectOptions{ 2388 VersionID: oi.VersionID, 2389 }, ObjectOptions{ 2390 VersionID: oi.VersionID, 2391 }); err != nil { 2392 logger.LogIf(ctx, fmt.Errorf("Unable to update transition restore metadata for %s/%s(%s): %s", bucket, object, oi.VersionID, err)) 2393 return err 2394 } 2395 return nil 2396 } 2397 2398 // restoreTransitionedObject for multipart object chunks the file stream from remote tier into the same number of parts 2399 // as in the xl.meta for this version and rehydrates the part.n into the fi.DataDir for this version as in the xl.meta 2400 func (er erasureObjects) restoreTransitionedObject(ctx context.Context, bucket string, object string, opts ObjectOptions) error { 2401 setRestoreHeaderFn := func(oi ObjectInfo, rerr error) error { 2402 if rerr == nil { 2403 return nil // nothing to do; restore object was successful 2404 } 2405 er.updateRestoreMetadata(ctx, bucket, object, oi, opts) 2406 return rerr 2407 } 2408 var oi ObjectInfo 2409 // get the file info on disk for transitioned object 2410 actualfi, _, _, err := er.getObjectFileInfo(ctx, bucket, object, opts, false) 2411 if err != nil { 2412 return setRestoreHeaderFn(oi, toObjectErr(err, bucket, object)) 2413 } 2414 2415 oi = actualfi.ToObjectInfo(bucket, object, opts.Versioned || opts.VersionSuspended) 2416 ropts := putRestoreOpts(bucket, object, opts.Transition.RestoreRequest, oi) 2417 if len(oi.Parts) == 1 { 2418 var rs *HTTPRangeSpec 2419 gr, err := getTransitionedObjectReader(ctx, bucket, object, rs, http.Header{}, oi, opts) 2420 if err != nil { 2421 return setRestoreHeaderFn(oi, toObjectErr(err, bucket, object)) 2422 } 2423 defer gr.Close() 2424 hashReader, err := hash.NewReader(ctx, gr, gr.ObjInfo.Size, "", "", gr.ObjInfo.Size) 2425 if err != nil { 2426 return setRestoreHeaderFn(oi, toObjectErr(err, bucket, object)) 2427 } 2428 pReader := NewPutObjReader(hashReader) 2429 _, err = er.PutObject(ctx, bucket, object, pReader, ropts) 2430 return setRestoreHeaderFn(oi, toObjectErr(err, bucket, object)) 2431 } 2432 2433 res, err := er.NewMultipartUpload(ctx, bucket, object, ropts) 2434 if err != nil { 2435 return setRestoreHeaderFn(oi, err) 2436 } 2437 2438 var uploadedParts []CompletePart 2439 var rs *HTTPRangeSpec 2440 // get reader from the warm backend - note that even in the case of encrypted objects, this stream is still encrypted. 2441 gr, err := getTransitionedObjectReader(ctx, bucket, object, rs, http.Header{}, oi, opts) 2442 if err != nil { 2443 return setRestoreHeaderFn(oi, err) 2444 } 2445 defer gr.Close() 2446 2447 // rehydrate the parts back on disk as per the original xl.meta prior to transition 2448 for _, partInfo := range oi.Parts { 2449 hr, err := hash.NewReader(ctx, io.LimitReader(gr, partInfo.Size), partInfo.Size, "", "", partInfo.Size) 2450 if err != nil { 2451 return setRestoreHeaderFn(oi, err) 2452 } 2453 pInfo, err := er.PutObjectPart(ctx, bucket, object, res.UploadID, partInfo.Number, NewPutObjReader(hr), ObjectOptions{}) 2454 if err != nil { 2455 return setRestoreHeaderFn(oi, err) 2456 } 2457 if pInfo.Size != partInfo.Size { 2458 return setRestoreHeaderFn(oi, InvalidObjectState{Bucket: bucket, Object: object}) 2459 } 2460 uploadedParts = append(uploadedParts, CompletePart{ 2461 PartNumber: pInfo.PartNumber, 2462 ETag: pInfo.ETag, 2463 }) 2464 } 2465 _, err = er.CompleteMultipartUpload(ctx, bucket, object, res.UploadID, uploadedParts, ObjectOptions{ 2466 MTime: oi.ModTime, 2467 }) 2468 return setRestoreHeaderFn(oi, err) 2469 } 2470 2471 // DecomTieredObject - moves tiered object to another pool during decommissioning. 2472 func (er erasureObjects) DecomTieredObject(ctx context.Context, bucket, object string, fi FileInfo, opts ObjectOptions) error { 2473 if opts.UserDefined == nil { 2474 opts.UserDefined = make(map[string]string) 2475 } 2476 // overlay Erasure info for this set of disks 2477 storageDisks := er.getDisks() 2478 // Get parity and data drive count based on storage class metadata 2479 parityDrives := globalStorageClass.GetParityForSC(opts.UserDefined[xhttp.AmzStorageClass]) 2480 if parityDrives < 0 { 2481 parityDrives = er.defaultParityCount 2482 } 2483 dataDrives := len(storageDisks) - parityDrives 2484 2485 // we now know the number of blocks this object needs for data and parity. 2486 // writeQuorum is dataBlocks + 1 2487 writeQuorum := dataDrives 2488 if dataDrives == parityDrives { 2489 writeQuorum++ 2490 } 2491 2492 // Initialize parts metadata 2493 partsMetadata := make([]FileInfo, len(storageDisks)) 2494 2495 fi2 := newFileInfo(pathJoin(bucket, object), dataDrives, parityDrives) 2496 fi.Erasure = fi2.Erasure 2497 // Initialize erasure metadata. 2498 for index := range partsMetadata { 2499 partsMetadata[index] = fi 2500 partsMetadata[index].Erasure.Index = index + 1 2501 } 2502 2503 // Order disks according to erasure distribution 2504 var onlineDisks []StorageAPI 2505 onlineDisks, partsMetadata = shuffleDisksAndPartsMetadata(storageDisks, partsMetadata, fi) 2506 2507 if _, err := writeUniqueFileInfo(ctx, onlineDisks, "", bucket, object, partsMetadata, writeQuorum); err != nil { 2508 return toObjectErr(err, bucket, object) 2509 } 2510 2511 return nil 2512 }