github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure-metadata.go (about) 1 // Copyright (c) 2015-2021 MinIO, Inc. 2 // 3 // This file is part of MinIO Object Storage stack 4 // 5 // This program is free software: you can redistribute it and/or modify 6 // it under the terms of the GNU Affero General Public License as published by 7 // the Free Software Foundation, either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU Affero General Public License for more details. 14 // 15 // You should have received a copy of the GNU Affero General Public License 16 // along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 package cmd 19 20 import ( 21 "context" 22 "encoding/hex" 23 "fmt" 24 "sort" 25 "strings" 26 "time" 27 28 "github.com/minio/minio/internal/amztime" 29 "github.com/minio/minio/internal/bucket/replication" 30 "github.com/minio/minio/internal/crypto" 31 "github.com/minio/minio/internal/hash/sha256" 32 xhttp "github.com/minio/minio/internal/http" 33 "github.com/minio/minio/internal/logger" 34 "github.com/minio/pkg/v2/sync/errgroup" 35 ) 36 37 // Object was stored with additional erasure codes due to degraded system at upload time 38 const minIOErasureUpgraded = "x-minio-internal-erasure-upgraded" 39 40 const erasureAlgorithm = "rs-vandermonde" 41 42 // GetChecksumInfo - get checksum of a part. 43 func (e ErasureInfo) GetChecksumInfo(partNumber int) (ckSum ChecksumInfo) { 44 for _, sum := range e.Checksums { 45 if sum.PartNumber == partNumber { 46 // Return the checksum 47 return sum 48 } 49 } 50 return ChecksumInfo{Algorithm: DefaultBitrotAlgorithm} 51 } 52 53 // ShardFileSize - returns final erasure size from original size. 
54 func (e ErasureInfo) ShardFileSize(totalLength int64) int64 { 55 if totalLength == 0 { 56 return 0 57 } 58 if totalLength == -1 { 59 return -1 60 } 61 numShards := totalLength / e.BlockSize 62 lastBlockSize := totalLength % e.BlockSize 63 lastShardSize := ceilFrac(lastBlockSize, int64(e.DataBlocks)) 64 return numShards*e.ShardSize() + lastShardSize 65 } 66 67 // ShardSize - returns actual shared size from erasure blockSize. 68 func (e ErasureInfo) ShardSize() int64 { 69 return ceilFrac(e.BlockSize, int64(e.DataBlocks)) 70 } 71 72 // IsValid - tells if erasure info fields are valid. 73 func (fi FileInfo) IsValid() bool { 74 if fi.Deleted { 75 // Delete marker has no data, no need to check 76 // for erasure coding information 77 return true 78 } 79 dataBlocks := fi.Erasure.DataBlocks 80 parityBlocks := fi.Erasure.ParityBlocks 81 correctIndexes := (fi.Erasure.Index > 0 && 82 fi.Erasure.Index <= dataBlocks+parityBlocks && 83 len(fi.Erasure.Distribution) == (dataBlocks+parityBlocks)) 84 return ((dataBlocks >= parityBlocks) && 85 (dataBlocks > 0) && (parityBlocks >= 0) && 86 correctIndexes) 87 } 88 89 // ToObjectInfo - Converts metadata to object info. 
func (fi FileInfo) ToObjectInfo(bucket, object string, versioned bool) ObjectInfo {
	object = decodeDirObject(object)
	versionID := fi.VersionID
	// In a versioned bucket an empty version id represents the "null" version.
	if versioned && versionID == "" {
		versionID = nullVersionID
	}

	objInfo := ObjectInfo{
		IsDir:            HasSuffix(object, SlashSeparator),
		Bucket:           bucket,
		Name:             object,
		ParityBlocks:     fi.Erasure.ParityBlocks,
		DataBlocks:       fi.Erasure.DataBlocks,
		VersionID:        versionID,
		IsLatest:         fi.IsLatest,
		DeleteMarker:     fi.Deleted,
		Size:             fi.Size,
		ModTime:          fi.ModTime,
		Legacy:           fi.XLV1,
		ContentType:      fi.Metadata["content-type"],
		ContentEncoding:  fi.Metadata["content-encoding"],
		NumVersions:      fi.NumVersions,
		SuccessorModTime: fi.SuccessorModTime,
		CacheControl:     fi.Metadata["cache-control"],
	}

	// Parse the stored Expires header; unparsable values are ignored.
	if exp, ok := fi.Metadata["expires"]; ok {
		if t, err := amztime.ParseHeader(exp); err == nil {
			objInfo.Expires = t.UTC()
		}
	}

	// Extract etag from metadata.
	objInfo.ETag = extractETag(fi.Metadata)

	// Add user tags to the object info
	tags := fi.Metadata[xhttp.AmzObjectTagging]
	if len(tags) != 0 {
		objInfo.UserTags = tags
	}

	// Add replication status to the object info
	objInfo.ReplicationStatusInternal = fi.ReplicationState.ReplicationStatusInternal
	objInfo.VersionPurgeStatusInternal = fi.ReplicationState.VersionPurgeStatusInternal
	objInfo.ReplicationStatus = fi.ReplicationStatus()
	if objInfo.ReplicationStatus.Empty() { // overlay x-amx-replication-status if present for replicas
		if st, ok := fi.Metadata[xhttp.AmzBucketReplicationStatus]; ok && st == string(replication.Replica) {
			objInfo.ReplicationStatus = replication.StatusType(st)
		}
	}
	objInfo.VersionPurgeStatus = fi.VersionPurgeStatus()

	// Carry over ILM transition bookkeeping for tiered objects.
	objInfo.TransitionedObject = TransitionedObject{
		Name:        fi.TransitionedObjName,
		VersionID:   fi.TransitionVersionID,
		Status:      fi.TransitionStatus,
		FreeVersion: fi.TierFreeVersion(),
		Tier:        fi.TransitionTier,
	}

	// etag/md5Sum has already been extracted. We need to
	// remove to avoid it from appearing as part of
	// response headers. e.g, X-Minio-* or X-Amz-*.
	// Tags have also been extracted, we remove that as well.
	objInfo.UserDefined = cleanMetadata(fi.Metadata)

	// All the parts per object.
	objInfo.Parts = fi.Parts

	// Update storage class: a transitioned tier takes precedence, then
	// the stored storage-class header, then the server default.
	if fi.TransitionTier != "" {
		objInfo.StorageClass = fi.TransitionTier
	} else if sc, ok := fi.Metadata[xhttp.AmzStorageClass]; ok {
		objInfo.StorageClass = sc
	} else {
		objInfo.StorageClass = globalMinioDefaultStorageClass
	}

	// set restore status for transitioned object
	restoreHdr, ok := fi.Metadata[xhttp.AmzRestore]
	if ok {
		if restoreStatus, err := parseRestoreObjStatus(restoreHdr); err == nil {
			objInfo.RestoreOngoing = restoreStatus.Ongoing()
			objInfo.RestoreExpires, _ = restoreStatus.Expiry()
		}
	}
	objInfo.Checksum = fi.Checksum
	objInfo.Inlined = fi.InlineData()
	// Success.
	return objInfo
}

// TransitionInfoEquals returns true if transition related information are equal, false otherwise.
func (fi FileInfo) TransitionInfoEquals(ofi FileInfo) bool {
	switch {
	case fi.TransitionStatus != ofi.TransitionStatus,
		fi.TransitionTier != ofi.TransitionTier,
		fi.TransitionedObjName != ofi.TransitionedObjName,
		fi.TransitionVersionID != ofi.TransitionVersionID:
		return false
	}
	return true
}

// MetadataEquals returns true if FileInfos Metadata maps are equal, false otherwise.
195 func (fi FileInfo) MetadataEquals(ofi FileInfo) bool { 196 if len(fi.Metadata) != len(ofi.Metadata) { 197 return false 198 } 199 for k, v := range fi.Metadata { 200 if ov, ok := ofi.Metadata[k]; !ok || ov != v { 201 return false 202 } 203 } 204 return true 205 } 206 207 // ReplicationInfoEquals returns true if server-side replication related fields are equal, false otherwise. 208 func (fi FileInfo) ReplicationInfoEquals(ofi FileInfo) bool { 209 switch { 210 case fi.MarkDeleted != ofi.MarkDeleted, 211 !fi.ReplicationState.Equal(ofi.ReplicationState): 212 return false 213 } 214 return true 215 } 216 217 // objectPartIndex - returns the index of matching object part number. 218 func objectPartIndex(parts []ObjectPartInfo, partNumber int) int { 219 for i, part := range parts { 220 if partNumber == part.Number { 221 return i 222 } 223 } 224 return -1 225 } 226 227 // AddObjectPart - add a new object part in order. 228 func (fi *FileInfo) AddObjectPart(partNumber int, partETag string, partSize, actualSize int64, modTime time.Time, idx []byte, checksums map[string]string) { 229 partInfo := ObjectPartInfo{ 230 Number: partNumber, 231 ETag: partETag, 232 Size: partSize, 233 ActualSize: actualSize, 234 ModTime: modTime, 235 Index: idx, 236 Checksums: checksums, 237 } 238 239 // Update part info if it already exists. 240 for i, part := range fi.Parts { 241 if partNumber == part.Number { 242 fi.Parts[i] = partInfo 243 return 244 } 245 } 246 247 // Proceed to include new part info. 248 fi.Parts = append(fi.Parts, partInfo) 249 250 // Parts in FileInfo should be in sorted order by part number. 251 sort.Slice(fi.Parts, func(i, j int) bool { return fi.Parts[i].Number < fi.Parts[j].Number }) 252 } 253 254 // ObjectToPartOffset - translate offset of an object to offset of its individual part. 
func (fi FileInfo) ObjectToPartOffset(ctx context.Context, offset int64) (partIndex int, partOffset int64, err error) {
	if offset == 0 {
		// Special case - if offset is 0, then partIndex and partOffset are always 0.
		return 0, 0, nil
	}
	partOffset = offset
	// Seek until object offset maps to a particular part offset.
	for i, part := range fi.Parts {
		partIndex = i
		// Offset is smaller than size we have reached the proper part offset.
		if partOffset < part.Size {
			return partIndex, partOffset, nil
		}
		// Continue to towards the next part.
		partOffset -= part.Size
	}
	logger.LogIf(ctx, InvalidRange{})
	// Offset beyond the size of the object return InvalidRange.
	return 0, 0, InvalidRange{}
}

// findFileInfoInQuorum returns the FileInfo agreed upon by at least `quorum`
// entries of metaArr. Agreement is determined by hashing, per entry, the
// fields that must match across drives (legacy flag, data dir, part numbers,
// erasure layout, transition state, encryption and compression markers) and
// selecting the most frequent hash.
func findFileInfoInQuorum(ctx context.Context, metaArr []FileInfo, modTime time.Time, etag string, quorum int) (FileInfo, error) {
	// with less quorum return error.
	if quorum < 1 {
		return FileInfo{}, errErasureReadQuorum
	}
	metaHashes := make([]string, len(metaArr))
	h := sha256.New()
	for i, meta := range metaArr {
		if !meta.IsValid() {
			continue
		}
		// Match on etag alone only when no explicit mod-time was requested
		// (modTime is the sentinel) and the caller supplied an etag.
		etagOnly := modTime.Equal(timeSentinel) && (etag != "" && etag == meta.Metadata["etag"])
		mtimeValid := meta.ModTime.Equal(modTime)
		if mtimeValid || etagOnly {
			fmt.Fprintf(h, "%v", meta.XLV1)
			if !etagOnly {
				// Verify dataDir is same only when mtime is valid and etag is not considered.
				fmt.Fprintf(h, "%v", meta.GetDataDir())
			}
			for _, part := range meta.Parts {
				fmt.Fprintf(h, "part.%d", part.Number)
			}

			// Erasure layout participates in the hash only when there is data.
			if !meta.Deleted && meta.Size != 0 {
				fmt.Fprintf(h, "%v+%v", meta.Erasure.DataBlocks, meta.Erasure.ParityBlocks)
				fmt.Fprintf(h, "%v", meta.Erasure.Distribution)
			}

			if meta.IsRemote() {
				// ILM transition fields
				fmt.Fprint(h, meta.TransitionStatus)
				fmt.Fprint(h, meta.TransitionTier)
				fmt.Fprint(h, meta.TransitionedObjName)
				fmt.Fprint(h, meta.TransitionVersionID)
			}

			// If metadata says encrypted, ask for it in quorum.
			if etyp, ok := crypto.IsEncrypted(meta.Metadata); ok {
				fmt.Fprint(h, etyp)
			}

			// If compressed, look for compressed FileInfo only
			if meta.IsCompressed() {
				fmt.Fprint(h, meta.Metadata[ReservedMetadataPrefix+"compression"])
			}

			metaHashes[i] = hex.EncodeToString(h.Sum(nil))
			h.Reset()
		}
	}

	// Count occurrences of each non-empty hash.
	metaHashCountMap := make(map[string]int)
	for _, hash := range metaHashes {
		if hash == "" {
			continue
		}
		metaHashCountMap[hash]++
	}

	// Pick the most frequent hash.
	maxHash := ""
	maxCount := 0
	for hash, count := range metaHashCountMap {
		if count > maxCount {
			maxCount = count
			maxHash = hash
		}
	}

	if maxCount < quorum {
		return FileInfo{}, errErasureReadQuorum
	}

	// Find the successor mod time in quorum, otherwise leave the
	// candidate's successor modTime as found
	succModTimeMap := make(map[time.Time]int)
	var candidate FileInfo
	var found bool
	for i, hash := range metaHashes {
		if hash == maxHash {
			if metaArr[i].IsValid() {
				// First valid entry with the winning hash is the candidate.
				if !found {
					candidate = metaArr[i]
					found = true
				}
				succModTimeMap[metaArr[i].SuccessorModTime]++
			}
		}
	}
	var succModTime time.Time
	var smodTimeQuorum bool
	for smodTime, count := range succModTimeMap {
		if count >= quorum {
			smodTimeQuorum = true
			succModTime = smodTime
			break
		}
	}

	if found {
		if smodTimeQuorum {
			candidate.SuccessorModTime = succModTime
			candidate.IsLatest = succModTime.IsZero()
		}
		return candidate, nil
	}
	return FileInfo{}, errErasureReadQuorum
}

// pickValidFileInfo - picks one valid FileInfo content and returns from a
// slice of FileInfo.
func pickValidFileInfo(ctx context.Context, metaArr []FileInfo, modTime time.Time, etag string, quorum int) (FileInfo, error) {
	return findFileInfoInQuorum(ctx, metaArr, modTime, etag, quorum)
}

// writeUniqueFileInfo - writes unique `xl.meta` content for each disk concurrently.
func writeUniqueFileInfo(ctx context.Context, disks []StorageAPI, origbucket, bucket, prefix string, files []FileInfo, quorum int) ([]StorageAPI, error) {
	g := errgroup.WithNErrs(len(disks))

	// Start writing `xl.meta` to all disks in parallel.
	for index := range disks {
		index := index // capture per-iteration value for the closure
		g.Go(func() error {
			if disks[index] == nil {
				return errDiskNotFound
			}
			// Pick one FileInfo for a disk at index.
			fi := files[index]
			fi.Erasure.Index = index + 1
			if fi.IsValid() {
				return disks[index].WriteMetadata(ctx, origbucket, bucket, prefix, fi)
			}
			return errCorruptedFormat
		}, index)
	}

	// Wait for all the routines.
	mErrs := g.Wait()

	err := reduceWriteQuorumErrs(ctx, mErrs, objectOpIgnoredErrs, quorum)
	return evalDisks(disks, mErrs), err
}

// commonParity returns the parity count that has read quorum among the
// given per-drive parities, or -1 when no parity value is seen on enough
// drives.
func commonParity(parities []int, defaultParityCount int) int {
	N := len(parities)

	occMap := make(map[int]int)
	for _, p := range parities {
		occMap[p]++
	}

	var maxOcc, cparity int
	for parity, occ := range occMap {
		if parity == -1 {
			// Ignore non defined parity
			continue
		}

		readQuorum := N - parity
		if defaultParityCount > 0 && parity == 0 {
			// In this case, parity == 0 implies that this object version is a
			// delete marker
			readQuorum = N/2 + 1
		}
		if occ < readQuorum {
			// Ignore this parity since we don't have enough shards for read quorum
			continue
		}

		if occ > maxOcc {
			maxOcc = occ
			cparity = parity
		}
	}

	if maxOcc == 0 {
		// Did not found anything useful
		return -1
	}
	return cparity
}

// listObjectParities returns the parity count recorded in each drive's
// metadata; entries that errored or are invalid yield -1.
func listObjectParities(partsMetadata []FileInfo, errs []error) (parities []int) {
	parities = make([]int, len(partsMetadata))
	for index, metadata := range partsMetadata {
		if errs[index] != nil {
			parities[index] = -1
			continue
		}
		if !metadata.IsValid() {
			parities[index] = -1
			continue
		}
		// Delete marker or zero byte objects take highest parity.
		if metadata.Deleted || metadata.Size == 0 {
			parities[index] = len(partsMetadata) / 2
		} else {
			parities[index] = metadata.Erasure.ParityBlocks
		}
	}
	return
}

// Returns per object readQuorum and writeQuorum
// readQuorum is the min required disks to read data.
// writeQuorum is the min required disks to write data.
481 func objectQuorumFromMeta(ctx context.Context, partsMetaData []FileInfo, errs []error, defaultParityCount int) (objectReadQuorum, objectWriteQuorum int, err error) { 482 // There should be at least half correct entries, if not return failure 483 expectedRQuorum := len(partsMetaData) / 2 484 if defaultParityCount == 0 { 485 // if parity count is '0', we expected all entries to be present. 486 expectedRQuorum = len(partsMetaData) 487 } 488 489 reducedErr := reduceReadQuorumErrs(ctx, errs, objectOpIgnoredErrs, expectedRQuorum) 490 if reducedErr != nil { 491 return -1, -1, reducedErr 492 } 493 494 // special case when parity is '0' 495 if defaultParityCount == 0 { 496 return len(partsMetaData), len(partsMetaData), nil 497 } 498 499 parities := listObjectParities(partsMetaData, errs) 500 parityBlocks := commonParity(parities, defaultParityCount) 501 if parityBlocks < 0 { 502 return -1, -1, errErasureReadQuorum 503 } 504 505 dataBlocks := len(partsMetaData) - parityBlocks 506 507 writeQuorum := dataBlocks 508 if dataBlocks == parityBlocks { 509 writeQuorum++ 510 } 511 512 // Since all the valid erasure code meta updated at the same time are equivalent, pass dataBlocks 513 // from latestFileInfo to get the quorum 514 return dataBlocks, writeQuorum, nil 515 } 516 517 const ( 518 tierFVID = "tier-free-versionID" 519 tierFVMarker = "tier-free-marker" 520 tierSkipFVID = "tier-skip-fvid" 521 ) 522 523 // SetTierFreeVersionID sets free-version's versionID. This method is used by 524 // object layer to pass down a versionID to set for a free-version that may be 525 // created. 526 func (fi *FileInfo) SetTierFreeVersionID(versionID string) { 527 if fi.Metadata == nil { 528 fi.Metadata = make(map[string]string) 529 } 530 fi.Metadata[ReservedMetadataPrefixLower+tierFVID] = versionID 531 } 532 533 // TierFreeVersionID returns the free-version's version id. 
534 func (fi *FileInfo) TierFreeVersionID() string { 535 return fi.Metadata[ReservedMetadataPrefixLower+tierFVID] 536 } 537 538 // SetTierFreeVersion sets fi as a free-version. This method is used by 539 // lower layers to indicate a free-version. 540 func (fi *FileInfo) SetTierFreeVersion() { 541 if fi.Metadata == nil { 542 fi.Metadata = make(map[string]string) 543 } 544 fi.Metadata[ReservedMetadataPrefixLower+tierFVMarker] = "" 545 } 546 547 // SetSkipTierFreeVersion indicates to skip adding a tier free version id. 548 // Note: Used only when expiring tiered objects and the remote content has 549 // already been scheduled for deletion 550 func (fi *FileInfo) SetSkipTierFreeVersion() { 551 if fi.Metadata == nil { 552 fi.Metadata = make(map[string]string) 553 } 554 fi.Metadata[ReservedMetadataPrefixLower+tierSkipFVID] = "" 555 } 556 557 // SkipTierFreeVersion returns true if set, false otherwise. 558 // See SetSkipTierVersion for its purpose. 559 func (fi *FileInfo) SkipTierFreeVersion() bool { 560 _, ok := fi.Metadata[ReservedMetadataPrefixLower+tierSkipFVID] 561 return ok 562 } 563 564 // TierFreeVersion returns true if version is a free-version. 565 func (fi *FileInfo) TierFreeVersion() bool { 566 _, ok := fi.Metadata[ReservedMetadataPrefixLower+tierFVMarker] 567 return ok 568 } 569 570 // IsRestoreObjReq returns true if fi corresponds to a RestoreObject request. 
func (fi *FileInfo) IsRestoreObjReq() bool {
	// A completed (non-ongoing) x-amz-restore status marks this version
	// as the result of a RestoreObject request.
	if restoreHdr, ok := fi.Metadata[xhttp.AmzRestore]; ok {
		if restoreStatus, err := parseRestoreObjStatus(restoreHdr); err == nil {
			if !restoreStatus.Ongoing() {
				return true
			}
		}
	}
	return false
}

// VersionPurgeStatus returns overall version purge status for this object version across targets
func (fi *FileInfo) VersionPurgeStatus() VersionPurgeStatusType {
	return fi.ReplicationState.CompositeVersionPurgeStatus()
}

// ReplicationStatus returns overall version replication status for this object version across targets
func (fi *FileInfo) ReplicationStatus() replication.StatusType {
	return fi.ReplicationState.CompositeReplicationStatus()
}

// DeleteMarkerReplicationStatus returns overall replication status for this delete marker version across targets
func (fi *FileInfo) DeleteMarkerReplicationStatus() replication.StatusType {
	if fi.Deleted {
		return fi.ReplicationState.CompositeReplicationStatus()
	}
	// Not a delete marker: report an empty status.
	return replication.StatusType("")
}

// GetInternalReplicationState is a wrapper method to fetch internal replication state from the map m
func GetInternalReplicationState(m map[string][]byte) ReplicationState {
	// Convert byte-slice values to strings before delegating.
	m1 := make(map[string]string, len(m))
	for k, v := range m {
		m1[k] = string(v)
	}
	return getInternalReplicationState(m1)
}

// getInternalReplicationState fetches internal replication state from the map m
func getInternalReplicationState(m map[string]string) ReplicationState {
	d := ReplicationState{}
	for k, v := range m {
		switch {
		case equals(k, ReservedMetadataPrefixLower+ReplicationTimestamp):
			// NOTE(review): this case and the ReplicaTimestamp case below
			// both assign d.ReplicaTimeStamp — confirm whether the
			// replication timestamp is meant to land in a distinct field.
			d.ReplicaTimeStamp, _ = amztime.ParseReplicationTS(v)
		case equals(k, ReservedMetadataPrefixLower+ReplicaTimestamp):
			d.ReplicaTimeStamp, _ = amztime.ParseReplicationTS(v)
		case equals(k, ReservedMetadataPrefixLower+ReplicaStatus):
			d.ReplicaStatus = replication.StatusType(v)
		case equals(k, ReservedMetadataPrefixLower+ReplicationStatus):
			d.ReplicationStatusInternal = v
			d.Targets = replicationStatusesMap(v)
		case equals(k, VersionPurgeStatusKey):
			d.VersionPurgeStatusInternal = v
			d.PurgeTargets = versionPurgeStatusesMap(v)
		case strings.HasPrefix(k, ReservedMetadataPrefixLower+ReplicationReset):
			// Per-target reset markers are keyed by the target ARN suffix.
			arn := strings.TrimPrefix(k, fmt.Sprintf("%s-", ReservedMetadataPrefixLower+ReplicationReset))
			if d.ResetStatusesMap == nil {
				d.ResetStatusesMap = make(map[string]string, 1)
			}
			d.ResetStatusesMap[arn] = v
		}
	}
	return d
}