storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/cmd/erasure-multipart.go

/*
 * MinIO Cloud Storage, (C) 2016-2020 MinIO, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cmd

import (
	"context"
	"fmt"
	"io"
	"os"
	"path"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/minio/minio-go/v7/pkg/set"

	xhttp "storj.io/minio/cmd/http"
	"storj.io/minio/cmd/logger"
	"storj.io/minio/pkg/mimedb"
	"storj.io/minio/pkg/sync/errgroup"
)

func (er erasureObjects) getUploadIDDir(bucket, object, uploadID string) string {
	return pathJoin(er.getMultipartSHADir(bucket, object), uploadID)
}

func (er erasureObjects) getMultipartSHADir(bucket, object string) string {
	return getSHA256Hash([]byte(pathJoin(bucket, object)))
}
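
// A sketch of the resulting on-disk layout (bucket, object, and uploadID
// values hypothetical, not part of this file): for bucket "photos" and
// object "2021/trip.jpg", every upload lands under
//
//	.minio.sys/multipart/SHA256("photos/2021/trip.jpg")/<uploadID>/
//
// so all pending uploads for one object share a single SHA directory.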

// checkUploadIDExists - verify if a given uploadID exists and is valid.
func (er erasureObjects) checkUploadIDExists(ctx context.Context, bucket, object, uploadID string) (err error) {
	defer func() {
		if err == errFileNotFound {
			err = errUploadIDNotFound
		}
	}()

	disks := er.getDisks()

	// Read metadata associated with the object from all disks.
	metaArr, errs := readAllFileInfo(ctx, disks, minioMetaMultipartBucket, er.getUploadIDDir(bucket, object, uploadID), "", false)

	readQuorum, _, err := objectQuorumFromMeta(ctx, metaArr, errs, er.defaultParityCount)
	if err != nil {
		return err
	}

	if reducedErr := reduceReadQuorumErrs(ctx, errs, objectOpIgnoredErrs, readQuorum); reducedErr != nil {
		return reducedErr
	}

	// List all online disks.
	_, modTime, dataDir := listOnlineDisks(disks, metaArr, errs)

	// Pick latest valid metadata.
	_, err = pickValidFileInfo(ctx, metaArr, modTime, dataDir, readQuorum)
	return err
}

// removeObjectPart removes the part with the given partNumber, belonging to a
// multipart upload, from minioMetaMultipartBucket.
func (er erasureObjects) removeObjectPart(bucket, object, uploadID, dataDir string, partNumber int) {
	uploadIDPath := er.getUploadIDDir(bucket, object, uploadID)
	curpartPath := pathJoin(uploadIDPath, dataDir, fmt.Sprintf("part.%d", partNumber))
	storageDisks := er.getDisks()

	g := errgroup.WithNErrs(len(storageDisks))
	for index, disk := range storageDisks {
		if disk == nil {
			continue
		}
		index := index
		g.Go(func() error {
			// Ignoring failure to remove parts that weren't present in CompleteMultipartUpload
			// requests. xl.meta is the authoritative source of truth on which parts constitute
			// the object. The presence of parts that don't belong in the object doesn't affect correctness.
			_ = storageDisks[index].Delete(context.TODO(), minioMetaMultipartBucket, curpartPath, false)
			return nil
		}, index)
	}
	g.Wait()
}

// cleanupStaleUploads removes old multipart uploads. Should be run in a goroutine.
func (er erasureObjects) cleanupStaleUploads(ctx context.Context, expiry time.Duration) {
	// Run multiple cleanups local to this server.
	var wg sync.WaitGroup
	for _, disk := range er.getLoadBalancedLocalDisks() {
		if disk != nil {
			wg.Add(1)
			go func(disk StorageAPI) {
				defer wg.Done()
				er.cleanupStaleUploadsOnDisk(ctx, disk, expiry)
			}(disk)
		}
	}
	wg.Wait()
}

func (er erasureObjects) renameAll(ctx context.Context, bucket, prefix string) {
	var wg sync.WaitGroup
	for _, disk := range er.getDisks() {
		if disk == nil {
			continue
		}
		wg.Add(1)
		go func(disk StorageAPI) {
			defer wg.Done()
			disk.RenameFile(ctx, bucket, prefix, minioMetaTmpBucket, mustGetUUID())
		}(disk)
	}
	wg.Wait()
}

func (er erasureObjects) deleteAll(ctx context.Context, bucket, prefix string) {
	var wg sync.WaitGroup
	for _, disk := range er.getDisks() {
		if disk == nil {
			continue
		}
		wg.Add(1)
		go func(disk StorageAPI) {
			defer wg.Done()
			disk.Delete(ctx, bucket, prefix, true)
		}(disk)
	}
	wg.Wait()
}

// Remove the old multipart uploads on the given disk.
func (er erasureObjects) cleanupStaleUploadsOnDisk(ctx context.Context, disk StorageAPI, expiry time.Duration) {
	now := time.Now()
	diskPath := disk.Endpoint().Path
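
	// A sketch of the layout the two walks below visit, assuming
	// minioMetaMultipartBucket and minioMetaTmpBucket resolve to
	// ".minio.sys/multipart" and ".minio.sys/tmp" (uploadID hypothetical):
	//
	//	<diskPath>/.minio.sys/multipart/<shaDir>/<uploadID>/  -> renamed away when stale
	//	<diskPath>/.minio.sys/tmp/<tmpDir>/                   -> deleted when stale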

	readDirFn(pathJoin(diskPath, minioMetaMultipartBucket), func(shaDir string, typ os.FileMode) error {
		return readDirFn(pathJoin(diskPath, minioMetaMultipartBucket, shaDir), func(uploadIDDir string, typ os.FileMode) error {
			uploadIDPath := pathJoin(shaDir, uploadIDDir)
			fi, err := disk.ReadVersion(ctx, minioMetaMultipartBucket, uploadIDPath, "", false)
			if err != nil {
				return nil
			}
			wait := er.deletedCleanupSleeper.Timer(ctx)
			if now.Sub(fi.ModTime) > expiry {
				er.renameAll(ctx, minioMetaMultipartBucket, uploadIDPath)
			}
			wait()
			return nil
		})
	})

	readDirFn(pathJoin(diskPath, minioMetaTmpBucket), func(tmpDir string, typ os.FileMode) error {
		if tmpDir == ".trash/" { // do not remove .trash/ here, it has its own routines
			return nil
		}
		vi, err := disk.StatVol(ctx, pathJoin(minioMetaTmpBucket, tmpDir))
		if err != nil {
			return nil
		}
		wait := er.deletedCleanupSleeper.Timer(ctx)
		if now.Sub(vi.Created) > expiry {
			er.deleteAll(ctx, minioMetaTmpBucket, tmpDir)
		}
		wait()
		return nil
	})
}

// ListMultipartUploads - lists all the pending multipart
// uploads for a particular object in a bucket.
//
// Implements a minimal S3 compatible ListMultipartUploads API. We do
// not support prefix-based listing; this is a deliberate simplification
// of the multipart APIs.
// The resulting ListMultipartsInfo structure is marshaled directly into XML.
func (er erasureObjects) ListMultipartUploads(ctx context.Context, bucket, object, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (result ListMultipartsInfo, err error) {
	result.MaxUploads = maxUploads
	result.KeyMarker = keyMarker
	result.Prefix = object
	result.Delimiter = delimiter

	var uploadIDs []string
	var disk StorageAPI
	for _, disk = range er.getLoadBalancedDisks(true) {
		uploadIDs, err = disk.ListDir(ctx, minioMetaMultipartBucket, er.getMultipartSHADir(bucket, object), -1)
		if err != nil {
			if err == errDiskNotFound {
				continue
			}
			if err == errFileNotFound {
				return result, nil
			}
			logger.LogIf(ctx, err)
			return result, toObjectErr(err, bucket, object)
		}
		break
	}

	for i := range uploadIDs {
		uploadIDs[i] = strings.TrimSuffix(uploadIDs[i], SlashSeparator)
	}

	// The S3 spec says uploadIDs should be sorted by initiation time, so we
	// need to read each metadata entry.
	var uploads []MultipartInfo

	populatedUploadIDs := set.NewStringSet()

	for _, uploadID := range uploadIDs {
		if populatedUploadIDs.Contains(uploadID) {
			continue
		}
		fi, err := disk.ReadVersion(ctx, minioMetaMultipartBucket, er.getUploadIDDir(bucket, object, uploadID), "", false)
		if err != nil {
			return result, toObjectErr(err, bucket, object)
		}
		populatedUploadIDs.Add(uploadID)
		uploads = append(uploads, MultipartInfo{
			Object:    object,
			UploadID:  uploadID,
			Initiated: fi.ModTime,
		})
	}

	sort.Slice(uploads, func(i int, j int) bool {
		return uploads[i].Initiated.Before(uploads[j].Initiated)
	})

	uploadIndex := 0
	if uploadIDMarker != "" {
		for uploadIndex < len(uploads) {
			if uploads[uploadIndex].UploadID == uploadIDMarker {
				uploadIndex++
				break
			}
			uploadIndex++
		}
	}
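
	// For illustration (upload IDs hypothetical): with uploads [A, B, C]
	// sorted by initiation time and uploadIDMarker == B, the scan above
	// leaves uploadIndex pointing at C, so listing resumes just past the
	// marker; a marker that matches nothing yields an empty page.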
	for uploadIndex < len(uploads) {
		result.Uploads = append(result.Uploads, uploads[uploadIndex])
		result.NextUploadIDMarker = uploads[uploadIndex].UploadID
		uploadIndex++
		if len(result.Uploads) == maxUploads {
			break
		}
	}

	result.IsTruncated = uploadIndex < len(uploads)

	if !result.IsTruncated {
		result.NextKeyMarker = ""
		result.NextUploadIDMarker = ""
	}

	return result, nil
}

// newMultipartUpload - wrapper for initializing a new multipart
// request; returns a unique upload id.
//
// Internally this function creates a new upload-ID directory for the
// incoming object under
// '.minio.sys/multipart/<SHA256(bucket/object)>/<uploadID>' and writes
// `xl.meta` there on all the disks; `xl.meta` carries metadata regarding
// the on-going multipart operation on the object.
func (er erasureObjects) newMultipartUpload(ctx context.Context, bucket string, object string, opts ObjectOptions) (string, error) {
	onlineDisks := er.getDisks()
	parityDrives := globalStorageClass.GetParityForSC(opts.UserDefined[xhttp.AmzStorageClass])
	if parityDrives <= 0 {
		parityDrives = er.defaultParityCount
	}

	dataDrives := len(onlineDisks) - parityDrives
	// We now know the number of blocks this object needs for data and parity.
	// Establish the writeQuorum using this data.
	writeQuorum := dataDrives
	if dataDrives == parityDrives {
		writeQuorum++
	}
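
	// A worked example of the quorum arithmetic above (drive counts
	// hypothetical, not part of this file): with 12 drives and parity 4,
	// dataDrives = 8 and writeQuorum = 8; with 4 drives and parity 2,
	// dataDrives == parityDrives == 2, so writeQuorum is bumped to 3 to
	// break the tie between two conflicting halves.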

	// Initialize parts metadata
	partsMetadata := make([]FileInfo, len(onlineDisks))

	fi := newFileInfo(pathJoin(bucket, object), dataDrives, parityDrives)
	if opts.Versioned {
		fi.VersionID = opts.VersionID
		if fi.VersionID == "" {
			fi.VersionID = mustGetUUID()
		}
	}
	fi.DataDir = mustGetUUID()

	// Initialize erasure metadata.
	for index := range partsMetadata {
		partsMetadata[index] = fi
	}

	// Guess content-type from the extension if possible.
	if opts.UserDefined["content-type"] == "" {
		opts.UserDefined["content-type"] = mimedb.TypeByExtension(path.Ext(object))
	}

	modTime := opts.MTime
	if opts.MTime.IsZero() {
		modTime = UTCNow()
	}

	onlineDisks, partsMetadata = shuffleDisksAndPartsMetadata(onlineDisks, partsMetadata, fi)

	// Fill all the necessary metadata.
	// Update `xl.meta` content on each disk.
	for index := range partsMetadata {
		partsMetadata[index].Metadata = opts.UserDefined
		partsMetadata[index].ModTime = modTime
	}

	uploadID := mustGetUUID()
	uploadIDPath := er.getUploadIDDir(bucket, object, uploadID)

	// Write updated `xl.meta` to all disks.
	if _, err := writeUniqueFileInfo(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath, partsMetadata, writeQuorum); err != nil {
		return "", toObjectErr(err, minioMetaMultipartBucket, uploadIDPath)
	}

	// Return success.
	return uploadID, nil
}

// NewMultipartUpload - initialize a new multipart upload, returns a
// unique id. The unique id returned here is of UUID form; each
// request yields a new, unique UUID.
//
// Implements S3 compatible initiate multipart API.
func (er erasureObjects) NewMultipartUpload(ctx context.Context, bucket, object string, opts ObjectOptions) (string, error) {
	// No metadata is set, allocate a new one.
	if opts.UserDefined == nil {
		opts.UserDefined = make(map[string]string)
	}
	return er.newMultipartUpload(ctx, bucket, object, opts)
}
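
// A typical multipart flow against this layer, sketched with hypothetical
// arguments (bucket/object names, reader, and options are placeholders, not
// part of this file):
//
//	uploadID, _ := er.NewMultipartUpload(ctx, "bucket", "object", opts)
//	pi, _ := er.PutObjectPart(ctx, "bucket", "object", uploadID, 1, reader, opts)
//	oi, _ := er.CompleteMultipartUpload(ctx, "bucket", "object", uploadID,
//		[]CompletePart{{PartNumber: 1, ETag: pi.ETag}}, opts)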

// CopyObjectPart - reads the incoming stream and internally erasure codes
// it. This call is similar to the put object part operation but the source
// data is read from an existing object.
//
// Implements S3 compatible Upload Part Copy API.
func (er erasureObjects) CopyObjectPart(ctx context.Context, srcBucket, srcObject, dstBucket, dstObject, uploadID string, partID int, startOffset int64, length int64, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (pi PartInfo, e error) {
	partInfo, err := er.PutObjectPart(ctx, dstBucket, dstObject, uploadID, partID, NewPutObjReader(srcInfo.Reader), dstOpts)
	if err != nil {
		return pi, toObjectErr(err, dstBucket, dstObject)
	}

	// Success.
	return partInfo, nil
}

// PutObjectPart - reads the incoming stream and internally erasure codes
// it. This call is similar to a single put operation but it is a part
// of the multipart transaction.
//
// Implements S3 compatible Upload Part API.
func (er erasureObjects) PutObjectPart(ctx context.Context, bucket, object, uploadID string, partID int, r *PutObjReader, opts ObjectOptions) (pi PartInfo, err error) {
	uploadIDLock := er.NewNSLock(bucket, pathJoin(object, uploadID))
	ctx, err = uploadIDLock.GetRLock(ctx, globalOperationTimeout)
	if err != nil {
		return PartInfo{}, err
	}
	readLocked := true
	defer func() {
		if readLocked {
			uploadIDLock.RUnlock()
		}
	}()

	data := r.Reader
	// Validate the input data size; -1 denotes a stream of unknown size and
	// anything smaller is invalid.
	if data.Size() < -1 {
		logger.LogIf(ctx, errInvalidArgument, logger.Application)
		return pi, toObjectErr(errInvalidArgument)
	}

	var partsMetadata []FileInfo
	var errs []error
	uploadIDPath := er.getUploadIDDir(bucket, object, uploadID)

	// Validates if upload ID exists.
	if err = er.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil {
		return pi, toObjectErr(err, bucket, object, uploadID)
	}

	storageDisks := er.getDisks()

	// Read metadata associated with the object from all disks.
	partsMetadata, errs = readAllFileInfo(ctx, storageDisks, minioMetaMultipartBucket,
		uploadIDPath, "", false)

	// Get quorum for this object.
	_, writeQuorum, err := objectQuorumFromMeta(ctx, partsMetadata, errs, er.defaultParityCount)
	if err != nil {
		return pi, toObjectErr(err, bucket, object)
	}

	reducedErr := reduceWriteQuorumErrs(ctx, errs, objectOpIgnoredErrs, writeQuorum)
	if reducedErr == errErasureWriteQuorum {
		return pi, toObjectErr(reducedErr, bucket, object)
	}

	// List all online disks.
	onlineDisks, modTime, dataDir := listOnlineDisks(storageDisks, partsMetadata, errs)

	// Pick one from the first valid metadata.
	fi, err := pickValidFileInfo(ctx, partsMetadata, modTime, dataDir, writeQuorum)
	if err != nil {
		return pi, err
	}

	onlineDisks = shuffleDisks(onlineDisks, fi.Erasure.Distribution)

	// Need a unique name for the part being written in minioMetaTmpBucket to
	// accommodate concurrent PutObjectPart requests.

	partSuffix := fmt.Sprintf("part.%d", partID)
	tmpPart := mustGetUUID()
	tmpPartPath := pathJoin(tmpPart, partSuffix)
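	// For illustration (UUID hypothetical, and assuming minioMetaTmpBucket
	// resolves to ".minio.sys/tmp"): for partID 7 the part is first staged at
	// .minio.sys/tmp/<tmpPartUUID>/part.7 and only renamed into the upload-ID
	// directory once fully written.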

	// Cleanup the temporary object part. If PutObjectPart succeeds on all
	// online disks there is nothing to delete.
	var online int
	defer func() {
		if online != len(onlineDisks) {
			er.deleteObject(context.Background(), minioMetaTmpBucket, tmpPart, writeQuorum)
		}
	}()

	erasure, err := NewErasure(ctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize)
	if err != nil {
		return pi, toObjectErr(err, bucket, object)
	}

	// Fetch a buffer for I/O: reuse one from the pool where possible,
	// otherwise allocate a new one.
	var buffer []byte
	switch size := data.Size(); {
	case size == 0:
		buffer = make([]byte, 1) // Allocate at least a byte to reach EOF
	case size == -1:
		if size := data.ActualSize(); size > 0 && size < fi.Erasure.BlockSize {
			buffer = make([]byte, data.ActualSize()+256, data.ActualSize()*2+512)
		} else {
			buffer = er.bp.Get()
			defer er.bp.Put(buffer)
		}
	case size >= fi.Erasure.BlockSize:
		buffer = er.bp.Get()
		defer er.bp.Put(buffer)
	case size < fi.Erasure.BlockSize:
		// No need to allocate a full fi.Erasure.BlockSize buffer if the incoming data is smaller.
		buffer = make([]byte, size, 2*size+int64(fi.Erasure.ParityBlocks+fi.Erasure.DataBlocks-1))
	}
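
	// A worked example of the sizing above (numbers hypothetical): with a
	// 10MiB erasure block size, a 1MiB part gets a right-sized 1MiB buffer
	// instead of a pooled 10MiB one, while a part of unknown size (-1) with
	// no usable actual-size hint falls back to the pool.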

	if len(buffer) > int(fi.Erasure.BlockSize) {
		buffer = buffer[:fi.Erasure.BlockSize]
	}
	writers := make([]io.Writer, len(onlineDisks))
	for i, disk := range onlineDisks {
		if disk == nil {
			continue
		}
		writers[i] = newBitrotWriter(disk, minioMetaTmpBucket, tmpPartPath,
			erasure.ShardFileSize(data.Size()), DefaultBitrotAlgorithm, erasure.ShardSize(), false)
	}

	n, err := erasure.Encode(ctx, data, writers, buffer, writeQuorum)
	closeBitrotWriters(writers)
	if err != nil {
		return pi, toObjectErr(err, bucket, object)
	}

	// Should return IncompleteBody{} error when the reader has fewer bytes
	// than specified in the request header.
	if n < data.Size() {
		return pi, IncompleteBody{Bucket: bucket, Object: object}
	}

	for i := range writers {
		if writers[i] == nil {
			onlineDisks[i] = nil
		}
	}

	// Unlock the read lock here before acquiring the write lock; all
	// concurrent PutObjectPart requests serialize here while updating `xl.meta`.
	uploadIDLock.RUnlock()
	readLocked = false
	ctx, err = uploadIDLock.GetLock(ctx, globalOperationTimeout)
	if err != nil {
		return PartInfo{}, err
	}
	defer uploadIDLock.Unlock()

	// Validates if upload ID exists.
	if err = er.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil {
		return pi, toObjectErr(err, bucket, object, uploadID)
	}

	// Rename temporary part file to its final location.
	partPath := pathJoin(uploadIDPath, fi.DataDir, partSuffix)
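	// For illustration (IDs hypothetical): the staged part now moves to
	// .minio.sys/multipart/<shaDir>/<uploadID>/<dataDir>/part.<partID>, the
	// same location removeObjectPart later targets for orphaned parts.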
	onlineDisks, err = rename(ctx, onlineDisks, minioMetaTmpBucket, tmpPartPath, minioMetaMultipartBucket, partPath, false, writeQuorum, nil)
	if err != nil {
		return pi, toObjectErr(err, minioMetaMultipartBucket, partPath)
	}

	// Read metadata again because it might have been updated by a parallel upload of another part.
	partsMetadata, errs = readAllFileInfo(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath, "", false)
	reducedErr = reduceWriteQuorumErrs(ctx, errs, objectOpIgnoredErrs, writeQuorum)
	if reducedErr == errErasureWriteQuorum {
		return pi, toObjectErr(reducedErr, bucket, object)
	}

	// Get the current highest version based on the re-read partsMetadata.
	onlineDisks, modTime, dataDir = listOnlineDisks(onlineDisks, partsMetadata, errs)

	// Pick one from the first valid metadata.
	fi, err = pickValidFileInfo(ctx, partsMetadata, modTime, dataDir, writeQuorum)
	if err != nil {
		return pi, err
	}

	// Once the part is successfully committed, proceed with updating erasure metadata.
	fi.ModTime = UTCNow()

	md5hex := r.MD5CurrentHexString()

	// Add the current part.
	fi.AddObjectPart(partID, md5hex, n, data.ActualSize())

	for i, disk := range onlineDisks {
		if disk == OfflineDisk {
			continue
		}
		partsMetadata[i].Size = fi.Size
		partsMetadata[i].ModTime = fi.ModTime
		partsMetadata[i].Parts = fi.Parts
		partsMetadata[i].Erasure.AddChecksumInfo(ChecksumInfo{
			PartNumber: partID,
			Algorithm:  DefaultBitrotAlgorithm,
			Hash:       bitrotWriterSum(writers[i]),
		})
	}

	// Write the updated `xl.meta` to each disk.
	if _, err = writeUniqueFileInfo(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath, partsMetadata, writeQuorum); err != nil {
		return pi, toObjectErr(err, minioMetaMultipartBucket, uploadIDPath)
	}

	online = countOnlineDisks(onlineDisks)

	// Return success.
	return PartInfo{
		PartNumber:   partID,
		ETag:         md5hex,
		LastModified: fi.ModTime,
		Size:         fi.Size,
		ActualSize:   data.ActualSize(),
	}, nil
}

// GetMultipartInfo returns multipart metadata uploaded during newMultipartUpload, used
// by callers to verify object states
// - encrypted
// - compressed
func (er erasureObjects) GetMultipartInfo(ctx context.Context, bucket, object, uploadID string, opts ObjectOptions) (MultipartInfo, error) {
	result := MultipartInfo{
		Bucket:   bucket,
		Object:   object,
		UploadID: uploadID,
	}

	var err error
	uploadIDLock := er.NewNSLock(bucket, pathJoin(object, uploadID))
	ctx, err = uploadIDLock.GetRLock(ctx, globalOperationTimeout)
	if err != nil {
		return MultipartInfo{}, err
	}
	defer uploadIDLock.RUnlock()

	if err := er.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil {
		return result, toObjectErr(err, bucket, object, uploadID)
	}

	uploadIDPath := er.getUploadIDDir(bucket, object, uploadID)

	storageDisks := er.getDisks()

	// Read metadata associated with the object from all disks.
	partsMetadata, errs := readAllFileInfo(ctx, storageDisks, minioMetaMultipartBucket, uploadIDPath, opts.VersionID, false)

	// Get quorum for this object.
	readQuorum, _, err := objectQuorumFromMeta(ctx, partsMetadata, errs, er.defaultParityCount)
	if err != nil {
		return result, toObjectErr(err, minioMetaMultipartBucket, uploadIDPath)
	}

	// This is a read operation, so reduce the per-disk errors against the read quorum.
	reducedErr := reduceReadQuorumErrs(ctx, errs, objectOpIgnoredErrs, readQuorum)
	if reducedErr == errErasureReadQuorum {
		return result, toObjectErr(reducedErr, minioMetaMultipartBucket, uploadIDPath)
	}

	_, modTime, dataDir := listOnlineDisks(storageDisks, partsMetadata, errs)

	// Pick one from the first valid metadata.
	fi, err := pickValidFileInfo(ctx, partsMetadata, modTime, dataDir, readQuorum)
	if err != nil {
		return result, err
	}

	result.UserDefined = cloneMSS(fi.Metadata)
	return result, nil
}

// ListObjectParts - lists all previously uploaded parts for a given
// object and uploadID.  Takes additional input of part-number-marker
// to indicate where the listing should begin from.
//
// Implements S3 compatible ListObjectParts API. The resulting
// ListPartsInfo structure is marshaled directly into XML and
// replied back to the client.
func (er erasureObjects) ListObjectParts(ctx context.Context, bucket, object, uploadID string, partNumberMarker, maxParts int, opts ObjectOptions) (result ListPartsInfo, err error) {
	uploadIDLock := er.NewNSLock(bucket, pathJoin(object, uploadID))
	ctx, err = uploadIDLock.GetRLock(ctx, globalOperationTimeout)
	if err != nil {
		return ListPartsInfo{}, err
	}
	defer uploadIDLock.RUnlock()

	if err := er.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil {
		return result, toObjectErr(err, bucket, object, uploadID)
	}

	uploadIDPath := er.getUploadIDDir(bucket, object, uploadID)

	storageDisks := er.getDisks()

	// Read metadata associated with the object from all disks.
	partsMetadata, errs := readAllFileInfo(ctx, storageDisks, minioMetaMultipartBucket, uploadIDPath, "", false)

	// Get quorum for this object.
	_, writeQuorum, err := objectQuorumFromMeta(ctx, partsMetadata, errs, er.defaultParityCount)
	if err != nil {
		return result, toObjectErr(err, minioMetaMultipartBucket, uploadIDPath)
	}

	reducedErr := reduceWriteQuorumErrs(ctx, errs, objectOpIgnoredErrs, writeQuorum)
	if reducedErr == errErasureWriteQuorum {
		return result, toObjectErr(reducedErr, minioMetaMultipartBucket, uploadIDPath)
	}

	_, modTime, dataDir := listOnlineDisks(storageDisks, partsMetadata, errs)

	// Pick one from the first valid metadata.
	fi, err := pickValidFileInfo(ctx, partsMetadata, modTime, dataDir, writeQuorum)
	if err != nil {
		return result, err
	}

	// Populate the result stub.
	result.Bucket = bucket
	result.Object = object
	result.UploadID = uploadID
	result.MaxParts = maxParts
	result.PartNumberMarker = partNumberMarker
	result.UserDefined = cloneMSS(fi.Metadata)

	// If there are no parts or maxParts is zero, return right away.
	if len(fi.Parts) == 0 || maxParts == 0 {
		return result, nil
	}

	// Limit output to maxPartsList.
	if maxParts > maxPartsList {
		maxParts = maxPartsList
	}

	// Only parts with higher part numbers will be listed.
	partIdx := objectPartIndex(fi.Parts, partNumberMarker)
	parts := fi.Parts
	if partIdx != -1 {
		parts = fi.Parts[partIdx+1:]
	}
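
	// For illustration (part numbers hypothetical): with uploaded parts
	// [1, 2, 3, 5] and partNumberMarker == 2, the window above becomes
	// [3, 5]; a marker matching no uploaded part leaves the full list.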
	count := maxParts
	for _, part := range parts {
		result.Parts = append(result.Parts, PartInfo{
			PartNumber:   part.Number,
			ETag:         part.ETag,
			LastModified: fi.ModTime,
			Size:         part.Size,
		})
		count--
		if count == 0 {
			break
		}
	}
	// If listed entries are more than maxParts, we set IsTruncated as true.
	if len(parts) > len(result.Parts) {
		result.IsTruncated = true
		// Make sure to fill next part number marker if IsTruncated is
		// true for subsequent listing.
		nextPartNumberMarker := result.Parts[len(result.Parts)-1].PartNumber
		result.NextPartNumberMarker = nextPartNumberMarker
	}
	return result, nil
}

// CompleteMultipartUpload - completes an ongoing multipart
// transaction after receiving all the parts indicated by the client.
// Returns an md5sum calculated by concatenating all the individual
// md5sums of all the parts.
//
// Implements S3 compatible Complete multipart API.
func (er erasureObjects) CompleteMultipartUpload(ctx context.Context, bucket string, object string, uploadID string, parts []CompletePart, opts ObjectOptions) (oi ObjectInfo, err error) {
	// Hold read-locks to verify uploaded parts; this also disallows
	// parallel part uploads.
	uploadIDLock := er.NewNSLock(bucket, pathJoin(object, uploadID))
	ctx, err = uploadIDLock.GetRLock(ctx, globalOperationTimeout)
	if err != nil {
		return oi, err
	}
	defer uploadIDLock.RUnlock()

	if err = er.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil {
		return oi, toObjectErr(err, bucket, object, uploadID)
	}

	// Check if an object is present as one of the parent directories.
	// -- FIXME (needs a new kind of lock).
	if opts.ParentIsObject != nil && opts.ParentIsObject(ctx, bucket, path.Dir(object)) {
		return oi, toObjectErr(errFileParentIsFile, bucket, object)
	}

	defer ObjectPathUpdated(pathJoin(bucket, object))

	// Calculate s3 compatible md5sum for complete multipart.
	s3MD5 := getCompleteMultipartMD5(parts)
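	// A sketch of the S3 multipart ETag convention computed above (example
	// value hypothetical): MD5 the concatenation of every part's binary MD5
	// and append "-<number of parts>", so a two-part upload yields an ETag
	// shaped like "9b2cf535f27731c974343645a3985328-2".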

	uploadIDPath := er.getUploadIDDir(bucket, object, uploadID)

	storageDisks := er.getDisks()

	// Read metadata associated with the object from all disks.
	partsMetadata, errs := readAllFileInfo(ctx, storageDisks, minioMetaMultipartBucket, uploadIDPath, "", false)

	// Get quorum for this object.
	_, writeQuorum, err := objectQuorumFromMeta(ctx, partsMetadata, errs, er.defaultParityCount)
	if err != nil {
		return oi, toObjectErr(err, bucket, object)
	}

	reducedErr := reduceWriteQuorumErrs(ctx, errs, objectOpIgnoredErrs, writeQuorum)
	if reducedErr == errErasureWriteQuorum {
		return oi, toObjectErr(reducedErr, bucket, object)
	}

	onlineDisks, modTime, dataDir := listOnlineDisks(storageDisks, partsMetadata, errs)

	// Pick one from the first valid metadata.
	fi, err := pickValidFileInfo(ctx, partsMetadata, modTime, dataDir, writeQuorum)
	if err != nil {
		return oi, err
	}

	// Calculate full object size.
	var objectSize int64

	// Calculate consolidated actual size.
	var objectActualSize int64

	// Order online disks in accordance with distribution order.
	// Order parts metadata in accordance with distribution order.
	onlineDisks, partsMetadata = shuffleDisksAndPartsMetadataByIndex(onlineDisks, partsMetadata, fi)

	// Save current erasure metadata for validation.
	var currentFI = fi

	// Allocate parts similar to incoming slice.
	fi.Parts = make([]ObjectPartInfo, len(parts))

	// Validate each part and then commit to disk.
	for i, part := range parts {
		partIdx := objectPartIndex(currentFI.Parts, part.PartNumber)
		// Every requested part number must exist among the uploaded parts.
		if partIdx == -1 {
			invp := InvalidPart{
				PartNumber: part.PartNumber,
				GotETag:    part.ETag,
			}
			return oi, invp
		}

		// Ensure that the part ETag is canonicalized to strip off extraneous quotes.
		part.ETag = canonicalizeETag(part.ETag)
		if currentFI.Parts[partIdx].ETag != part.ETag {
			invp := InvalidPart{
				PartNumber: part.PartNumber,
				ExpETag:    currentFI.Parts[partIdx].ETag,
				GotETag:    part.ETag,
			}
			return oi, invp
		}

		// All parts except the last have to be at least 5MiB.
		if (i < len(parts)-1) && !isMinAllowedPartSize(currentFI.Parts[partIdx].ActualSize) {
			return oi, PartTooSmall{
				PartNumber: part.PartNumber,
				PartSize:   currentFI.Parts[partIdx].ActualSize,
				PartETag:   part.ETag,
			}
		}

		// Save for total object size.
		objectSize += currentFI.Parts[partIdx].Size

		// Save the consolidated actual size.
		objectActualSize += currentFI.Parts[partIdx].ActualSize

		// Add incoming parts.
		fi.Parts[i] = ObjectPartInfo{
			Number:     part.PartNumber,
			Size:       currentFI.Parts[partIdx].Size,
			ActualSize: currentFI.Parts[partIdx].ActualSize,
		}
	}

	// Save the final object size and modtime.
	fi.Size = objectSize
	fi.ModTime = opts.MTime
	if opts.MTime.IsZero() {
		fi.ModTime = UTCNow()
	}

	// Save successfully calculated md5sum.
	fi.Metadata["etag"] = s3MD5
	if opts.UserDefined["etag"] != "" { // preserve ETag if set
		fi.Metadata["etag"] = opts.UserDefined["etag"]
	}

	// Save the consolidated actual size.
	fi.Metadata[ReservedMetadataPrefix+"actual-size"] = strconv.FormatInt(objectActualSize, 10)

	// Update all erasure metadata, making sure not to modify fields like
	// checksums which are different on each disk.
	for index := range partsMetadata {
		if partsMetadata[index].IsValid() {
			partsMetadata[index].Size = fi.Size
			partsMetadata[index].ModTime = fi.ModTime
			partsMetadata[index].Metadata = fi.Metadata
			partsMetadata[index].Parts = fi.Parts
		}
	}

	// Write the final `xl.meta` at the uploadID location.
	if onlineDisks, err = writeUniqueFileInfo(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath, partsMetadata, writeQuorum); err != nil {
		return oi, toObjectErr(err, minioMetaMultipartBucket, uploadIDPath)
	}

	// Remove parts that weren't present in the CompleteMultipartUpload request.
	for _, curpart := range currentFI.Parts {
		if objectPartIndex(fi.Parts, curpart.Number) == -1 {
			// Delete the missing part files. e.g.,
			// Request 1: NewMultipart
			// Request 2: PutObjectPart 1
			// Request 3: PutObjectPart 2
			// Request 4: CompleteMultipartUpload --part 2
			// N.B. the 1st part is not requested; it should be removed from storage.
			er.removeObjectPart(bucket, object, uploadID, fi.DataDir, curpart.Number)
		}
	}

	// Hold the namespace lock to complete the transaction.
	lk := er.NewNSLock(bucket, object)
	ctx, err = lk.GetLock(ctx, globalOperationTimeout)
	if err != nil {
		return oi, err
	}
	defer lk.Unlock()

	// Rename the multipart object to its final location.
	if onlineDisks, err = renameData(ctx, onlineDisks, minioMetaMultipartBucket, uploadIDPath,
		partsMetadata, bucket, object, writeQuorum); err != nil {
		return oi, toObjectErr(err, bucket, object)
	}

	// Check if there is any offline disk and add it to the MRF list.
	for _, disk := range onlineDisks {
		if disk != nil && disk.IsOnline() {
			continue
		}
		er.addPartial(bucket, object, fi.VersionID)
		break
	}

	for i := 0; i < len(onlineDisks); i++ {
		if onlineDisks[i] != nil && onlineDisks[i].IsOnline() {
			// Object info is the same on all disks, so we can pick
			// the first meta from an online disk.
			fi = partsMetadata[i]
			break
		}
	}

	// Success, return object info.
	return fi.ToObjectInfo(bucket, object), nil
}

// AbortMultipartUpload - aborts an ongoing multipart operation
// signified by the input uploadID. This is an atomic operation that
// doesn't require clients to initiate multiple such requests.
//
// All parts are purged from all disks and the reference to the uploadID
// is removed from the system; rollback is not possible on this
// operation.
func (er erasureObjects) AbortMultipartUpload(ctx context.Context, bucket, object, uploadID string, opts ObjectOptions) (err error) {
	lk := er.NewNSLock(bucket, pathJoin(object, uploadID))
	ctx, err = lk.GetLock(ctx, globalOperationTimeout)
	if err != nil {
		return err
	}
	defer lk.Unlock()

	// Validates if upload ID exists.
	if err := er.checkUploadIDExists(ctx, bucket, object, uploadID); err != nil {
		return toObjectErr(err, bucket, object, uploadID)
	}

	uploadIDPath := er.getUploadIDDir(bucket, object, uploadID)

	// Read metadata associated with the object from all disks.
	partsMetadata, errs := readAllFileInfo(ctx, er.getDisks(), minioMetaMultipartBucket, uploadIDPath, "", false)

	// Get quorum for this object.
	_, writeQuorum, err := objectQuorumFromMeta(ctx, partsMetadata, errs, er.defaultParityCount)
	if err != nil {
		return toObjectErr(err, bucket, object, uploadID)
	}

	// Cleanup all uploaded parts.
	if err = er.deleteObject(ctx, minioMetaMultipartBucket, uploadIDPath, writeQuorum); err != nil {
		return toObjectErr(err, bucket, object, uploadID)
	}

	// Successfully purged.
	return nil
}