github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/bucket-replication.go (about)

     1  // Copyright (c) 2015-2021 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package cmd
    19  
    20  import (
    21  	"context"
    22  	"encoding/binary"
    23  	"errors"
    24  	"fmt"
    25  	"io"
    26  	"math/rand"
    27  	"net/http"
    28  	"net/url"
    29  	"path"
    30  	"reflect"
    31  	"strings"
    32  	"sync"
    33  	"sync/atomic"
    34  	"time"
    35  
    36  	"github.com/dustin/go-humanize"
    37  	"github.com/minio/madmin-go/v3"
    38  	"github.com/minio/minio-go/v7"
    39  	"github.com/minio/minio-go/v7/pkg/encrypt"
    40  	"github.com/minio/minio-go/v7/pkg/tags"
    41  	"github.com/minio/minio/internal/amztime"
    42  	"github.com/minio/minio/internal/bucket/bandwidth"
    43  	objectlock "github.com/minio/minio/internal/bucket/object/lock"
    44  	"github.com/minio/minio/internal/bucket/replication"
    45  	"github.com/minio/minio/internal/config/storageclass"
    46  	"github.com/minio/minio/internal/crypto"
    47  	"github.com/minio/minio/internal/event"
    48  	"github.com/minio/minio/internal/hash"
    49  	xhttp "github.com/minio/minio/internal/http"
    50  	xioutil "github.com/minio/minio/internal/ioutil"
    51  	"github.com/minio/minio/internal/logger"
    52  	"github.com/tinylib/msgp/msgp"
    53  	"github.com/zeebo/xxh3"
    54  	"golang.org/x/exp/maps"
    55  	"golang.org/x/exp/slices"
    56  )
    57  
const (
	// throttleDeadline bounds how long a throttled replication transfer may
	// wait; used outside this chunk (bandwidth throttling) — confirm at call sites.
	throttleDeadline = 1 * time.Hour
	// ReplicationReset has reset id and timestamp of last reset operation
	ReplicationReset = "replication-reset"
	// ReplicationStatus has internal replication status - stringified representation of target's replication status for all replication
	// activity initiated from this cluster
	ReplicationStatus = "replication-status"
	// ReplicationTimestamp - the last time replication was initiated on this cluster for this object version
	ReplicationTimestamp = "replication-timestamp"
	// ReplicaStatus - this header is present if a replica was received by this cluster for this object version
	ReplicaStatus = "replica-status"
	// ReplicaTimestamp - the last time a replica was received by this cluster for this object version
	ReplicaTimestamp = "replica-timestamp"
	// TaggingTimestamp - the last time a tag metadata modification happened on this cluster for this object version
	TaggingTimestamp = "tagging-timestamp"
	// ObjectLockRetentionTimestamp - the last time a object lock metadata modification happened on this cluster for this object version
	ObjectLockRetentionTimestamp = "objectlock-retention-timestamp"
	// ObjectLockLegalHoldTimestamp - the last time a legal hold metadata modification happened on this cluster for this object version
	ObjectLockLegalHoldTimestamp = "objectlock-legalhold-timestamp"
	// ReplicationWorkerMultiplier is suggested worker multiplier if traffic exceeds replication worker capacity
	ReplicationWorkerMultiplier = 1.5
)
    80  
    81  // gets replication config associated to a given bucket name.
    82  func getReplicationConfig(ctx context.Context, bucketName string) (rc *replication.Config, err error) {
    83  	rCfg, _, err := globalBucketMetadataSys.GetReplicationConfig(ctx, bucketName)
    84  	if err != nil {
    85  		if errors.Is(err, BucketReplicationConfigNotFound{Bucket: bucketName}) || errors.Is(err, errInvalidArgument) {
    86  			return rCfg, err
    87  		}
    88  		logger.CriticalIf(ctx, err)
    89  	}
    90  	return rCfg, err
    91  }
    92  
    93  // validateReplicationDestination returns error if replication destination bucket missing or not configured
    94  // It also returns true if replication destination is same as this server.
    95  func validateReplicationDestination(ctx context.Context, bucket string, rCfg *replication.Config, checkRemote bool) (bool, APIError) {
    96  	var arns []string
    97  	if rCfg.RoleArn != "" {
    98  		arns = append(arns, rCfg.RoleArn)
    99  	} else {
   100  		for _, rule := range rCfg.Rules {
   101  			arns = append(arns, rule.Destination.String())
   102  		}
   103  	}
   104  	var sameTarget bool
   105  	for _, arnStr := range arns {
   106  		arn, err := madmin.ParseARN(arnStr)
   107  		if err != nil {
   108  			return sameTarget, errorCodes.ToAPIErrWithErr(ErrBucketRemoteArnInvalid, err)
   109  		}
   110  		if arn.Type != madmin.ReplicationService {
   111  			return sameTarget, toAPIError(ctx, BucketRemoteArnTypeInvalid{Bucket: bucket})
   112  		}
   113  		clnt := globalBucketTargetSys.GetRemoteTargetClient(bucket, arnStr)
   114  		if clnt == nil {
   115  			return sameTarget, toAPIError(ctx, BucketRemoteTargetNotFound{Bucket: bucket})
   116  		}
   117  		if checkRemote { // validate remote bucket
   118  			found, err := clnt.BucketExists(ctx, arn.Bucket)
   119  			if err != nil {
   120  				return sameTarget, errorCodes.ToAPIErrWithErr(ErrRemoteDestinationNotFoundError, err)
   121  			}
   122  			if !found {
   123  				return sameTarget, errorCodes.ToAPIErrWithErr(ErrRemoteDestinationNotFoundError, BucketRemoteTargetNotFound{Bucket: arn.Bucket})
   124  			}
   125  			if ret, err := globalBucketObjectLockSys.Get(bucket); err == nil {
   126  				if ret.LockEnabled {
   127  					lock, _, _, _, err := clnt.GetObjectLockConfig(ctx, arn.Bucket)
   128  					if err != nil {
   129  						return sameTarget, errorCodes.ToAPIErrWithErr(ErrReplicationDestinationMissingLock, err)
   130  					}
   131  					if lock != objectlock.Enabled {
   132  						return sameTarget, errorCodes.ToAPIErrWithErr(ErrReplicationDestinationMissingLock, nil)
   133  					}
   134  				}
   135  			}
   136  		}
   137  		// validate replication ARN against target endpoint
   138  		c := globalBucketTargetSys.GetRemoteTargetClient(bucket, arnStr)
   139  		if c != nil {
   140  			if err := checkRemoteEndpoint(ctx, c.EndpointURL()); err != nil {
   141  				switch err.(type) {
   142  				case BucketRemoteIdenticalToSource:
   143  					return true, errorCodes.ToAPIErrWithErr(ErrBucketRemoteIdenticalToSource, fmt.Errorf("remote target endpoint %s is self referential", c.EndpointURL().String()))
   144  				default:
   145  				}
   146  			}
   147  			if c.EndpointURL().String() == clnt.EndpointURL().String() {
   148  				selfTarget, _ := isLocalHost(clnt.EndpointURL().Hostname(), clnt.EndpointURL().Port(), globalMinioPort)
   149  				if !sameTarget {
   150  					sameTarget = selfTarget
   151  				}
   152  				continue
   153  			}
   154  		}
   155  	}
   156  
   157  	if len(arns) == 0 {
   158  		return false, toAPIError(ctx, BucketRemoteTargetNotFound{Bucket: bucket})
   159  	}
   160  	return sameTarget, toAPIError(ctx, nil)
   161  }
   162  
   163  // performs a http request to remote endpoint to check if deployment id of remote endpoint is same as
   164  // local cluster deployment id. This is to prevent replication to self, especially in case of a loadbalancer
   165  // in front of MinIO.
   166  func checkRemoteEndpoint(ctx context.Context, epURL *url.URL) error {
   167  	reqURL := &url.URL{
   168  		Scheme: epURL.Scheme,
   169  		Host:   epURL.Host,
   170  		Path:   healthCheckPathPrefix + healthCheckReadinessPath,
   171  	}
   172  
   173  	req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL.String(), nil)
   174  	if err != nil {
   175  		return err
   176  	}
   177  
   178  	client := &http.Client{
   179  		Transport: globalRemoteTargetTransport,
   180  		Timeout:   10 * time.Second,
   181  	}
   182  
   183  	resp, err := client.Do(req)
   184  	if err != nil {
   185  		return err
   186  	}
   187  	if err == nil {
   188  		// Drain the connection.
   189  		xhttp.DrainBody(resp.Body)
   190  	}
   191  	if resp != nil {
   192  		amzid := resp.Header.Get(xhttp.AmzRequestHostID)
   193  		if _, ok := globalNodeNamesHex[amzid]; ok {
   194  			return BucketRemoteIdenticalToSource{
   195  				Endpoint: epURL.String(),
   196  			}
   197  		}
   198  	}
   199  	return nil
   200  }
   201  
// mustReplicateOptions carries the inputs mustReplicate needs to decide
// whether an object operation qualifies for replication.
type mustReplicateOptions struct {
	meta               map[string]string       // object user-defined metadata (may include x-amz-object-tagging)
	status             replication.StatusType  // replication status of this object version
	opType             replication.Type        // kind of replication activity (metadata, existing-object, ...)
	replicationRequest bool // incoming request is a replication request
}
   208  
   209  func (o mustReplicateOptions) ReplicationStatus() (s replication.StatusType) {
   210  	if rs, ok := o.meta[xhttp.AmzBucketReplicationStatus]; ok {
   211  		return replication.StatusType(rs)
   212  	}
   213  	return s
   214  }
   215  
   216  func (o mustReplicateOptions) isExistingObjectReplication() bool {
   217  	return o.opType == replication.ExistingObjectReplicationType
   218  }
   219  
   220  func (o mustReplicateOptions) isMetadataReplication() bool {
   221  	return o.opType == replication.MetadataReplicationType
   222  }
   223  
// getMustReplicateOptions builds mustReplicateOptions from this object's
// user metadata, tags and current replication status.
func (o ObjectInfo) getMustReplicateOptions(op replication.Type, opts ObjectOptions) mustReplicateOptions {
	return getMustReplicateOptions(o.UserDefined, o.UserTags, o.ReplicationStatus, op, opts)
}
   227  
   228  func getMustReplicateOptions(userDefined map[string]string, userTags string, status replication.StatusType, op replication.Type, opts ObjectOptions) mustReplicateOptions {
   229  	meta := cloneMSS(userDefined)
   230  	if userTags != "" {
   231  		meta[xhttp.AmzObjectTagging] = userTags
   232  	}
   233  
   234  	return mustReplicateOptions{
   235  		meta:               meta,
   236  		status:             status,
   237  		opType:             op,
   238  		replicationRequest: opts.ReplicationRequest,
   239  	}
   240  }
   241  
// mustReplicate returns 2 booleans - true if object meets replication criteria and true if replication is to be done in
// a synchronous manner.
// The result is a per-target ReplicateDecision; an empty decision means the
// operation does not replicate at all.
func mustReplicate(ctx context.Context, bucket, object string, mopts mustReplicateOptions) (dsc ReplicateDecision) {
	// object layer not initialized we return with no decision.
	if newObjectLayerFn() == nil {
		return
	}

	// Disable server-side replication on object prefixes which are excluded
	// from versioning via the MinIO bucket versioning extension.
	if !globalBucketVersioningSys.PrefixEnabled(bucket, object) {
		return
	}

	replStatus := mopts.ReplicationStatus()
	// A replica is never re-replicated, except when only its metadata is
	// being updated.
	if replStatus == replication.Replica && !mopts.isMetadataReplication() {
		return
	}

	if mopts.replicationRequest { // incoming replication request on target cluster
		return
	}
	cfg, err := getReplicationConfig(ctx, bucket)
	if err != nil {
		return
	}
	opts := replication.ObjectOpts{
		Name:           object,
		SSEC:           crypto.SSEC.IsEncrypted(mopts.meta),
		Replica:        replStatus == replication.Replica,
		ExistingObject: mopts.isExistingObjectReplication(),
	}
	tagStr, ok := mopts.meta[xhttp.AmzObjectTagging]
	if ok {
		opts.UserTags = tagStr
	}
	// Record one decision per target ARN whose rules match this object.
	tgtArns := cfg.FilterTargetArns(opts)
	for _, tgtArn := range tgtArns {
		tgt := globalBucketTargetSys.GetRemoteTargetClient(bucket, tgtArn)
		// the target online status should not be used here while deciding
		// whether to replicate as the target could be temporarily down
		opts.TargetArn = tgtArn
		replicate := cfg.Replicate(opts)
		var synchronous bool
		if tgt != nil {
			synchronous = tgt.replicateSync
		}
		dsc.Set(newReplicateTargetDecision(tgtArn, replicate, synchronous))
	}
	return dsc
}
   293  
// Standard headers that needs to be extracted from User metadata.
// Membership in this list is what isStandardHeader checks; custom user
// metadata keys are everything not listed here.
var standardHeaders = []string{
	xhttp.ContentType,
	xhttp.CacheControl,
	xhttp.ContentEncoding,
	xhttp.ContentLanguage,
	xhttp.ContentDisposition,
	xhttp.AmzStorageClass,
	xhttp.AmzObjectTagging,
	xhttp.AmzBucketReplicationStatus,
	xhttp.AmzObjectLockMode,
	xhttp.AmzObjectLockRetainUntilDate,
	xhttp.AmzObjectLockLegalHold,
	xhttp.AmzTagCount,
	xhttp.AmzServerSideEncryption,
}
   310  
   311  // returns true if any of the objects being deleted qualifies for replication.
   312  func hasReplicationRules(ctx context.Context, bucket string, objects []ObjectToDelete) bool {
   313  	c, err := getReplicationConfig(ctx, bucket)
   314  	if err != nil || c == nil {
   315  		return false
   316  	}
   317  	for _, obj := range objects {
   318  		if c.HasActiveRules(obj.ObjectName, true) {
   319  			return true
   320  		}
   321  	}
   322  	return false
   323  }
   324  
// isStandardHeader returns true if header is a supported header and not a custom header
// (tested against standardHeaders via equals; presumably a case-insensitive
// comparison — confirm equals' semantics at its definition).
func isStandardHeader(matchHeaderKey string) bool {
	return equals(matchHeaderKey, standardHeaders...)
}
   329  
   330  // returns whether object version is a deletemarker and if object qualifies for replication
   331  func checkReplicateDelete(ctx context.Context, bucket string, dobj ObjectToDelete, oi ObjectInfo, delOpts ObjectOptions, gerr error) (dsc ReplicateDecision) {
   332  	rcfg, err := getReplicationConfig(ctx, bucket)
   333  	if err != nil || rcfg == nil {
   334  		return
   335  	}
   336  	// If incoming request is a replication request, it does not need to be re-replicated.
   337  	if delOpts.ReplicationRequest {
   338  		return
   339  	}
   340  	// Skip replication if this object's prefix is excluded from being
   341  	// versioned.
   342  	if !delOpts.Versioned {
   343  		return
   344  	}
   345  	opts := replication.ObjectOpts{
   346  		Name:         dobj.ObjectName,
   347  		SSEC:         crypto.SSEC.IsEncrypted(oi.UserDefined),
   348  		UserTags:     oi.UserTags,
   349  		DeleteMarker: oi.DeleteMarker,
   350  		VersionID:    dobj.VersionID,
   351  		OpType:       replication.DeleteReplicationType,
   352  	}
   353  	tgtArns := rcfg.FilterTargetArns(opts)
   354  	dsc.targetsMap = make(map[string]replicateTargetDecision, len(tgtArns))
   355  	if len(tgtArns) == 0 {
   356  		return dsc
   357  	}
   358  	var sync, replicate bool
   359  	for _, tgtArn := range tgtArns {
   360  		opts.TargetArn = tgtArn
   361  		replicate = rcfg.Replicate(opts)
   362  		// when incoming delete is removal of a delete marker(a.k.a versioned delete),
   363  		// GetObjectInfo returns extra information even though it returns errFileNotFound
   364  		if gerr != nil {
   365  			validReplStatus := false
   366  			switch oi.TargetReplicationStatus(tgtArn) {
   367  			case replication.Pending, replication.Completed, replication.Failed:
   368  				validReplStatus = true
   369  			}
   370  			if oi.DeleteMarker && (validReplStatus || replicate) {
   371  				dsc.Set(newReplicateTargetDecision(tgtArn, replicate, sync))
   372  				continue
   373  			}
   374  			// can be the case that other cluster is down and duplicate `mc rm --vid`
   375  			// is issued - this still needs to be replicated back to the other target
   376  			if !oi.VersionPurgeStatus.Empty() {
   377  				replicate = oi.VersionPurgeStatus == Pending || oi.VersionPurgeStatus == Failed
   378  				dsc.Set(newReplicateTargetDecision(tgtArn, replicate, sync))
   379  			}
   380  			continue
   381  		}
   382  		tgt := globalBucketTargetSys.GetRemoteTargetClient(bucket, tgtArn)
   383  		// the target online status should not be used here while deciding
   384  		// whether to replicate deletes as the target could be temporarily down
   385  		tgtDsc := newReplicateTargetDecision(tgtArn, false, false)
   386  		if tgt != nil {
   387  			tgtDsc = newReplicateTargetDecision(tgtArn, replicate, tgt.replicateSync)
   388  		}
   389  		dsc.Set(tgtDsc)
   390  	}
   391  	return dsc
   392  }
   393  
// replicate deletes to the designated replication target if replication configuration
// has delete marker replication or delete replication (MinIO extension to allow deletes where version id
// is specified) enabled.
// Similar to bucket replication for PUT operation, soft delete (a.k.a setting delete marker) and
// permanent deletes (by specifying a version ID in the delete operation) have three states "Pending", "Complete"
// and "Failed" to mark the status of the replication of "DELETE" operation. All failed operations can
// then be retried by healing. In the case of permanent deletes, until the replication is completed on the
// target cluster, the object version is marked deleted on the source and hidden from listing. It is permanently
// deleted from the source when the VersionPurgeStatus changes to "Complete", i.e after replication succeeds
// on target.
func replicateDelete(ctx context.Context, dobj DeletedObjectReplicationInfo, objectAPI ObjectLayer) {
	var replicationStatus replication.StatusType
	bucket := dobj.Bucket
	// Delete-marker version ID takes precedence over the plain version ID.
	versionID := dobj.DeleteMarkerVersionID
	if versionID == "" {
		versionID = dobj.VersionID
	}

	// Always audit the outcome, whatever path we exit through.
	defer func() {
		replStatus := string(replicationStatus)
		auditLogInternal(context.Background(), AuditLogOptions{
			Event:     dobj.EventType,
			APIName:   ReplicateDeleteAPI,
			Bucket:    bucket,
			Object:    dobj.ObjectName,
			VersionID: versionID,
			Status:    replStatus,
		})
	}()

	rcfg, err := getReplicationConfig(ctx, bucket)
	if err != nil || rcfg == nil {
		// Without a config we cannot track this delete; notify and bail out.
		logger.LogOnceIf(ctx, fmt.Errorf("unable to obtain replication config for bucket: %s: err: %s", bucket, err), bucket)
		sendEvent(eventArgs{
			BucketName: bucket,
			Object: ObjectInfo{
				Bucket:       bucket,
				Name:         dobj.ObjectName,
				VersionID:    versionID,
				DeleteMarker: dobj.DeleteMarker,
			},
			UserAgent: "Internal: [Replication]",
			Host:      globalLocalNodeName,
			EventName: event.ObjectReplicationNotTracked,
		})
		return
	}
	// Recover the per-target replicate decisions recorded at delete time.
	dsc, err := parseReplicateDecision(ctx, bucket, dobj.ReplicationState.ReplicateDecisionStr)
	if err != nil {
		logger.LogOnceIf(ctx, fmt.Errorf("unable to parse replication decision parameters for bucket: %s, err: %s, decision: %s",
			bucket, err, dobj.ReplicationState.ReplicateDecisionStr), dobj.ReplicationState.ReplicateDecisionStr)
		sendEvent(eventArgs{
			BucketName: bucket,
			Object: ObjectInfo{
				Bucket:       bucket,
				Name:         dobj.ObjectName,
				VersionID:    versionID,
				DeleteMarker: dobj.DeleteMarker,
			},
			UserAgent: "Internal: [Replication]",
			Host:      globalLocalNodeName,
			EventName: event.ObjectReplicationNotTracked,
		})
		return
	}

	// Lock the object name before starting replication operation.
	// Use separate lock that doesn't collide with regular objects.
	lk := objectAPI.NewNSLock(bucket, "/[replicate]/"+dobj.ObjectName)
	lkctx, err := lk.GetLock(ctx, globalOperationTimeout)
	if err != nil {
		// Could not lock: park the entry in the MRF queue for retry.
		globalReplicationPool.queueMRFSave(dobj.ToMRFEntry())
		sendEvent(eventArgs{
			BucketName: bucket,
			Object: ObjectInfo{
				Bucket:       bucket,
				Name:         dobj.ObjectName,
				VersionID:    versionID,
				DeleteMarker: dobj.DeleteMarker,
			},
			UserAgent: "Internal: [Replication]",
			Host:      globalLocalNodeName,
			EventName: event.ObjectReplicationNotTracked,
		})
		return
	}
	ctx = lkctx.Context()
	defer lk.Unlock(lkctx)

	// Fan out the delete to every target that decided to replicate,
	// collecting per-target results under a mutex.
	rinfos := replicatedInfos{Targets: make([]replicatedTargetInfo, 0, len(dsc.targetsMap))}
	var wg sync.WaitGroup
	var mu sync.Mutex
	for _, tgtEntry := range dsc.targetsMap {
		if !tgtEntry.Replicate {
			continue
		}
		// if dobj.TargetArn is not empty string, this is a case of specific target being re-synced.
		if dobj.TargetArn != "" && dobj.TargetArn != tgtEntry.Arn {
			continue
		}
		tgtClnt := globalBucketTargetSys.GetRemoteTargetClient(bucket, tgtEntry.Arn)
		if tgtClnt == nil {
			// Skip stale targets if any and log them to be missing at least once.
			logger.LogOnceIf(ctx, fmt.Errorf("failed to get target for bucket:%s arn:%s", bucket, tgtEntry.Arn), tgtEntry.Arn)
			sendEvent(eventArgs{
				EventName:  event.ObjectReplicationNotTracked,
				BucketName: bucket,
				Object: ObjectInfo{
					Bucket:       bucket,
					Name:         dobj.ObjectName,
					VersionID:    versionID,
					DeleteMarker: dobj.DeleteMarker,
				},
				UserAgent: "Internal: [Replication]",
				Host:      globalLocalNodeName,
			})
			continue
		}
		wg.Add(1)
		go func(tgt *TargetClient) {
			defer wg.Done()
			tgtInfo := replicateDeleteToTarget(ctx, dobj, tgt)

			mu.Lock()
			rinfos.Targets = append(rinfos.Targets, tgtInfo)
			mu.Unlock()
		}(tgtClnt)
	}
	wg.Wait()

	replicationStatus = rinfos.ReplicationStatus()
	prevStatus := dobj.DeleteMarkerReplicationStatus()

	// For permanent (version-id) deletes, status is tracked via the
	// version-purge status instead of the delete-marker status.
	if dobj.VersionID != "" {
		prevStatus = replication.StatusType(dobj.VersionPurgeStatus())
		replicationStatus = replication.StatusType(rinfos.VersionPurgeStatus())
	}

	// to decrement pending count later.
	for _, rinfo := range rinfos.Targets {
		if rinfo.ReplicationStatus != rinfo.PrevReplicationStatus {
			globalReplicationStats.Update(dobj.Bucket, rinfo, replicationStatus,
				prevStatus)
		}
	}

	eventName := event.ObjectReplicationComplete
	if replicationStatus == replication.Failed {
		// Failed deletes go to the MRF queue so healing can retry them.
		eventName = event.ObjectReplicationFailed
		globalReplicationPool.queueMRFSave(dobj.ToMRFEntry())
	}
	drs := getReplicationState(rinfos, dobj.ReplicationState, dobj.VersionID)
	if replicationStatus != prevStatus {
		drs.ReplicationTimeStamp = UTCNow()
	}

	// Persist the updated replication state on the source's delete
	// marker/version via a metadata-only DeleteObject call.
	dobjInfo, err := objectAPI.DeleteObject(ctx, bucket, dobj.ObjectName, ObjectOptions{
		VersionID:         versionID,
		MTime:             dobj.DeleteMarkerMTime.Time,
		DeleteReplication: drs,
		Versioned:         globalBucketVersioningSys.PrefixEnabled(bucket, dobj.ObjectName),
		// Objects matching prefixes should not leave delete markers,
		// dramatically reduces namespace pollution while keeping the
		// benefits of replication, make sure to apply version suspension
		// only at bucket level instead.
		VersionSuspended: globalBucketVersioningSys.Suspended(bucket),
	})
	if err != nil && !isErrVersionNotFound(err) { // VersionNotFound would be reported by pool that object version is missing on.
		sendEvent(eventArgs{
			BucketName: bucket,
			Object: ObjectInfo{
				Bucket:       bucket,
				Name:         dobj.ObjectName,
				VersionID:    versionID,
				DeleteMarker: dobj.DeleteMarker,
			},
			UserAgent: "Internal: [Replication]",
			Host:      globalLocalNodeName,
			EventName: eventName,
		})
	} else {
		sendEvent(eventArgs{
			BucketName: bucket,
			Object:     dobjInfo,
			UserAgent:  "Internal: [Replication]",
			Host:       globalLocalNodeName,
			EventName:  eventName,
		})
	}
}
   584  
// replicateDeleteToTarget performs the delete (delete-marker set or permanent
// version purge) on a single remote target and returns the per-target result.
// On failure the returned info carries Failed status so the caller can queue
// the entry for retry.
func replicateDeleteToTarget(ctx context.Context, dobj DeletedObjectReplicationInfo, tgt *TargetClient) (rinfo replicatedTargetInfo) {
	versionID := dobj.DeleteMarkerVersionID
	if versionID == "" {
		versionID = dobj.VersionID
	}

	rinfo = dobj.ReplicationState.targetState(tgt.ARN)
	rinfo.OpType = dobj.OpType
	rinfo.endpoint = tgt.EndpointURL().Host
	rinfo.secure = tgt.EndpointURL().Scheme == "https"
	// Stamp the resync timestamp on successful existing-object replication.
	defer func() {
		if rinfo.ReplicationStatus == replication.Completed && tgt.ResetID != "" && dobj.OpType == replication.ExistingObjectReplicationType {
			rinfo.ResyncTimestamp = fmt.Sprintf("%s;%s", UTCNow().Format(http.TimeFormat), tgt.ResetID)
		}
	}()

	// Already-completed delete-marker replication needs no further work
	// (unless this is an existing-object resync pass).
	if dobj.VersionID == "" && rinfo.PrevReplicationStatus == replication.Completed && dobj.OpType != replication.ExistingObjectReplicationType {
		rinfo.ReplicationStatus = rinfo.PrevReplicationStatus
		return
	}
	// Version purge already completed on this target — nothing to do.
	if dobj.VersionID != "" && rinfo.VersionPurgeStatus == Complete {
		return
	}
	if globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
		// Target known-offline: mark failed immediately instead of attempting.
		logger.LogOnceIf(ctx, fmt.Errorf("remote target is offline for bucket:%s arn:%s", dobj.Bucket, tgt.ARN), "replication-target-offline-delete-"+tgt.ARN)
		sendEvent(eventArgs{
			BucketName: dobj.Bucket,
			Object: ObjectInfo{
				Bucket:       dobj.Bucket,
				Name:         dobj.ObjectName,
				VersionID:    dobj.VersionID,
				DeleteMarker: dobj.DeleteMarker,
			},
			UserAgent: "Internal: [Replication]",
			Host:      globalLocalNodeName,
			EventName: event.ObjectReplicationNotTracked,
		})
		if dobj.VersionID == "" {
			rinfo.ReplicationStatus = replication.Failed
		} else {
			rinfo.VersionPurgeStatus = Failed
		}
		return
	}
	// early return if already replicated delete marker for existing object replication/ healing delete markers
	if dobj.DeleteMarkerVersionID != "" {
		toi, err := tgt.StatObject(ctx, tgt.Bucket, dobj.ObjectName, minio.StatObjectOptions{
			VersionID: versionID,
			Internal: minio.AdvancedGetOptions{
				ReplicationProxyRequest:           "false",
				IsReplicationReadyForDeleteMarker: true,
			},
		})
		serr := ErrorRespToObjectError(err, dobj.Bucket, dobj.ObjectName, dobj.VersionID)
		switch {
		case isErrMethodNotAllowed(serr):
			// delete marker already replicated
			if dobj.VersionID == "" && rinfo.VersionPurgeStatus.Empty() {
				rinfo.ReplicationStatus = replication.Completed
				return
			}
		case isErrObjectNotFound(serr), isErrVersionNotFound(serr):
			// version being purged is already not found on target.
			if !rinfo.VersionPurgeStatus.Empty() {
				rinfo.VersionPurgeStatus = Complete
				return
			}
		case isErrReadQuorum(serr), isErrWriteQuorum(serr):
			// destination has some quorum issues, perform removeObject() anyways
			// to complete the operation.
		default:
			if err != nil && minio.IsNetworkOrHostDown(err, true) && !globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
				globalBucketTargetSys.markOffline(tgt.EndpointURL())
			}
			// mark delete marker replication as failed if target cluster not ready to receive
			// this request yet (object version not replicated yet)
			if err != nil && !toi.ReplicationReady {
				rinfo.ReplicationStatus = replication.Failed
				rinfo.Err = err
				return
			}
		}
	}
	// Issue the remote delete, flagged as a server-side replication request.
	rmErr := tgt.RemoveObject(ctx, tgt.Bucket, dobj.ObjectName, minio.RemoveObjectOptions{
		VersionID: versionID,
		Internal: minio.AdvancedRemoveOptions{
			ReplicationDeleteMarker: dobj.DeleteMarkerVersionID != "",
			ReplicationMTime:        dobj.DeleteMarkerMTime.Time,
			ReplicationStatus:       minio.ReplicationStatusReplica,
			ReplicationRequest:      true, // always set this to distinguish between `mc mirror` replication and serverside
		},
	})
	if rmErr != nil {
		rinfo.Err = rmErr
		if dobj.VersionID == "" {
			rinfo.ReplicationStatus = replication.Failed
		} else {
			rinfo.VersionPurgeStatus = Failed
		}
		logger.LogIf(ctx, fmt.Errorf("unable to replicate delete marker to %s: %s/%s(%s): %w", tgt.EndpointURL(), tgt.Bucket, dobj.ObjectName, versionID, rmErr))
		if rmErr != nil && minio.IsNetworkOrHostDown(rmErr, true) && !globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
			globalBucketTargetSys.markOffline(tgt.EndpointURL())
		}
	} else {
		if dobj.VersionID == "" {
			rinfo.ReplicationStatus = replication.Completed
		} else {
			rinfo.VersionPurgeStatus = Complete
		}
	}
	return
}
   697  
   698  func getCopyObjMetadata(oi ObjectInfo, sc string) map[string]string {
   699  	meta := make(map[string]string, len(oi.UserDefined))
   700  	for k, v := range oi.UserDefined {
   701  		if stringsHasPrefixFold(k, ReservedMetadataPrefixLower) {
   702  			continue
   703  		}
   704  
   705  		if equals(k, xhttp.AmzBucketReplicationStatus) {
   706  			continue
   707  		}
   708  
   709  		// https://github.com/google/security-research/security/advisories/GHSA-76wf-9vgp-pj7w
   710  		if equals(k, xhttp.AmzMetaUnencryptedContentLength, xhttp.AmzMetaUnencryptedContentMD5) {
   711  			continue
   712  		}
   713  		meta[k] = v
   714  	}
   715  
   716  	if oi.ContentEncoding != "" {
   717  		meta[xhttp.ContentEncoding] = oi.ContentEncoding
   718  	}
   719  
   720  	if oi.ContentType != "" {
   721  		meta[xhttp.ContentType] = oi.ContentType
   722  	}
   723  
   724  	meta[xhttp.AmzObjectTagging] = oi.UserTags
   725  	meta[xhttp.AmzTagDirective] = "REPLACE"
   726  
   727  	if sc == "" {
   728  		sc = oi.StorageClass
   729  	}
   730  	// drop non standard storage classes for tiering from replication
   731  	if sc != "" && (sc == storageclass.RRS || sc == storageclass.STANDARD) {
   732  		meta[xhttp.AmzStorageClass] = sc
   733  	}
   734  
   735  	meta[xhttp.MinIOSourceETag] = oi.ETag
   736  	meta[xhttp.MinIOSourceMTime] = oi.ModTime.UTC().Format(time.RFC3339Nano)
   737  	meta[xhttp.AmzBucketReplicationStatus] = replication.Replica.String()
   738  	return meta
   739  }
   740  
   741  type caseInsensitiveMap map[string]string
   742  
   743  // Lookup map entry case insensitively.
   744  func (m caseInsensitiveMap) Lookup(key string) (string, bool) {
   745  	if len(m) == 0 {
   746  		return "", false
   747  	}
   748  	for _, k := range []string{
   749  		key,
   750  		strings.ToLower(key),
   751  		http.CanonicalHeaderKey(key),
   752  	} {
   753  		v, ok := m[k]
   754  		if ok {
   755  			return v, ok
   756  		}
   757  	}
   758  	return "", false
   759  }
   760  
   761  func putReplicationOpts(ctx context.Context, sc string, objInfo ObjectInfo) (putOpts minio.PutObjectOptions, err error) {
   762  	meta := make(map[string]string)
   763  	for k, v := range objInfo.UserDefined {
   764  		// In case of SSE-C objects copy the allowed internal headers as well
   765  		if !crypto.SSEC.IsEncrypted(objInfo.UserDefined) || !slices.Contains(maps.Keys(validSSEReplicationHeaders), k) {
   766  			if stringsHasPrefixFold(k, ReservedMetadataPrefixLower) {
   767  				continue
   768  			}
   769  			if isStandardHeader(k) {
   770  				continue
   771  			}
   772  		}
   773  		if slices.Contains(maps.Keys(validSSEReplicationHeaders), k) {
   774  			meta[validSSEReplicationHeaders[k]] = v
   775  		} else {
   776  			meta[k] = v
   777  		}
   778  	}
   779  
   780  	if sc == "" && (objInfo.StorageClass == storageclass.STANDARD || objInfo.StorageClass == storageclass.RRS) {
   781  		sc = objInfo.StorageClass
   782  	}
   783  	putOpts = minio.PutObjectOptions{
   784  		UserMetadata:    meta,
   785  		ContentType:     objInfo.ContentType,
   786  		ContentEncoding: objInfo.ContentEncoding,
   787  		Expires:         objInfo.Expires,
   788  		StorageClass:    sc,
   789  		Internal: minio.AdvancedPutOptions{
   790  			SourceVersionID:    objInfo.VersionID,
   791  			ReplicationStatus:  minio.ReplicationStatusReplica,
   792  			SourceMTime:        objInfo.ModTime,
   793  			SourceETag:         objInfo.ETag,
   794  			ReplicationRequest: true, // always set this to distinguish between `mc mirror` replication and serverside
   795  		},
   796  	}
   797  	if objInfo.UserTags != "" {
   798  		tag, _ := tags.ParseObjectTags(objInfo.UserTags)
   799  		if tag != nil {
   800  			putOpts.UserTags = tag.ToMap()
   801  			// set tag timestamp in opts
   802  			tagTimestamp := objInfo.ModTime
   803  			if tagTmstampStr, ok := objInfo.UserDefined[ReservedMetadataPrefixLower+TaggingTimestamp]; ok {
   804  				tagTimestamp, err = time.Parse(time.RFC3339Nano, tagTmstampStr)
   805  				if err != nil {
   806  					return putOpts, err
   807  				}
   808  			}
   809  			putOpts.Internal.TaggingTimestamp = tagTimestamp
   810  		}
   811  	}
   812  
   813  	lkMap := caseInsensitiveMap(objInfo.UserDefined)
   814  	if lang, ok := lkMap.Lookup(xhttp.ContentLanguage); ok {
   815  		putOpts.ContentLanguage = lang
   816  	}
   817  	if disp, ok := lkMap.Lookup(xhttp.ContentDisposition); ok {
   818  		putOpts.ContentDisposition = disp
   819  	}
   820  	if cc, ok := lkMap.Lookup(xhttp.CacheControl); ok {
   821  		putOpts.CacheControl = cc
   822  	}
   823  	if mode, ok := lkMap.Lookup(xhttp.AmzObjectLockMode); ok {
   824  		rmode := minio.RetentionMode(mode)
   825  		putOpts.Mode = rmode
   826  	}
   827  	if retainDateStr, ok := lkMap.Lookup(xhttp.AmzObjectLockRetainUntilDate); ok {
   828  		rdate, err := amztime.ISO8601Parse(retainDateStr)
   829  		if err != nil {
   830  			return putOpts, err
   831  		}
   832  		putOpts.RetainUntilDate = rdate
   833  		// set retention timestamp in opts
   834  		retTimestamp := objInfo.ModTime
   835  		if retainTmstampStr, ok := objInfo.UserDefined[ReservedMetadataPrefixLower+ObjectLockRetentionTimestamp]; ok {
   836  			retTimestamp, err = time.Parse(time.RFC3339Nano, retainTmstampStr)
   837  			if err != nil {
   838  				return putOpts, err
   839  			}
   840  		}
   841  		putOpts.Internal.RetentionTimestamp = retTimestamp
   842  	}
   843  	if lhold, ok := lkMap.Lookup(xhttp.AmzObjectLockLegalHold); ok {
   844  		putOpts.LegalHold = minio.LegalHoldStatus(lhold)
   845  		// set legalhold timestamp in opts
   846  		lholdTimestamp := objInfo.ModTime
   847  		if lholdTmstampStr, ok := objInfo.UserDefined[ReservedMetadataPrefixLower+ObjectLockLegalHoldTimestamp]; ok {
   848  			lholdTimestamp, err = time.Parse(time.RFC3339Nano, lholdTmstampStr)
   849  			if err != nil {
   850  				return putOpts, err
   851  			}
   852  		}
   853  		putOpts.Internal.LegalholdTimestamp = lholdTimestamp
   854  	}
   855  	if crypto.S3.IsEncrypted(objInfo.UserDefined) {
   856  		putOpts.ServerSideEncryption = encrypt.NewSSE()
   857  	}
   858  	return
   859  }
   860  
// replicationAction indicates what, if anything, must be copied to the
// target after comparing source and target object state.
type replicationAction string

const (
	// replicateMetadata - only metadata/tags differ; copy metadata in place.
	replicateMetadata replicationAction = "metadata"
	// replicateNone - source and target are already in sync; nothing to do.
	replicateNone     replicationAction = "none"
	// replicateAll - object content must be replicated in full.
	replicateAll      replicationAction = "all"
)
   868  
   869  // matches k1 with all keys, returns 'true' if one of them matches
   870  func equals(k1 string, keys ...string) bool {
   871  	for _, k2 := range keys {
   872  		if strings.EqualFold(k1, k2) {
   873  			return true
   874  		}
   875  	}
   876  	return false
   877  }
   878  
   879  // returns replicationAction by comparing metadata between source and target
   880  func getReplicationAction(oi1 ObjectInfo, oi2 minio.ObjectInfo, opType replication.Type) replicationAction {
   881  	// Avoid resyncing null versions created prior to enabling replication if target has a newer copy
   882  	if opType == replication.ExistingObjectReplicationType &&
   883  		oi1.ModTime.Unix() > oi2.LastModified.Unix() && oi1.VersionID == nullVersionID {
   884  		return replicateNone
   885  	}
   886  	sz, _ := oi1.GetActualSize()
   887  
   888  	// needs full replication
   889  	if oi1.ETag != oi2.ETag ||
   890  		oi1.VersionID != oi2.VersionID ||
   891  		sz != oi2.Size ||
   892  		oi1.DeleteMarker != oi2.IsDeleteMarker ||
   893  		oi1.ModTime.Unix() != oi2.LastModified.Unix() {
   894  		return replicateAll
   895  	}
   896  
   897  	if oi1.ContentType != oi2.ContentType {
   898  		return replicateMetadata
   899  	}
   900  
   901  	if oi1.ContentEncoding != "" {
   902  		enc, ok := oi2.Metadata[xhttp.ContentEncoding]
   903  		if !ok {
   904  			enc, ok = oi2.Metadata[strings.ToLower(xhttp.ContentEncoding)]
   905  			if !ok {
   906  				return replicateMetadata
   907  			}
   908  		}
   909  		if strings.Join(enc, ",") != oi1.ContentEncoding {
   910  			return replicateMetadata
   911  		}
   912  	}
   913  
   914  	t, _ := tags.ParseObjectTags(oi1.UserTags)
   915  	if (oi2.UserTagCount > 0 && !reflect.DeepEqual(oi2.UserTags, t.ToMap())) || (oi2.UserTagCount != len(t.ToMap())) {
   916  		return replicateMetadata
   917  	}
   918  
   919  	// Compare only necessary headers
   920  	compareKeys := []string{
   921  		"Expires",
   922  		"Cache-Control",
   923  		"Content-Language",
   924  		"Content-Disposition",
   925  		"X-Amz-Object-Lock-Mode",
   926  		"X-Amz-Object-Lock-Retain-Until-Date",
   927  		"X-Amz-Object-Lock-Legal-Hold",
   928  		"X-Amz-Website-Redirect-Location",
   929  		"X-Amz-Meta-",
   930  	}
   931  
   932  	// compare metadata on both maps to see if meta is identical
   933  	compareMeta1 := make(map[string]string)
   934  	for k, v := range oi1.UserDefined {
   935  		var found bool
   936  		for _, prefix := range compareKeys {
   937  			if !stringsHasPrefixFold(k, prefix) {
   938  				continue
   939  			}
   940  			found = true
   941  			break
   942  		}
   943  		if found {
   944  			compareMeta1[strings.ToLower(k)] = v
   945  		}
   946  	}
   947  
   948  	compareMeta2 := make(map[string]string)
   949  	for k, v := range oi2.Metadata {
   950  		var found bool
   951  		for _, prefix := range compareKeys {
   952  			if !stringsHasPrefixFold(k, prefix) {
   953  				continue
   954  			}
   955  			found = true
   956  			break
   957  		}
   958  		if found {
   959  			compareMeta2[strings.ToLower(k)] = strings.Join(v, ",")
   960  		}
   961  	}
   962  
   963  	if !reflect.DeepEqual(compareMeta1, compareMeta2) {
   964  		return replicateMetadata
   965  	}
   966  
   967  	return replicateNone
   968  }
   969  
// replicateObject replicates the specified version of the object to every
// matching destination target concurrently, then updates the source object's
// metadata to reflect the resulting replication status. Incomplete
// replications are re-queued to the MRF queue for retry.
func replicateObject(ctx context.Context, ri ReplicateObjectInfo, objectAPI ObjectLayer) {
	var replicationStatus replication.StatusType
	// Always emit an audit entry for this attempt, including early returns.
	defer func() {
		if replicationStatus.Empty() {
			// replication status is empty means
			// replication was not attempted for some
			// reason, notify the state of the object
			// on disk.
			replicationStatus = ri.ReplicationStatus
		}
		auditLogInternal(ctx, AuditLogOptions{
			Event:     ri.EventType,
			APIName:   ReplicateObjectAPI,
			Bucket:    ri.Bucket,
			Object:    ri.Name,
			VersionID: ri.VersionID,
			Status:    replicationStatus.String(),
		})
	}()

	bucket := ri.Bucket
	object := ri.Name

	cfg, err := getReplicationConfig(ctx, bucket)
	if err != nil {
		logger.LogOnceIf(ctx, err, "get-replication-config-"+bucket)
		sendEvent(eventArgs{
			EventName:  event.ObjectReplicationNotTracked,
			BucketName: bucket,
			Object:     ri.ToObjectInfo(),
			UserAgent:  "Internal: [Replication]",
			Host:       globalLocalNodeName,
		})
		return
	}
	// Targets whose replication rules match this object.
	tgtArns := cfg.FilterTargetArns(replication.ObjectOpts{
		Name:     object,
		SSEC:     ri.SSEC,
		UserTags: ri.UserTags,
	})
	// Lock the object name before starting replication.
	// Use separate lock that doesn't collide with regular objects.
	lk := objectAPI.NewNSLock(bucket, "/[replicate]/"+object)
	lkctx, err := lk.GetLock(ctx, globalOperationTimeout)
	if err != nil {
		sendEvent(eventArgs{
			EventName:  event.ObjectReplicationNotTracked,
			BucketName: bucket,
			Object:     ri.ToObjectInfo(),
			UserAgent:  "Internal: [Replication]",
			Host:       globalLocalNodeName,
		})
		// Could not acquire the lock - queue for retry via MRF.
		globalReplicationPool.queueMRFSave(ri.ToMRFEntry())
		return
	}
	ctx = lkctx.Context()
	defer lk.Unlock(lkctx)

	// Fan out to all targets concurrently, collecting one result per target.
	rinfos := replicatedInfos{Targets: make([]replicatedTargetInfo, 0, len(tgtArns))}
	var wg sync.WaitGroup
	var mu sync.Mutex
	for _, tgtArn := range tgtArns {
		tgt := globalBucketTargetSys.GetRemoteTargetClient(bucket, tgtArn)
		if tgt == nil {
			logger.LogOnceIf(ctx, fmt.Errorf("failed to get target for bucket:%s arn:%s", bucket, tgtArn), tgtArn)
			sendEvent(eventArgs{
				EventName:  event.ObjectReplicationNotTracked,
				BucketName: bucket,
				Object:     ri.ToObjectInfo(),
				UserAgent:  "Internal: [Replication]",
				Host:       globalLocalNodeName,
			})
			continue
		}
		wg.Add(1)
		go func(tgt *TargetClient) {
			defer wg.Done()

			var tgtInfo replicatedTargetInfo
			if ri.OpType == replication.ObjectReplicationType {
				// all incoming calls go through optimized path.
				tgtInfo = ri.replicateObject(ctx, objectAPI, tgt)
			} else {
				tgtInfo = ri.replicateAll(ctx, objectAPI, tgt)
			}

			// mu guards rinfos.Targets against concurrent appends.
			mu.Lock()
			rinfos.Targets = append(rinfos.Targets, tgtInfo)
			mu.Unlock()
		}(tgt)
	}
	wg.Wait()

	replicationStatus = rinfos.ReplicationStatus() // used in defer function
	// FIXME: add support for missing replication events
	// - event.ObjectReplicationMissedThreshold
	// - event.ObjectReplicationReplicatedAfterThreshold
	eventName := event.ObjectReplicationComplete
	if replicationStatus == replication.Failed {
		eventName = event.ObjectReplicationFailed
	}
	newReplStatusInternal := rinfos.ReplicationStatusInternal()
	// Note that internal replication status(es) may match for previously replicated objects - in such cases
	// metadata should be updated with last resync timestamp.
	objInfo := ri.ToObjectInfo()
	if ri.ReplicationStatusInternal != newReplStatusInternal || rinfos.ReplicationResynced() {
		popts := ObjectOptions{
			MTime:     ri.ModTime,
			VersionID: ri.VersionID,
			// EvalMetadataFn mutates the on-disk metadata in place with the
			// new per-target statuses, timestamp and resync markers.
			EvalMetadataFn: func(oi *ObjectInfo, gerr error) (dsc ReplicateDecision, err error) {
				oi.UserDefined[ReservedMetadataPrefixLower+ReplicationStatus] = newReplStatusInternal
				oi.UserDefined[ReservedMetadataPrefixLower+ReplicationTimestamp] = UTCNow().Format(time.RFC3339Nano)
				oi.UserDefined[xhttp.AmzBucketReplicationStatus] = string(rinfos.ReplicationStatus())
				for _, rinfo := range rinfos.Targets {
					if rinfo.ResyncTimestamp != "" {
						oi.UserDefined[targetResetHeader(rinfo.Arn)] = rinfo.ResyncTimestamp
					}
				}
				if ri.UserTags != "" {
					oi.UserDefined[xhttp.AmzObjectTagging] = ri.UserTags
				}
				return dsc, nil
			},
		}

		// Persist updated replication metadata on the source object; the
		// error is deliberately ignored - on failure objInfo simply keeps
		// its pre-update state.
		uobjInfo, _ := objectAPI.PutObjectMetadata(ctx, bucket, object, popts)
		if uobjInfo.Name != "" {
			objInfo = uobjInfo
		}

		opType := replication.MetadataReplicationType
		if rinfos.Action() == replicateAll {
			opType = replication.ObjectReplicationType
		}
		// Update replication stats only for targets whose status changed.
		for _, rinfo := range rinfos.Targets {
			if rinfo.ReplicationStatus != rinfo.PrevReplicationStatus {
				rinfo.OpType = opType // update optype to reflect correct operation.
				globalReplicationStats.Update(bucket, rinfo, rinfo.ReplicationStatus, rinfo.PrevReplicationStatus)
			}
		}
	}

	sendEvent(eventArgs{
		EventName:  eventName,
		BucketName: bucket,
		Object:     objInfo,
		UserAgent:  "Internal: [Replication]",
		Host:       globalLocalNodeName,
	})

	// re-queue failures once more - keep a retry count to avoid flooding the queue if
	// the target site is down. Leave it to scanner to catch up instead.
	if rinfos.ReplicationStatus() != replication.Completed {
		ri.OpType = replication.HealReplicationType
		ri.EventType = ReplicateMRF
		ri.ReplicationStatusInternal = rinfos.ReplicationStatusInternal()
		ri.RetryCount++
		globalReplicationPool.queueMRFSave(ri.ToMRFEntry())
	}
}
  1132  
// replicateObject replicates object data for specified version of the object to destination bucket
// The source object is then updated to reflect the replication status.
// This is the optimized path used for incoming ObjectReplicationType
// operations: it always performs a full object copy (no remote stat or
// metadata-only copy).
func (ri ReplicateObjectInfo) replicateObject(ctx context.Context, objectAPI ObjectLayer, tgt *TargetClient) (rinfo replicatedTargetInfo) {
	startTime := time.Now()
	bucket := ri.Bucket
	object := ri.Name

	rAction := replicateAll
	// Start pessimistically with status Failed; it is set to Completed
	// before the upload below, and reset to Failed on upload errors.
	rinfo = replicatedTargetInfo{
		Size:                  ri.ActualSize,
		Arn:                   tgt.ARN,
		PrevReplicationStatus: ri.TargetReplicationStatus(tgt.ARN),
		ReplicationStatus:     replication.Failed,
		OpType:                ri.OpType,
		ReplicationAction:     rAction,
		endpoint:              tgt.EndpointURL().Host,
		secure:                tgt.EndpointURL().Scheme == "https",
	}
	// Nothing to do if this target already replicated the version and no
	// resync is pending for it.
	if ri.TargetReplicationStatus(tgt.ARN) == replication.Completed && !ri.ExistingObjResync.Empty() && !ri.ExistingObjResync.mustResyncTarget(tgt.ARN) {
		rinfo.ReplicationStatus = replication.Completed
		rinfo.ReplicationResynced = true
		return
	}

	// Defer the attempt while the target is marked offline.
	if globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
		logger.LogOnceIf(ctx, fmt.Errorf("remote target is offline for bucket:%s arn:%s retry:%d", bucket, tgt.ARN, ri.RetryCount), "replication-target-offline"+tgt.ARN)
		sendEvent(eventArgs{
			EventName:  event.ObjectReplicationNotTracked,
			BucketName: bucket,
			Object:     ri.ToObjectInfo(),
			UserAgent:  "Internal: [Replication]",
			Host:       globalLocalNodeName,
		})
		return
	}

	versioned := globalBucketVersioningSys.PrefixEnabled(bucket, object)
	versionSuspended := globalBucketVersioningSys.PrefixSuspended(bucket, object)

	// Read the exact source version being replicated.
	gr, err := objectAPI.GetObjectNInfo(ctx, bucket, object, nil, http.Header{}, ObjectOptions{
		VersionID:          ri.VersionID,
		Versioned:          versioned,
		VersionSuspended:   versionSuspended,
		ReplicationRequest: true,
	})
	if err != nil {
		// A missing object/version was likely removed meanwhile - only
		// other read errors are notified and logged.
		if !isErrVersionNotFound(err) && !isErrObjectNotFound(err) {
			objInfo := ri.ToObjectInfo()
			sendEvent(eventArgs{
				EventName:  event.ObjectReplicationNotTracked,
				BucketName: bucket,
				Object:     objInfo,
				UserAgent:  "Internal: [Replication]",
				Host:       globalLocalNodeName,
			})
			logger.LogOnceIf(ctx, fmt.Errorf("unable to read source object %s/%s(%s): %w", bucket, object, objInfo.VersionID, err), object+":"+objInfo.VersionID)
		}
		return
	}
	defer gr.Close()

	objInfo := gr.ObjInfo

	// make sure we have the latest metadata for metrics calculation
	rinfo.PrevReplicationStatus = objInfo.TargetReplicationStatus(tgt.ARN)

	size, err := objInfo.GetActualSize()
	if err != nil {
		logger.LogIf(ctx, err)
		sendEvent(eventArgs{
			EventName:  event.ObjectReplicationNotTracked,
			BucketName: bucket,
			Object:     objInfo,
			UserAgent:  "Internal: [Replication]",
			Host:       globalLocalNodeName,
		})
		return
	}

	// Guard against a misconfigured target with no bucket set.
	if tgt.Bucket == "" {
		logger.LogIf(ctx, fmt.Errorf("unable to replicate object %s(%s), bucket is empty for target %s", objInfo.Name, objInfo.VersionID, tgt.EndpointURL()))
		sendEvent(eventArgs{
			EventName:  event.ObjectReplicationNotTracked,
			BucketName: bucket,
			Object:     objInfo,
			UserAgent:  "Internal: [Replication]",
			Host:       globalLocalNodeName,
		})
		return rinfo
	}
	// Record the resync timestamp for completed existing-object resyncs and
	// the total duration of this attempt.
	defer func() {
		if rinfo.ReplicationStatus == replication.Completed && ri.OpType == replication.ExistingObjectReplicationType && tgt.ResetID != "" {
			rinfo.ResyncTimestamp = fmt.Sprintf("%s;%s", UTCNow().Format(http.TimeFormat), tgt.ResetID)
			rinfo.ReplicationResynced = true
		}
		rinfo.Duration = time.Since(startTime)
	}()

	rinfo.ReplicationStatus = replication.Completed
	rinfo.Size = size
	rinfo.ReplicationAction = rAction
	// use core client to avoid doing multipart on PUT
	c := &minio.Core{Client: tgt.Client}

	putOpts, err := putReplicationOpts(ctx, tgt.StorageClass, objInfo)
	if err != nil {
		logger.LogIf(ctx, fmt.Errorf("failure setting options for replication bucket:%s err:%w", bucket, err))
		sendEvent(eventArgs{
			EventName:  event.ObjectReplicationNotTracked,
			BucketName: bucket,
			Object:     objInfo,
			UserAgent:  "Internal: [Replication]",
			Host:       globalLocalNodeName,
		})
		return
	}

	// Header bytes count towards bandwidth monitoring as well.
	var headerSize int
	for k, v := range putOpts.Header() {
		headerSize += len(k) + len(v)
	}

	opts := &bandwidth.MonitorReaderOptions{
		BucketOptions: bandwidth.BucketOptions{
			Name:           ri.Bucket,
			ReplicationARN: tgt.ARN,
		},
		HeaderSize: headerSize,
	}
	// When bandwidth throttling is active, bound the transfer time.
	newCtx := ctx
	if globalBucketMonitor.IsThrottled(bucket, tgt.ARN) {
		var cancel context.CancelFunc
		newCtx, cancel = context.WithTimeout(ctx, throttleDeadline)
		defer cancel()
	}
	r := bandwidth.NewMonitoredReader(newCtx, globalBucketMonitor, gr, opts)
	if objInfo.isMultipart() {
		// Multipart source objects are replicated via multipart upload.
		// A "PreconditionFailed" response from the target is not treated
		// as a replication failure.
		if rinfo.Err = replicateObjectWithMultipart(ctx, c, tgt.Bucket, object,
			r, objInfo, putOpts); rinfo.Err != nil {
			if minio.ToErrorResponse(rinfo.Err).Code != "PreconditionFailed" {
				rinfo.ReplicationStatus = replication.Failed
				logger.LogIf(ctx, fmt.Errorf("unable to replicate for object %s/%s(%s): %s (target: %s)", bucket, objInfo.Name, objInfo.VersionID, rinfo.Err, tgt.EndpointURL()))
			}
		}
	} else {
		if _, rinfo.Err = c.PutObject(ctx, tgt.Bucket, object, r, size, "", "", putOpts); rinfo.Err != nil {
			if minio.ToErrorResponse(rinfo.Err).Code != "PreconditionFailed" {
				rinfo.ReplicationStatus = replication.Failed
				logger.LogIf(ctx, fmt.Errorf("unable to replicate for object %s/%s(%s): %s (target: %s)", bucket, objInfo.Name, objInfo.VersionID, rinfo.Err, tgt.EndpointURL()))
			}
		}
	}
	// Mark the target offline on network errors so subsequent attempts
	// back off until it is reachable again.
	if rinfo.Err != nil && minio.IsNetworkOrHostDown(rinfo.Err, true) && !globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
		globalBucketTargetSys.markOffline(tgt.EndpointURL())
	}
	return
}
  1290  
  1291  // replicateAll replicates metadata for specified version of the object to destination bucket
  1292  // if the destination version is missing it automatically does fully copy as well.
  1293  // The source object is then updated to reflect the replication status.
  1294  func (ri ReplicateObjectInfo) replicateAll(ctx context.Context, objectAPI ObjectLayer, tgt *TargetClient) (rinfo replicatedTargetInfo) {
  1295  	startTime := time.Now()
  1296  	bucket := ri.Bucket
  1297  	object := ri.Name
  1298  
  1299  	// set defaults for replication action based on operation being performed - actual
  1300  	// replication action can only be determined after stat on remote. This default is
  1301  	// needed for updating replication metrics correctly when target is offline.
  1302  	rAction := replicateMetadata
  1303  
  1304  	rinfo = replicatedTargetInfo{
  1305  		Size:                  ri.ActualSize,
  1306  		Arn:                   tgt.ARN,
  1307  		PrevReplicationStatus: ri.TargetReplicationStatus(tgt.ARN),
  1308  		ReplicationStatus:     replication.Failed,
  1309  		OpType:                ri.OpType,
  1310  		ReplicationAction:     rAction,
  1311  		endpoint:              tgt.EndpointURL().Host,
  1312  		secure:                tgt.EndpointURL().Scheme == "https",
  1313  	}
  1314  
  1315  	if globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
  1316  		logger.LogOnceIf(ctx, fmt.Errorf("remote target is offline for bucket:%s arn:%s retry:%d", bucket, tgt.ARN, ri.RetryCount), "replication-target-offline-heal"+tgt.ARN)
  1317  		sendEvent(eventArgs{
  1318  			EventName:  event.ObjectReplicationNotTracked,
  1319  			BucketName: bucket,
  1320  			Object:     ri.ToObjectInfo(),
  1321  			UserAgent:  "Internal: [Replication]",
  1322  			Host:       globalLocalNodeName,
  1323  		})
  1324  		return
  1325  	}
  1326  
  1327  	versioned := globalBucketVersioningSys.PrefixEnabled(bucket, object)
  1328  	versionSuspended := globalBucketVersioningSys.PrefixSuspended(bucket, object)
  1329  
  1330  	gr, err := objectAPI.GetObjectNInfo(ctx, bucket, object, nil, http.Header{},
  1331  		ObjectOptions{
  1332  			VersionID:          ri.VersionID,
  1333  			Versioned:          versioned,
  1334  			VersionSuspended:   versionSuspended,
  1335  			ReplicationRequest: true,
  1336  		})
  1337  	if err != nil {
  1338  		if !isErrVersionNotFound(err) && !isErrObjectNotFound(err) {
  1339  			objInfo := ri.ToObjectInfo()
  1340  			sendEvent(eventArgs{
  1341  				EventName:  event.ObjectReplicationNotTracked,
  1342  				BucketName: bucket,
  1343  				Object:     objInfo,
  1344  				UserAgent:  "Internal: [Replication]",
  1345  				Host:       globalLocalNodeName,
  1346  			})
  1347  			logger.LogIf(ctx, fmt.Errorf("unable to replicate to target %s for %s/%s(%s): %w", tgt.EndpointURL(), bucket, object, objInfo.VersionID, err))
  1348  		}
  1349  		return
  1350  	}
  1351  	defer gr.Close()
  1352  
  1353  	objInfo := gr.ObjInfo
  1354  
  1355  	// make sure we have the latest metadata for metrics calculation
  1356  	rinfo.PrevReplicationStatus = objInfo.TargetReplicationStatus(tgt.ARN)
  1357  
  1358  	// use latest ObjectInfo to check if previous replication attempt succeeded
  1359  	if objInfo.TargetReplicationStatus(tgt.ARN) == replication.Completed && !ri.ExistingObjResync.Empty() && !ri.ExistingObjResync.mustResyncTarget(tgt.ARN) {
  1360  		rinfo.ReplicationStatus = replication.Completed
  1361  		rinfo.ReplicationResynced = true
  1362  		return
  1363  	}
  1364  
  1365  	size, err := objInfo.GetActualSize()
  1366  	if err != nil {
  1367  		logger.LogIf(ctx, err)
  1368  		sendEvent(eventArgs{
  1369  			EventName:  event.ObjectReplicationNotTracked,
  1370  			BucketName: bucket,
  1371  			Object:     objInfo,
  1372  			UserAgent:  "Internal: [Replication]",
  1373  			Host:       globalLocalNodeName,
  1374  		})
  1375  		return
  1376  	}
  1377  
  1378  	// Set the encrypted size for SSE-C objects
  1379  	if crypto.SSEC.IsEncrypted(objInfo.UserDefined) {
  1380  		size = objInfo.Size
  1381  	}
  1382  
  1383  	if tgt.Bucket == "" {
  1384  		logger.LogIf(ctx, fmt.Errorf("unable to replicate object %s(%s) to %s, target bucket is missing", objInfo.Name, objInfo.VersionID, tgt.EndpointURL()))
  1385  		sendEvent(eventArgs{
  1386  			EventName:  event.ObjectReplicationNotTracked,
  1387  			BucketName: bucket,
  1388  			Object:     objInfo,
  1389  			UserAgent:  "Internal: [Replication]",
  1390  			Host:       globalLocalNodeName,
  1391  		})
  1392  		return rinfo
  1393  	}
  1394  	defer func() {
  1395  		if rinfo.ReplicationStatus == replication.Completed && ri.OpType == replication.ExistingObjectReplicationType && tgt.ResetID != "" {
  1396  			rinfo.ResyncTimestamp = fmt.Sprintf("%s;%s", UTCNow().Format(http.TimeFormat), tgt.ResetID)
  1397  			rinfo.ReplicationResynced = true
  1398  		}
  1399  		rinfo.Duration = time.Since(startTime)
  1400  	}()
  1401  
  1402  	oi, cerr := tgt.StatObject(ctx, tgt.Bucket, object, minio.StatObjectOptions{
  1403  		VersionID: objInfo.VersionID,
  1404  		Internal: minio.AdvancedGetOptions{
  1405  			ReplicationProxyRequest: "false",
  1406  		},
  1407  	})
  1408  	if cerr == nil {
  1409  		rAction = getReplicationAction(objInfo, oi, ri.OpType)
  1410  		rinfo.ReplicationStatus = replication.Completed
  1411  		if rAction == replicateNone {
  1412  			if ri.OpType == replication.ExistingObjectReplicationType &&
  1413  				objInfo.ModTime.Unix() > oi.LastModified.Unix() && objInfo.VersionID == nullVersionID {
  1414  				logger.LogIf(ctx, fmt.Errorf("unable to replicate %s/%s (null). Newer version exists on target %s", bucket, object, tgt.EndpointURL()))
  1415  				sendEvent(eventArgs{
  1416  					EventName:  event.ObjectReplicationNotTracked,
  1417  					BucketName: bucket,
  1418  					Object:     objInfo,
  1419  					UserAgent:  "Internal: [Replication]",
  1420  					Host:       globalLocalNodeName,
  1421  				})
  1422  			}
  1423  			// object with same VersionID already exists, replication kicked off by
  1424  			// PutObject might have completed
  1425  			if objInfo.TargetReplicationStatus(tgt.ARN) == replication.Pending ||
  1426  				objInfo.TargetReplicationStatus(tgt.ARN) == replication.Failed ||
  1427  				ri.OpType == replication.ExistingObjectReplicationType {
  1428  				// if metadata is not updated for some reason after replication, such as
  1429  				// 503 encountered while updating metadata - make sure to set ReplicationStatus
  1430  				// as Completed.
  1431  				//
  1432  				// Note: Replication Stats would have been updated despite metadata update failure.
  1433  				rinfo.ReplicationAction = rAction
  1434  				rinfo.ReplicationStatus = replication.Completed
  1435  			}
  1436  			return
  1437  		}
  1438  	} else {
  1439  		// if target returns error other than NoSuchKey, defer replication attempt
  1440  		if minio.IsNetworkOrHostDown(cerr, true) && !globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
  1441  			globalBucketTargetSys.markOffline(tgt.EndpointURL())
  1442  		}
  1443  
  1444  		serr := ErrorRespToObjectError(cerr, bucket, object, objInfo.VersionID)
  1445  		switch {
  1446  		case isErrMethodNotAllowed(serr):
  1447  			rAction = replicateAll
  1448  		case isErrObjectNotFound(serr), isErrVersionNotFound(serr):
  1449  			rAction = replicateAll
  1450  		case isErrReadQuorum(serr), isErrWriteQuorum(serr):
  1451  			rAction = replicateAll
  1452  		default:
  1453  			rinfo.Err = cerr
  1454  			logger.LogIf(ctx, fmt.Errorf("unable to replicate %s/%s (%s). Target (%s) returned %s error on HEAD",
  1455  				bucket, object, objInfo.VersionID, tgt.EndpointURL(), cerr))
  1456  			sendEvent(eventArgs{
  1457  				EventName:  event.ObjectReplicationNotTracked,
  1458  				BucketName: bucket,
  1459  				Object:     objInfo,
  1460  				UserAgent:  "Internal: [Replication]",
  1461  				Host:       globalLocalNodeName,
  1462  			})
  1463  			return
  1464  		}
  1465  	}
  1466  	rinfo.ReplicationStatus = replication.Completed
  1467  	rinfo.Size = size
  1468  	rinfo.ReplicationAction = rAction
  1469  	// use core client to avoid doing multipart on PUT
  1470  	c := &minio.Core{Client: tgt.Client}
  1471  	if rAction != replicateAll {
  1472  		// replicate metadata for object tagging/copy with metadata replacement
  1473  		srcOpts := minio.CopySrcOptions{
  1474  			Bucket:    tgt.Bucket,
  1475  			Object:    object,
  1476  			VersionID: objInfo.VersionID,
  1477  		}
  1478  		dstOpts := minio.PutObjectOptions{
  1479  			Internal: minio.AdvancedPutOptions{
  1480  				SourceVersionID:    objInfo.VersionID,
  1481  				ReplicationRequest: true, // always set this to distinguish between `mc mirror` replication and serverside
  1482  			},
  1483  		}
  1484  		if tagTmStr, ok := objInfo.UserDefined[ReservedMetadataPrefixLower+TaggingTimestamp]; ok {
  1485  			ondiskTimestamp, err := time.Parse(time.RFC3339, tagTmStr)
  1486  			if err == nil {
  1487  				dstOpts.Internal.TaggingTimestamp = ondiskTimestamp
  1488  			}
  1489  		}
  1490  		if retTmStr, ok := objInfo.UserDefined[ReservedMetadataPrefixLower+ObjectLockRetentionTimestamp]; ok {
  1491  			ondiskTimestamp, err := time.Parse(time.RFC3339, retTmStr)
  1492  			if err == nil {
  1493  				dstOpts.Internal.RetentionTimestamp = ondiskTimestamp
  1494  			}
  1495  		}
  1496  		if lholdTmStr, ok := objInfo.UserDefined[ReservedMetadataPrefixLower+ObjectLockLegalHoldTimestamp]; ok {
  1497  			ondiskTimestamp, err := time.Parse(time.RFC3339, lholdTmStr)
  1498  			if err == nil {
  1499  				dstOpts.Internal.LegalholdTimestamp = ondiskTimestamp
  1500  			}
  1501  		}
  1502  		if _, rinfo.Err = c.CopyObject(ctx, tgt.Bucket, object, tgt.Bucket, object, getCopyObjMetadata(objInfo, tgt.StorageClass), srcOpts, dstOpts); rinfo.Err != nil {
  1503  			rinfo.ReplicationStatus = replication.Failed
  1504  			logger.LogIf(ctx, fmt.Errorf("unable to replicate metadata for object %s/%s(%s) to target %s: %w", bucket, objInfo.Name, objInfo.VersionID, tgt.EndpointURL(), rinfo.Err))
  1505  		}
  1506  	} else {
  1507  		var putOpts minio.PutObjectOptions
  1508  		putOpts, err = putReplicationOpts(ctx, tgt.StorageClass, objInfo)
  1509  		if err != nil {
  1510  			logger.LogIf(ctx, fmt.Errorf("failed to set replicate options for object %s/%s(%s) (target %s) err:%w", bucket, objInfo.Name, objInfo.VersionID, tgt.EndpointURL(), err))
  1511  			sendEvent(eventArgs{
  1512  				EventName:  event.ObjectReplicationNotTracked,
  1513  				BucketName: bucket,
  1514  				Object:     objInfo,
  1515  				UserAgent:  "Internal: [Replication]",
  1516  				Host:       globalLocalNodeName,
  1517  			})
  1518  			return
  1519  		}
  1520  		var headerSize int
  1521  		for k, v := range putOpts.Header() {
  1522  			headerSize += len(k) + len(v)
  1523  		}
  1524  
  1525  		opts := &bandwidth.MonitorReaderOptions{
  1526  			BucketOptions: bandwidth.BucketOptions{
  1527  				Name:           objInfo.Bucket,
  1528  				ReplicationARN: tgt.ARN,
  1529  			},
  1530  			HeaderSize: headerSize,
  1531  		}
  1532  		newCtx := ctx
  1533  		if globalBucketMonitor.IsThrottled(bucket, tgt.ARN) {
  1534  			var cancel context.CancelFunc
  1535  			newCtx, cancel = context.WithTimeout(ctx, throttleDeadline)
  1536  			defer cancel()
  1537  		}
  1538  		r := bandwidth.NewMonitoredReader(newCtx, globalBucketMonitor, gr, opts)
  1539  		if objInfo.isMultipart() {
  1540  			if rinfo.Err = replicateObjectWithMultipart(ctx, c, tgt.Bucket, object,
  1541  				r, objInfo, putOpts); rinfo.Err != nil {
  1542  				if minio.ToErrorResponse(rinfo.Err).Code != "PreconditionFailed" {
  1543  					rinfo.ReplicationStatus = replication.Failed
  1544  					logger.LogIf(ctx, fmt.Errorf("unable to replicate for object %s/%s(%s) to target %s: %w", bucket, objInfo.Name, objInfo.VersionID, tgt.EndpointURL(), rinfo.Err))
  1545  				} else {
  1546  					rinfo.ReplicationStatus = replication.Completed
  1547  				}
  1548  			}
  1549  		} else {
  1550  			if _, rinfo.Err = c.PutObject(ctx, tgt.Bucket, object, r, size, "", "", putOpts); rinfo.Err != nil {
  1551  				if minio.ToErrorResponse(rinfo.Err).Code != "PreconditionFailed" {
  1552  					rinfo.ReplicationStatus = replication.Failed
  1553  					logger.LogIf(ctx, fmt.Errorf("unable to replicate for object %s/%s(%s) to target %s: %w", bucket, objInfo.Name, objInfo.VersionID, tgt.EndpointURL(), rinfo.Err))
  1554  				} else {
  1555  					rinfo.ReplicationStatus = replication.Completed
  1556  				}
  1557  			}
  1558  		}
  1559  		if rinfo.Err != nil && minio.IsNetworkOrHostDown(rinfo.Err, true) && !globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
  1560  			globalBucketTargetSys.markOffline(tgt.EndpointURL())
  1561  		}
  1562  	}
  1563  	return
  1564  }
  1565  
// replicateObjectWithMultipart replicates a multipart object to the remote
// target bucket part by part, preserving the part boundaries recorded in
// objInfo.Parts so the remote part layout matches the source.
//
// The multipart upload is created with up to 3 attempts (jittered backoff up
// to 1s each); on any subsequent failure the deferred cleanup tries up to 3
// times to abort the remote upload so it does not consume space on the remote
// cluster.
func replicateObjectWithMultipart(ctx context.Context, c *minio.Core, bucket, object string, r io.Reader, objInfo ObjectInfo, opts minio.PutObjectOptions) (err error) {
	var uploadedParts []minio.CompletePart
	// new multipart must not set mtime as it may lead to erroneous cleanups at various intervals.
	opts.Internal.SourceMTime = time.Time{} // this value is saved properly in CompleteMultipartUpload()
	var uploadID string
	attempts := 1
	for attempts <= 3 {
		// Bound each create attempt to one minute.
		nctx, cancel := context.WithTimeout(ctx, time.Minute)
		uploadID, err = c.NewMultipartUpload(nctx, bucket, object, opts)
		cancel()
		if err == nil {
			break
		}
		// PreconditionFailed is not retryable here - give up immediately.
		if minio.ToErrorResponse(err).Code == "PreconditionFailed" {
			return err
		}
		attempts++
		time.Sleep(time.Duration(rand.Int63n(int64(time.Second))))
	}
	if err != nil {
		return err
	}

	// Best-effort abort of the remote upload if anything below fails; `err`
	// is the named return, so this sees the final outcome of the function.
	defer func() {
		if err != nil {
			// block and abort remote upload upon failure.
			attempts := 1
			for attempts <= 3 {
				actx, acancel := context.WithTimeout(ctx, time.Minute)
				aerr := c.AbortMultipartUpload(actx, bucket, object, uploadID)
				if aerr == nil {
					acancel()
					return
				}
				acancel()
				logger.LogIf(actx,
					fmt.Errorf("trying %s: Unable to cleanup failed multipart replication %s on remote %s/%s: %w - this may consume space on remote cluster",
						humanize.Ordinal(attempts), uploadID, bucket, object, aerr))
				attempts++
				time.Sleep(time.Duration(rand.Int63n(int64(time.Second))))
			}
		}
	}()

	var (
		hr    *hash.Reader
		pInfo minio.ObjectPart
	)

	// NOTE(review): objectSize is accumulated below but never read afterwards
	// - candidate for removal; confirm no hidden use before deleting.
	var objectSize int64
	for _, partInfo := range objInfo.Parts {
		// For SSE-C objects the stored (encrypted) part size is streamed;
		// otherwise the actual (decrypted) size is what the remote receives.
		if crypto.SSEC.IsEncrypted(objInfo.UserDefined) {
			hr, err = hash.NewReader(ctx, io.LimitReader(r, partInfo.Size), partInfo.Size, "", "", partInfo.ActualSize)
		} else {
			hr, err = hash.NewReader(ctx, io.LimitReader(r, partInfo.ActualSize), partInfo.ActualSize, "", "", partInfo.ActualSize)
		}
		if err != nil {
			return err
		}

		// Mark the part upload as a server-side replication request.
		cHeader := http.Header{}
		cHeader.Add(xhttp.MinIOSourceReplicationRequest, "true")
		popts := minio.PutObjectPartOptions{
			SSE:          opts.ServerSideEncryption,
			CustomHeader: cHeader,
		}

		if crypto.SSEC.IsEncrypted(objInfo.UserDefined) {
			objectSize += partInfo.Size
			pInfo, err = c.PutObjectPart(ctx, bucket, object, uploadID, partInfo.Number, hr, partInfo.Size, popts)
		} else {
			objectSize += partInfo.ActualSize
			pInfo, err = c.PutObjectPart(ctx, bucket, object, uploadID, partInfo.Number, hr, partInfo.ActualSize, popts)
		}
		if err != nil {
			return err
		}
		// Sanity check: the remote must report exactly the bytes we sent
		// (skipped for SSE-C, where sizes legitimately differ).
		if !crypto.SSEC.IsEncrypted(objInfo.UserDefined) && pInfo.Size != partInfo.ActualSize {
			return fmt.Errorf("Part size mismatch: got %d, want %d", pInfo.Size, partInfo.ActualSize)
		}
		uploadedParts = append(uploadedParts, minio.CompletePart{
			PartNumber: pInfo.PartNumber,
			ETag:       pInfo.ETag,
		})
	}
	// Complete with a generous deadline; SourceMTime is restored here (see
	// note at top of function).
	cctx, ccancel := context.WithTimeout(ctx, 10*time.Minute)
	defer ccancel()
	_, err = c.CompleteMultipartUpload(cctx, bucket, object, uploadID, uploadedParts, minio.PutObjectOptions{
		UserMetadata: map[string]string{validSSEReplicationHeaders[ReservedMetadataPrefix+"Actual-Object-Size"]: objInfo.UserDefined[ReservedMetadataPrefix+"actual-size"]},
		Internal: minio.AdvancedPutOptions{
			SourceMTime: objInfo.ModTime,
			// always set this to distinguish between `mc mirror` replication and serverside
			ReplicationRequest: true,
		},
	})
	return err
}
  1663  
  1664  // filterReplicationStatusMetadata filters replication status metadata for COPY
  1665  func filterReplicationStatusMetadata(metadata map[string]string) map[string]string {
  1666  	// Copy on write
  1667  	dst := metadata
  1668  	var copied bool
  1669  	delKey := func(key string) {
  1670  		if _, ok := metadata[key]; !ok {
  1671  			return
  1672  		}
  1673  		if !copied {
  1674  			dst = make(map[string]string, len(metadata))
  1675  			for k, v := range metadata {
  1676  				dst[k] = v
  1677  			}
  1678  			copied = true
  1679  		}
  1680  		delete(dst, key)
  1681  	}
  1682  
  1683  	delKey(xhttp.AmzBucketReplicationStatus)
  1684  	return dst
  1685  }
  1686  
// DeletedObjectReplicationInfo has info on deleted object
type DeletedObjectReplicationInfo struct {
	DeletedObject
	Bucket    string           // bucket the delete was performed on
	EventType string           // audit/event trail type for this delete replication
	OpType    replication.Type // kind of replication operation (incoming/heal/existing-object)
	ResetID   string           // resync reset ID; set when queued by an existing-object resync
	TargetArn string           // specific target ARN; set when queued for a single target
}
  1696  
  1697  // ToMRFEntry returns the relevant info needed by MRF
  1698  func (di DeletedObjectReplicationInfo) ToMRFEntry() MRFReplicateEntry {
  1699  	versionID := di.DeleteMarkerVersionID
  1700  	if versionID == "" {
  1701  		versionID = di.VersionID
  1702  	}
  1703  	return MRFReplicateEntry{
  1704  		Bucket:    di.Bucket,
  1705  		Object:    di.ObjectName,
  1706  		versionID: versionID,
  1707  	}
  1708  }
  1709  
// Replication specific APIName
const (
	// ReplicateObjectAPI is the API name recorded when replicating object data.
	ReplicateObjectAPI = "ReplicateObject"
	// ReplicateDeleteAPI is the API name recorded when replicating deletes.
	ReplicateDeleteAPI = "ReplicateDelete"
)

// Audit trail identifiers describing how a replication operation was queued
// or executed.
const (
	// ReplicateQueued - replication being queued trail
	ReplicateQueued = "replicate:queue"

	// ReplicateExisting - audit trail for existing objects replication
	ReplicateExisting = "replicate:existing"
	// ReplicateExistingDelete - audit trail for delete replication triggered for existing delete markers
	ReplicateExistingDelete = "replicate:existing:delete"

	// ReplicateMRF - audit trail for replication from Most Recent Failures (MRF) queue
	ReplicateMRF = "replicate:mrf"
	// ReplicateIncoming - audit trail of inline replication
	ReplicateIncoming = "replicate:incoming"
	// ReplicateIncomingDelete - audit trail of inline replication of deletes.
	ReplicateIncomingDelete = "replicate:incoming:delete"

	// ReplicateHeal - audit trail for healing of failed/pending replications
	ReplicateHeal = "replicate:heal"
	// ReplicateHealDelete - audit trail of healing of failed/pending delete replications.
	ReplicateHealDelete = "replicate:heal:delete"
)
  1737  
// Process-wide replication state, initialized by initBackgroundReplication:
// globalReplicationPool executes queued replication/delete operations and
// globalReplicationStats tracks per-bucket replication queue metrics.
var (
	globalReplicationPool  *ReplicationPool
	globalReplicationStats *ReplicationStats
)
  1742  
// ReplicationPool describes replication pool
type ReplicationPool struct {
	// atomic ops:
	activeWorkers    int32 // count of regular workers currently executing an operation
	activeMRFWorkers int32 // count of MRF workers currently executing an operation

	objLayer   ObjectLayer     // object layer used to execute replication operations
	ctx        context.Context // lifetime context; workers exit when it is done
	priority   string          // "fast", "slow" or "auto" sizing policy
	maxWorkers int             // upper bound on worker counts
	mu         sync.RWMutex    // guards workers/lrgworkers slices and priority/maxWorkers
	mrfMU      sync.Mutex
	resyncer   *replicationResyncer

	// workers:
	workers    []chan ReplicationWorkerOperation // resizable pool for regular objects
	lrgworkers []chan ReplicationWorkerOperation // static pool for large (>= minLargeObjSize) objects

	// mrf:
	mrfWorkerKillCh chan struct{}                     // signals surplus MRF workers to exit
	mrfReplicaCh    chan ReplicationWorkerOperation   // queue consumed by MRF workers
	mrfSaveCh       chan MRFReplicateEntry            // entries to persist for later retry
	mrfStopCh       chan struct{}
	mrfWorkerSize   int // current number of MRF workers
}
  1768  
// ReplicationWorkerOperation is a shared interface of replication operations.
// Implementations visible in this file include ReplicateObjectInfo and
// DeletedObjectReplicationInfo; ToMRFEntry converts the operation into an
// entry for the "most recent failures" retry queue.
type ReplicationWorkerOperation interface {
	ToMRFEntry() MRFReplicateEntry
}
  1773  
// Worker-pool sizing limits for the "fast"/"slow"/"auto" replication
// priorities (see NewReplicationPool and ResizeWorkerPriority).
const (
	// WorkerMaxLimit max number of workers per node for "fast" mode
	WorkerMaxLimit = 500

	// WorkerMinLimit min number of workers per node for "slow" mode
	WorkerMinLimit = 50

	// WorkerAutoDefault is default number of workers for "auto" mode
	WorkerAutoDefault = 100

	// MRFWorkerMaxLimit max number of mrf workers per node for "fast" mode
	MRFWorkerMaxLimit = 8

	// MRFWorkerMinLimit min number of mrf workers per node for "slow" mode
	MRFWorkerMinLimit = 2

	// MRFWorkerAutoDefault is default number of mrf workers for "auto" mode
	MRFWorkerAutoDefault = 4

	// LargeWorkerCount is default number of workers assigned to large uploads ( >= 128MiB)
	LargeWorkerCount = 10
)
  1796  
  1797  // NewReplicationPool creates a pool of replication workers of specified size
  1798  func NewReplicationPool(ctx context.Context, o ObjectLayer, opts replicationPoolOpts) *ReplicationPool {
  1799  	var workers, failedWorkers int
  1800  	priority := "auto"
  1801  	maxWorkers := WorkerMaxLimit
  1802  	if opts.Priority != "" {
  1803  		priority = opts.Priority
  1804  	}
  1805  	if opts.MaxWorkers > 0 {
  1806  		maxWorkers = opts.MaxWorkers
  1807  	}
  1808  	switch priority {
  1809  	case "fast":
  1810  		workers = WorkerMaxLimit
  1811  		failedWorkers = MRFWorkerMaxLimit
  1812  	case "slow":
  1813  		workers = WorkerMinLimit
  1814  		failedWorkers = MRFWorkerMinLimit
  1815  	default:
  1816  		workers = WorkerAutoDefault
  1817  		failedWorkers = MRFWorkerAutoDefault
  1818  	}
  1819  	if maxWorkers > 0 && workers > maxWorkers {
  1820  		workers = maxWorkers
  1821  	}
  1822  
  1823  	if maxWorkers > 0 && failedWorkers > maxWorkers {
  1824  		failedWorkers = maxWorkers
  1825  	}
  1826  	pool := &ReplicationPool{
  1827  		workers:         make([]chan ReplicationWorkerOperation, 0, workers),
  1828  		lrgworkers:      make([]chan ReplicationWorkerOperation, 0, LargeWorkerCount),
  1829  		mrfReplicaCh:    make(chan ReplicationWorkerOperation, 100000),
  1830  		mrfWorkerKillCh: make(chan struct{}, failedWorkers),
  1831  		resyncer:        newresyncer(),
  1832  		mrfSaveCh:       make(chan MRFReplicateEntry, 100000),
  1833  		mrfStopCh:       make(chan struct{}, 1),
  1834  		ctx:             ctx,
  1835  		objLayer:        o,
  1836  		priority:        priority,
  1837  		maxWorkers:      maxWorkers,
  1838  	}
  1839  
  1840  	pool.AddLargeWorkers()
  1841  	pool.ResizeWorkers(workers, 0)
  1842  	pool.ResizeFailedWorkers(failedWorkers)
  1843  	go pool.resyncer.PersistToDisk(ctx, o)
  1844  	go pool.processMRF()
  1845  	go pool.persistMRF()
  1846  	return pool
  1847  }
  1848  
  1849  // AddMRFWorker adds a pending/failed replication worker to handle requests that could not be queued
  1850  // to the other workers
  1851  func (p *ReplicationPool) AddMRFWorker() {
  1852  	for {
  1853  		select {
  1854  		case <-p.ctx.Done():
  1855  			return
  1856  		case oi, ok := <-p.mrfReplicaCh:
  1857  			if !ok {
  1858  				return
  1859  			}
  1860  			switch v := oi.(type) {
  1861  			case ReplicateObjectInfo:
  1862  				globalReplicationStats.incQ(v.Bucket, v.Size, v.DeleteMarker, v.OpType)
  1863  				atomic.AddInt32(&p.activeMRFWorkers, 1)
  1864  				replicateObject(p.ctx, v, p.objLayer)
  1865  				atomic.AddInt32(&p.activeMRFWorkers, -1)
  1866  				globalReplicationStats.decQ(v.Bucket, v.Size, v.DeleteMarker, v.OpType)
  1867  
  1868  			default:
  1869  				logger.LogOnceIf(p.ctx, fmt.Errorf("unknown mrf replication type: %T", oi), "unknown-mrf-replicate-type")
  1870  			}
  1871  		case <-p.mrfWorkerKillCh:
  1872  			return
  1873  		}
  1874  	}
  1875  }
  1876  
  1877  // AddWorker adds a replication worker to the pool.
  1878  // An optional pointer to a tracker that will be atomically
  1879  // incremented when operations are running can be provided.
  1880  func (p *ReplicationPool) AddWorker(input <-chan ReplicationWorkerOperation, opTracker *int32) {
  1881  	for {
  1882  		select {
  1883  		case <-p.ctx.Done():
  1884  			return
  1885  		case oi, ok := <-input:
  1886  			if !ok {
  1887  				return
  1888  			}
  1889  			switch v := oi.(type) {
  1890  			case ReplicateObjectInfo:
  1891  				if opTracker != nil {
  1892  					atomic.AddInt32(opTracker, 1)
  1893  				}
  1894  				globalReplicationStats.incQ(v.Bucket, v.Size, v.DeleteMarker, v.OpType)
  1895  				replicateObject(p.ctx, v, p.objLayer)
  1896  				globalReplicationStats.decQ(v.Bucket, v.Size, v.DeleteMarker, v.OpType)
  1897  				if opTracker != nil {
  1898  					atomic.AddInt32(opTracker, -1)
  1899  				}
  1900  			case DeletedObjectReplicationInfo:
  1901  				if opTracker != nil {
  1902  					atomic.AddInt32(opTracker, 1)
  1903  				}
  1904  				globalReplicationStats.incQ(v.Bucket, 0, true, v.OpType)
  1905  
  1906  				replicateDelete(p.ctx, v, p.objLayer)
  1907  				globalReplicationStats.decQ(v.Bucket, 0, true, v.OpType)
  1908  
  1909  				if opTracker != nil {
  1910  					atomic.AddInt32(opTracker, -1)
  1911  				}
  1912  			default:
  1913  				logger.LogOnceIf(p.ctx, fmt.Errorf("unknown replication type: %T", oi), "unknown-replicate-type")
  1914  			}
  1915  		}
  1916  	}
  1917  }
  1918  
  1919  // AddLargeWorkers adds a static number of workers to handle large uploads
  1920  func (p *ReplicationPool) AddLargeWorkers() {
  1921  	for i := 0; i < LargeWorkerCount; i++ {
  1922  		p.lrgworkers = append(p.lrgworkers, make(chan ReplicationWorkerOperation, 100000))
  1923  		i := i
  1924  		go p.AddLargeWorker(p.lrgworkers[i])
  1925  	}
  1926  	go func() {
  1927  		<-p.ctx.Done()
  1928  		for i := 0; i < LargeWorkerCount; i++ {
  1929  			xioutil.SafeClose(p.lrgworkers[i])
  1930  		}
  1931  	}()
  1932  }
  1933  
  1934  // AddLargeWorker adds a replication worker to the static pool for large uploads.
  1935  func (p *ReplicationPool) AddLargeWorker(input <-chan ReplicationWorkerOperation) {
  1936  	for {
  1937  		select {
  1938  		case <-p.ctx.Done():
  1939  			return
  1940  		case oi, ok := <-input:
  1941  			if !ok {
  1942  				return
  1943  			}
  1944  			switch v := oi.(type) {
  1945  			case ReplicateObjectInfo:
  1946  				globalReplicationStats.incQ(v.Bucket, v.Size, v.DeleteMarker, v.OpType)
  1947  				replicateObject(p.ctx, v, p.objLayer)
  1948  				globalReplicationStats.decQ(v.Bucket, v.Size, v.DeleteMarker, v.OpType)
  1949  			case DeletedObjectReplicationInfo:
  1950  				replicateDelete(p.ctx, v, p.objLayer)
  1951  			default:
  1952  				logger.LogOnceIf(p.ctx, fmt.Errorf("unknown replication type: %T", oi), "unknown-replicate-type")
  1953  			}
  1954  		}
  1955  	}
  1956  }
  1957  
  1958  // ActiveWorkers returns the number of active workers handling replication traffic.
  1959  func (p *ReplicationPool) ActiveWorkers() int {
  1960  	return int(atomic.LoadInt32(&p.activeWorkers))
  1961  }
  1962  
  1963  // ActiveMRFWorkers returns the number of active workers handling replication failures.
  1964  func (p *ReplicationPool) ActiveMRFWorkers() int {
  1965  	return int(atomic.LoadInt32(&p.activeMRFWorkers))
  1966  }
  1967  
  1968  // ResizeWorkers sets replication workers pool to new size.
  1969  // checkOld can be set to an expected value.
  1970  // If the worker count changed
  1971  func (p *ReplicationPool) ResizeWorkers(n, checkOld int) {
  1972  	p.mu.Lock()
  1973  	defer p.mu.Unlock()
  1974  
  1975  	if (checkOld > 0 && len(p.workers) != checkOld) || n == len(p.workers) || n < 1 {
  1976  		// Either already satisfied or worker count changed while we waited for the lock.
  1977  		return
  1978  	}
  1979  	for len(p.workers) < n {
  1980  		input := make(chan ReplicationWorkerOperation, 10000)
  1981  		p.workers = append(p.workers, input)
  1982  
  1983  		go p.AddWorker(input, &p.activeWorkers)
  1984  	}
  1985  	for len(p.workers) > n {
  1986  		worker := p.workers[len(p.workers)-1]
  1987  		p.workers = p.workers[:len(p.workers)-1]
  1988  		xioutil.SafeClose(worker)
  1989  	}
  1990  }
  1991  
  1992  // ResizeWorkerPriority sets replication failed workers pool size
  1993  func (p *ReplicationPool) ResizeWorkerPriority(pri string, maxWorkers int) {
  1994  	var workers, mrfWorkers int
  1995  	p.mu.Lock()
  1996  	switch pri {
  1997  	case "fast":
  1998  		workers = WorkerMaxLimit
  1999  		mrfWorkers = MRFWorkerMaxLimit
  2000  	case "slow":
  2001  		workers = WorkerMinLimit
  2002  		mrfWorkers = MRFWorkerMinLimit
  2003  	default:
  2004  		workers = WorkerAutoDefault
  2005  		mrfWorkers = MRFWorkerAutoDefault
  2006  		if len(p.workers) < WorkerAutoDefault {
  2007  			workers = min(len(p.workers)+1, WorkerAutoDefault)
  2008  		}
  2009  		if p.mrfWorkerSize < MRFWorkerAutoDefault {
  2010  			mrfWorkers = min(p.mrfWorkerSize+1, MRFWorkerAutoDefault)
  2011  		}
  2012  	}
  2013  	if maxWorkers > 0 && workers > maxWorkers {
  2014  		workers = maxWorkers
  2015  	}
  2016  
  2017  	if maxWorkers > 0 && mrfWorkers > maxWorkers {
  2018  		mrfWorkers = maxWorkers
  2019  	}
  2020  	p.priority = pri
  2021  	p.maxWorkers = maxWorkers
  2022  	p.mu.Unlock()
  2023  	p.ResizeWorkers(workers, 0)
  2024  	p.ResizeFailedWorkers(mrfWorkers)
  2025  }
  2026  
  2027  // ResizeFailedWorkers sets replication failed workers pool size
  2028  func (p *ReplicationPool) ResizeFailedWorkers(n int) {
  2029  	p.mu.Lock()
  2030  	defer p.mu.Unlock()
  2031  
  2032  	for p.mrfWorkerSize < n {
  2033  		p.mrfWorkerSize++
  2034  		go p.AddMRFWorker()
  2035  	}
  2036  	for p.mrfWorkerSize > n {
  2037  		p.mrfWorkerSize--
  2038  		go func() { p.mrfWorkerKillCh <- struct{}{} }()
  2039  	}
  2040  }
  2041  
const (
	// minLargeObjSize is the threshold at or above which uploads are routed
	// to the dedicated large-object worker set (see queueReplicaTask).
	minLargeObjSize = 128 * humanize.MiByte // 128MiB
)
  2045  
  2046  // getWorkerCh gets a worker channel deterministically based on bucket and object names.
  2047  // Must be able to grab read lock from p.
  2048  
  2049  func (p *ReplicationPool) getWorkerCh(bucket, object string, sz int64) chan<- ReplicationWorkerOperation {
  2050  	h := xxh3.HashString(bucket + object)
  2051  	p.mu.RLock()
  2052  	defer p.mu.RUnlock()
  2053  	if len(p.workers) == 0 {
  2054  		return nil
  2055  	}
  2056  	return p.workers[h%uint64(len(p.workers))]
  2057  }
  2058  
// queueReplicaTask enqueues a replication operation onto the appropriate
// worker channel, falling back to the MRF ("most recent failures") queue when
// every eligible worker is busy.
func (p *ReplicationPool) queueReplicaTask(ri ReplicateObjectInfo) {
	if p == nil {
		return
	}
	// if object is large, queue it to a static set of large workers
	if ri.Size >= int64(minLargeObjSize) {
		h := xxh3.HashString(ri.Bucket + ri.Name)
		select {
		case <-p.ctx.Done():
		case p.lrgworkers[h%LargeWorkerCount] <- ri:
		default:
			// Large worker busy: persist for retry via the MRF queue.
			globalReplicationPool.queueMRFSave(ri.ToMRFEntry())
		}
		return
	}

	// NOTE(review): getWorkerCh is called here as (Name, Bucket) but as
	// (Bucket, ObjectName) in queueReplicaDeleteTask. Hashing is still
	// deterministic either way, but the argument order looks swapped; confirm
	// intent before changing.
	var ch, healCh chan<- ReplicationWorkerOperation
	switch ri.OpType {
	case replication.HealReplicationType, replication.ExistingObjectReplicationType:
		ch = p.mrfReplicaCh
		healCh = p.getWorkerCh(ri.Name, ri.Bucket, ri.Size)
	default:
		ch = p.getWorkerCh(ri.Name, ri.Bucket, ri.Size)
	}
	if ch == nil && healCh == nil {
		return
	}

	// A nil channel case in a select is simply never ready, so when healCh is
	// nil only the remaining cases can fire.
	select {
	case <-p.ctx.Done():
	case healCh <- ri:
	case ch <- ri:
	default:
		// All queues full: save to MRF and, under "auto" priority, try to
		// grow the regular and MRF worker pools up to their limits.
		globalReplicationPool.queueMRFSave(ri.ToMRFEntry())
		p.mu.RLock()
		prio := p.priority
		maxWorkers := p.maxWorkers
		p.mu.RUnlock()
		switch prio {
		case "fast":
			logger.LogOnceIf(GlobalContext, fmt.Errorf("WARNING: Unable to keep up with incoming traffic"), string(replicationSubsystem))
		case "slow":
			logger.LogOnceIf(GlobalContext, fmt.Errorf("WARNING: Unable to keep up with incoming traffic - we recommend increasing replication priority with `mc admin config set api replication_priority=auto`"), string(replicationSubsystem))
		default:
			maxWorkers = min(maxWorkers, WorkerMaxLimit)
			if p.ActiveWorkers() < maxWorkers {
				p.mu.RLock()
				workers := min(len(p.workers)+1, maxWorkers)
				existing := len(p.workers)
				p.mu.RUnlock()
				p.ResizeWorkers(workers, existing)
			}
			maxMRFWorkers := min(maxWorkers, MRFWorkerMaxLimit)
			if p.ActiveMRFWorkers() < maxMRFWorkers {
				p.mu.RLock()
				workers := min(p.mrfWorkerSize+1, maxMRFWorkers)
				p.mu.RUnlock()
				p.ResizeFailedWorkers(workers)
			}
		}
	}
}
  2121  
  2122  func queueReplicateDeletesWrapper(doi DeletedObjectReplicationInfo, existingObjectResync ResyncDecision) {
  2123  	for k, v := range existingObjectResync.targets {
  2124  		if v.Replicate {
  2125  			doi.ResetID = v.ResetID
  2126  			doi.TargetArn = k
  2127  
  2128  			globalReplicationPool.queueReplicaDeleteTask(doi)
  2129  		}
  2130  	}
  2131  }
  2132  
  2133  func (p *ReplicationPool) queueReplicaDeleteTask(doi DeletedObjectReplicationInfo) {
  2134  	if p == nil {
  2135  		return
  2136  	}
  2137  	var ch chan<- ReplicationWorkerOperation
  2138  	switch doi.OpType {
  2139  	case replication.HealReplicationType, replication.ExistingObjectReplicationType:
  2140  		fallthrough
  2141  	default:
  2142  		ch = p.getWorkerCh(doi.Bucket, doi.ObjectName, 0)
  2143  	}
  2144  
  2145  	select {
  2146  	case <-p.ctx.Done():
  2147  	case ch <- doi:
  2148  	default:
  2149  		globalReplicationPool.queueMRFSave(doi.ToMRFEntry())
  2150  		p.mu.RLock()
  2151  		prio := p.priority
  2152  		maxWorkers := p.maxWorkers
  2153  		p.mu.RUnlock()
  2154  		switch prio {
  2155  		case "fast":
  2156  			logger.LogOnceIf(GlobalContext, fmt.Errorf("WARNING: Unable to keep up with incoming deletes"), string(replicationSubsystem))
  2157  		case "slow":
  2158  			logger.LogOnceIf(GlobalContext, fmt.Errorf("WARNING: Unable to keep up with incoming deletes - we recommend increasing replication priority with `mc admin config set api replication_priority=auto`"), string(replicationSubsystem))
  2159  		default:
  2160  			maxWorkers = min(maxWorkers, WorkerMaxLimit)
  2161  			if p.ActiveWorkers() < maxWorkers {
  2162  				p.mu.RLock()
  2163  				workers := min(len(p.workers)+1, maxWorkers)
  2164  				existing := len(p.workers)
  2165  				p.mu.RUnlock()
  2166  				p.ResizeWorkers(workers, existing)
  2167  			}
  2168  		}
  2169  	}
  2170  }
  2171  
// replicationPoolOpts carries the tunables used to size a ReplicationPool.
type replicationPoolOpts struct {
	Priority   string // "fast", "slow" or "" (treated as "auto")
	MaxWorkers int    // ceiling on worker counts; <= 0 means use the default limit
}
  2176  
// initBackgroundReplication wires up the process-global replication pool and
// stats tracker, then starts the EWMA metrics goroutine. The pool is created
// before the stats object; its workers reference globalReplicationStats, so
// keep this ordering in mind when changing it.
func initBackgroundReplication(ctx context.Context, objectAPI ObjectLayer) {
	globalReplicationPool = NewReplicationPool(ctx, objectAPI, globalAPIConfig.getReplicationOpts())
	globalReplicationStats = NewReplicationStats(ctx, objectAPI)
	go globalReplicationStats.trackEWMA()
}
  2182  
// proxyResult reports the outcome of attempting to proxy a request to a
// replication target.
type proxyResult struct {
	Proxy bool  // true when the request was served (or should be served) from the remote target
	Err   error // last error encountered while attempting to proxy
}
  2187  
  2188  // get Reader from replication target if active-active replication is in place and
  2189  // this node returns a 404
  2190  func proxyGetToReplicationTarget(ctx context.Context, bucket, object string, rs *HTTPRangeSpec, _ http.Header, opts ObjectOptions, proxyTargets *madmin.BucketTargets) (gr *GetObjectReader, proxy proxyResult, err error) {
  2191  	tgt, oi, proxy := proxyHeadToRepTarget(ctx, bucket, object, rs, opts, proxyTargets)
  2192  	if !proxy.Proxy {
  2193  		return nil, proxy, nil
  2194  	}
  2195  	fn, _, _, err := NewGetObjectReader(nil, oi, opts)
  2196  	if err != nil {
  2197  		return nil, proxy, err
  2198  	}
  2199  	gopts := minio.GetObjectOptions{
  2200  		VersionID:            opts.VersionID,
  2201  		ServerSideEncryption: opts.ServerSideEncryption,
  2202  		Internal: minio.AdvancedGetOptions{
  2203  			ReplicationProxyRequest: "true",
  2204  		},
  2205  		PartNumber: opts.PartNumber,
  2206  	}
  2207  	// get correct offsets for encrypted object
  2208  	if rs != nil {
  2209  		h, err := rs.ToHeader()
  2210  		if err != nil {
  2211  			return nil, proxy, err
  2212  		}
  2213  		gopts.Set(xhttp.Range, h)
  2214  	}
  2215  	// Make sure to match ETag when proxying.
  2216  	if err = gopts.SetMatchETag(oi.ETag); err != nil {
  2217  		return nil, proxy, err
  2218  	}
  2219  	c := minio.Core{Client: tgt.Client}
  2220  	obj, _, h, err := c.GetObject(ctx, tgt.Bucket, object, gopts)
  2221  	if err != nil {
  2222  		return nil, proxy, err
  2223  	}
  2224  	closeReader := func() { obj.Close() }
  2225  	reader, err := fn(obj, h, closeReader)
  2226  	if err != nil {
  2227  		return nil, proxy, err
  2228  	}
  2229  	reader.ObjInfo = oi.Clone()
  2230  	if rs != nil {
  2231  		contentSize, err := parseSizeFromContentRange(h)
  2232  		if err != nil {
  2233  			return nil, proxy, err
  2234  		}
  2235  		reader.ObjInfo.Size = contentSize
  2236  	}
  2237  
  2238  	return reader, proxyResult{Proxy: true}, nil
  2239  }
  2240  
  2241  func getProxyTargets(ctx context.Context, bucket, object string, opts ObjectOptions) (tgts *madmin.BucketTargets) {
  2242  	if opts.VersionSuspended {
  2243  		return &madmin.BucketTargets{}
  2244  	}
  2245  	if opts.ProxyRequest || (opts.ProxyHeaderSet && !opts.ProxyRequest) {
  2246  		return &madmin.BucketTargets{}
  2247  	}
  2248  	cfg, err := getReplicationConfig(ctx, bucket)
  2249  	if err != nil || cfg == nil {
  2250  		return &madmin.BucketTargets{}
  2251  	}
  2252  	topts := replication.ObjectOpts{Name: object}
  2253  	tgtArns := cfg.FilterTargetArns(topts)
  2254  	tgts = &madmin.BucketTargets{Targets: make([]madmin.BucketTarget, len(tgtArns))}
  2255  	for i, tgtArn := range tgtArns {
  2256  		tgt := globalBucketTargetSys.GetRemoteBucketTargetByArn(ctx, bucket, tgtArn)
  2257  		tgts.Targets[i] = tgt
  2258  	}
  2259  
  2260  	return tgts
  2261  }
  2262  
// proxyHeadToRepTarget stats the object on each eligible replication target
// and returns the first successful hit along with the target client that
// served it; proxy.Proxy is true only on success, and proxy.Err carries the
// last error seen when no target succeeds.
func proxyHeadToRepTarget(ctx context.Context, bucket, object string, rs *HTTPRangeSpec, opts ObjectOptions, proxyTargets *madmin.BucketTargets) (tgt *TargetClient, oi ObjectInfo, proxy proxyResult) {
	// this option is set when active-active replication is in place between site A -> B,
	// and site B does not have the object yet.
	if opts.ProxyRequest || (opts.ProxyHeaderSet && !opts.ProxyRequest) { // true only when site B sets MinIOSourceProxyRequest header
		return nil, oi, proxy
	}
	var perr error
	for _, t := range proxyTargets.Targets {
		tgt = globalBucketTargetSys.GetRemoteTargetClient(bucket, t.Arn)
		// Skip unreachable or unknown targets.
		if tgt == nil || globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
			continue
		}
		// if proxying explicitly disabled on remote target
		if tgt.disableProxy {
			continue
		}

		gopts := minio.GetObjectOptions{
			VersionID:            opts.VersionID,
			ServerSideEncryption: opts.ServerSideEncryption,
			Internal: minio.AdvancedGetOptions{
				ReplicationProxyRequest: "true",
			},
			PartNumber: opts.PartNumber,
		}
		// Propagate the byte range (if any) so the target evaluates it.
		if rs != nil {
			h, err := rs.ToHeader()
			if err != nil {
				logger.LogIf(ctx, fmt.Errorf("invalid range header for %s/%s(%s) - %w", bucket, object, opts.VersionID, err))
				continue
			}
			gopts.Set(xhttp.Range, h)
		}

		objInfo, err := tgt.StatObject(ctx, t.TargetBucket, object, gopts)
		if err != nil {
			perr = err
			// An invalid-range error is definitive: surface it rather than
			// trying further targets.
			if isErrInvalidRange(ErrorRespToObjectError(err, bucket, object)) {
				return nil, oi, proxyResult{Err: err}
			}
			continue
		}

		// NOTE: `tags` below shadows the imported tags package for the rest
		// of this iteration.
		tags, _ := tags.MapToObjectTags(objInfo.UserTags)
		// Translate the remote minio-go ObjectInfo into the server's type.
		oi = ObjectInfo{
			Bucket:                    bucket,
			Name:                      object,
			ModTime:                   objInfo.LastModified,
			Size:                      objInfo.Size,
			ETag:                      objInfo.ETag,
			VersionID:                 objInfo.VersionID,
			IsLatest:                  objInfo.IsLatest,
			DeleteMarker:              objInfo.IsDeleteMarker,
			ContentType:               objInfo.ContentType,
			Expires:                   objInfo.Expires,
			StorageClass:              objInfo.StorageClass,
			ReplicationStatusInternal: objInfo.ReplicationStatus,
			UserTags:                  tags.String(),
			ReplicationStatus:         replication.StatusType(objInfo.ReplicationStatus),
		}
		// Flatten multi-valued headers: only the first value is kept.
		oi.UserDefined = make(map[string]string, len(objInfo.Metadata))
		for k, v := range objInfo.Metadata {
			oi.UserDefined[k] = v[0]
		}
		// Content-Encoding may appear under either canonical or lower case.
		ce, ok := oi.UserDefined[xhttp.ContentEncoding]
		if !ok {
			ce, ok = oi.UserDefined[strings.ToLower(xhttp.ContentEncoding)]
		}
		if ok {
			oi.ContentEncoding = ce
		}
		return tgt, oi, proxyResult{Proxy: true}
	}
	proxy.Err = perr
	return nil, oi, proxy
}
  2339  
  2340  // get object info from replication target if active-active replication is in place and
  2341  // this node returns a 404
  2342  func proxyHeadToReplicationTarget(ctx context.Context, bucket, object string, rs *HTTPRangeSpec, opts ObjectOptions, proxyTargets *madmin.BucketTargets) (oi ObjectInfo, proxy proxyResult) {
  2343  	_, oi, proxy = proxyHeadToRepTarget(ctx, bucket, object, rs, opts, proxyTargets)
  2344  	return oi, proxy
  2345  }
  2346  
  2347  func scheduleReplication(ctx context.Context, oi ObjectInfo, o ObjectLayer, dsc ReplicateDecision, opType replication.Type) {
  2348  	tgtStatuses := replicationStatusesMap(oi.ReplicationStatusInternal)
  2349  	purgeStatuses := versionPurgeStatusesMap(oi.VersionPurgeStatusInternal)
  2350  	tm, _ := time.Parse(time.RFC3339Nano, oi.UserDefined[ReservedMetadataPrefixLower+ReplicationTimestamp])
  2351  	rstate := oi.ReplicationState()
  2352  	rstate.ReplicateDecisionStr = dsc.String()
  2353  	asz, _ := oi.GetActualSize()
  2354  
  2355  	ri := ReplicateObjectInfo{
  2356  		Name:                       oi.Name,
  2357  		Size:                       oi.Size,
  2358  		ActualSize:                 asz,
  2359  		Bucket:                     oi.Bucket,
  2360  		VersionID:                  oi.VersionID,
  2361  		ETag:                       oi.ETag,
  2362  		ModTime:                    oi.ModTime,
  2363  		ReplicationStatus:          oi.ReplicationStatus,
  2364  		ReplicationStatusInternal:  oi.ReplicationStatusInternal,
  2365  		DeleteMarker:               oi.DeleteMarker,
  2366  		VersionPurgeStatusInternal: oi.VersionPurgeStatusInternal,
  2367  		VersionPurgeStatus:         oi.VersionPurgeStatus,
  2368  
  2369  		ReplicationState:     rstate,
  2370  		OpType:               opType,
  2371  		Dsc:                  dsc,
  2372  		TargetStatuses:       tgtStatuses,
  2373  		TargetPurgeStatuses:  purgeStatuses,
  2374  		ReplicationTimestamp: tm,
  2375  		SSEC:                 crypto.SSEC.IsEncrypted(oi.UserDefined),
  2376  		UserTags:             oi.UserTags,
  2377  	}
  2378  
  2379  	if dsc.Synchronous() {
  2380  		replicateObject(ctx, ri, o)
  2381  	} else {
  2382  		globalReplicationPool.queueReplicaTask(ri)
  2383  	}
  2384  }
  2385  
  2386  // proxyTaggingToRepTarget proxies tagging requests to remote targets for
  2387  // active-active replicated setups
  2388  func proxyTaggingToRepTarget(ctx context.Context, bucket, object string, tags *tags.Tags, opts ObjectOptions, proxyTargets *madmin.BucketTargets) (proxy proxyResult) {
  2389  	// this option is set when active-active replication is in place between site A -> B,
  2390  	// and request hits site B that does not have the object yet.
  2391  	if opts.ProxyRequest || (opts.ProxyHeaderSet && !opts.ProxyRequest) { // true only when site B sets MinIOSourceProxyRequest header
  2392  		return proxy
  2393  	}
  2394  	var wg sync.WaitGroup
  2395  	errs := make([]error, len(proxyTargets.Targets))
  2396  	for idx, t := range proxyTargets.Targets {
  2397  		tgt := globalBucketTargetSys.GetRemoteTargetClient(bucket, t.Arn)
  2398  		if tgt == nil || globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
  2399  			continue
  2400  		}
  2401  		// if proxying explicitly disabled on remote target
  2402  		if tgt.disableProxy {
  2403  			continue
  2404  		}
  2405  		idx := idx
  2406  		wg.Add(1)
  2407  		go func(idx int, tgt *TargetClient) {
  2408  			defer wg.Done()
  2409  			var err error
  2410  			if tags != nil {
  2411  				popts := minio.PutObjectTaggingOptions{
  2412  					VersionID: opts.VersionID,
  2413  					Internal: minio.AdvancedObjectTaggingOptions{
  2414  						ReplicationProxyRequest: "true",
  2415  					},
  2416  				}
  2417  				err = tgt.PutObjectTagging(ctx, tgt.Bucket, object, tags, popts)
  2418  			} else {
  2419  				dopts := minio.RemoveObjectTaggingOptions{
  2420  					VersionID: opts.VersionID,
  2421  					Internal: minio.AdvancedObjectTaggingOptions{
  2422  						ReplicationProxyRequest: "true",
  2423  					},
  2424  				}
  2425  				err = tgt.RemoveObjectTagging(ctx, tgt.Bucket, object, dopts)
  2426  			}
  2427  			if err != nil {
  2428  				errs[idx] = err
  2429  			}
  2430  		}(idx, tgt)
  2431  	}
  2432  	wg.Wait()
  2433  
  2434  	var (
  2435  		terr        error
  2436  		taggedCount int
  2437  	)
  2438  	for _, err := range errs {
  2439  		if err == nil {
  2440  			taggedCount++
  2441  			continue
  2442  		}
  2443  		if err != nil {
  2444  			terr = err
  2445  		}
  2446  	}
  2447  	// don't return error if at least one target was tagged successfully
  2448  	if taggedCount == 0 && terr != nil {
  2449  		proxy.Err = terr
  2450  	}
  2451  	return proxy
  2452  }
  2453  
  2454  // proxyGetTaggingToRepTarget proxies get tagging requests to remote targets for
  2455  // active-active replicated setups
  2456  func proxyGetTaggingToRepTarget(ctx context.Context, bucket, object string, opts ObjectOptions, proxyTargets *madmin.BucketTargets) (tgs *tags.Tags, proxy proxyResult) {
  2457  	// this option is set when active-active replication is in place between site A -> B,
  2458  	// and request hits site B that does not have the object yet.
  2459  	if opts.ProxyRequest || (opts.ProxyHeaderSet && !opts.ProxyRequest) { // true only when site B sets MinIOSourceProxyRequest header
  2460  		return nil, proxy
  2461  	}
  2462  	var wg sync.WaitGroup
  2463  	errs := make([]error, len(proxyTargets.Targets))
  2464  	tagSlc := make([]map[string]string, len(proxyTargets.Targets))
  2465  	for idx, t := range proxyTargets.Targets {
  2466  		tgt := globalBucketTargetSys.GetRemoteTargetClient(bucket, t.Arn)
  2467  		if tgt == nil || globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
  2468  			continue
  2469  		}
  2470  		// if proxying explicitly disabled on remote target
  2471  		if tgt.disableProxy {
  2472  			continue
  2473  		}
  2474  		idx := idx
  2475  		wg.Add(1)
  2476  		go func(idx int, tgt *TargetClient) {
  2477  			defer wg.Done()
  2478  			var err error
  2479  			gopts := minio.GetObjectTaggingOptions{
  2480  				VersionID: opts.VersionID,
  2481  				Internal: minio.AdvancedObjectTaggingOptions{
  2482  					ReplicationProxyRequest: "true",
  2483  				},
  2484  			}
  2485  			tgs, err = tgt.GetObjectTagging(ctx, tgt.Bucket, object, gopts)
  2486  			if err != nil {
  2487  				errs[idx] = err
  2488  			} else {
  2489  				tagSlc[idx] = tgs.ToMap()
  2490  			}
  2491  		}(idx, tgt)
  2492  	}
  2493  	wg.Wait()
  2494  	for idx, err := range errs {
  2495  		errCode := minio.ToErrorResponse(err).Code
  2496  		if err != nil && errCode != "NoSuchKey" && errCode != "NoSuchVersion" {
  2497  			return nil, proxyResult{Err: err}
  2498  		}
  2499  		if err == nil {
  2500  			tgs, _ = tags.MapToObjectTags(tagSlc[idx])
  2501  		}
  2502  	}
  2503  	if len(errs) == 1 {
  2504  		proxy.Err = errs[0]
  2505  	}
  2506  	return tgs, proxy
  2507  }
  2508  
  2509  func scheduleReplicationDelete(ctx context.Context, dv DeletedObjectReplicationInfo, o ObjectLayer) {
  2510  	globalReplicationPool.queueReplicaDeleteTask(dv)
  2511  	for arn := range dv.ReplicationState.Targets {
  2512  		globalReplicationStats.Update(dv.Bucket, replicatedTargetInfo{Arn: arn, Size: 0, Duration: 0, OpType: replication.DeleteReplicationType}, replication.Pending, replication.StatusType(""))
  2513  	}
  2514  }
  2515  
// replicationConfig pairs a bucket's replication rules with the metadata of
// its remote replication targets, as needed for replicate/resync decisions.
type replicationConfig struct {
	Config  *replication.Config   // bucket replication rules; nil when replication is not configured
	remotes *madmin.BucketTargets // remote targets configured for this bucket
}
  2520  
// Empty returns true when no replication configuration is present.
func (c replicationConfig) Empty() bool {
	return c.Config == nil
}
  2524  
// Replicate reports whether the object described by opts matches a
// replication rule. Callers must ensure c.Config is non-nil (see Empty).
func (c replicationConfig) Replicate(opts replication.ObjectOpts) bool {
	return c.Config.Replicate(opts)
}
  2528  
// Resync returns the resync decision per target if replication reset is
// requested, or if the object qualifies for existing-object replication.
func (c replicationConfig) Resync(ctx context.Context, oi ObjectInfo, dsc ReplicateDecision, tgtStatuses map[string]replication.StatusType) (r ResyncDecision) {
	if c.Empty() {
		return
	}

	// Now overlay existing object replication choices for target
	if oi.DeleteMarker {
		// Delete markers are evaluated against existing-object DELETE
		// replication rules rather than the regular replicate path.
		opts := replication.ObjectOpts{
			Name:           oi.Name,
			DeleteMarker:   oi.DeleteMarker,
			VersionID:      oi.VersionID,
			OpType:         replication.DeleteReplicationType,
			ExistingObject: true,
		}

		tgtArns := c.Config.FilterTargetArns(opts)
		// indicates no matching target with Existing object replication enabled.
		if len(tgtArns) == 0 {
			return
		}
		for _, t := range tgtArns {
			opts.TargetArn = t
			// Update replication decision for target based on existing object replication rule.
			dsc.Set(newReplicateTargetDecision(t, c.Replicate(opts), false))
		}
		return c.resync(oi, dsc, tgtStatuses)
	}

	// Ignore previous replication status when deciding if object can be re-replicated
	userDefined := cloneMSS(oi.UserDefined)
	delete(userDefined, xhttp.AmzBucketReplicationStatus)

	// Re-evaluate the replication decision for the object as an
	// existing-object replication candidate.
	rdsc := mustReplicate(ctx, oi.Bucket, oi.Name, getMustReplicateOptions(userDefined, oi.UserTags, "", replication.ExistingObjectReplicationType, ObjectOptions{}))
	return c.resync(oi, rdsc, tgtStatuses)
}
  2565  
  2566  // wrapper function for testability. Returns true if a new reset is requested on
  2567  // already replicated objects OR object qualifies for existing object replication
  2568  // and no reset requested.
  2569  func (c replicationConfig) resync(oi ObjectInfo, dsc ReplicateDecision, tgtStatuses map[string]replication.StatusType) (r ResyncDecision) {
  2570  	r = ResyncDecision{
  2571  		targets: make(map[string]ResyncTargetDecision, len(dsc.targetsMap)),
  2572  	}
  2573  	if c.remotes == nil {
  2574  		return
  2575  	}
  2576  	for _, tgt := range c.remotes.Targets {
  2577  		d, ok := dsc.targetsMap[tgt.Arn]
  2578  		if !ok {
  2579  			continue
  2580  		}
  2581  		if !d.Replicate {
  2582  			continue
  2583  		}
  2584  		r.targets[d.Arn] = resyncTarget(oi, tgt.Arn, tgt.ResetID, tgt.ResetBeforeDate, tgtStatuses[tgt.Arn])
  2585  	}
  2586  	return
  2587  }
  2588  
  2589  func targetResetHeader(arn string) string {
  2590  	return fmt.Sprintf("%s-%s", ReservedMetadataPrefixLower+ReplicationReset, arn)
  2591  }
  2592  
// resyncTarget decides whether the object version must be (re)replicated to
// the target identified by arn, given an optional reset request (resetID,
// resetBeforeDate) and the current replication status on that target.
func resyncTarget(oi ObjectInfo, arn string, resetID string, resetBeforeDate time.Time, tgtStatus replication.StatusType) (rd ResyncTargetDecision) {
	rd = ResyncTargetDecision{
		ResetID:         resetID,
		ResetBeforeDate: resetBeforeDate,
	}
	// Reset status previously recorded on the object for this target, if any.
	rs, ok := oi.UserDefined[targetResetHeader(arn)]
	if !ok {
		rs, ok = oi.UserDefined[xhttp.MinIOReplicationResetStatus] // for backward compatibility
	}
	if !ok { // existing object replication is enabled and object version is unreplicated so far.
		if resetID != "" && oi.ModTime.Before(resetBeforeDate) { // trigger replication if `mc replicate reset` requested
			rd.Replicate = true
			return
		}
		// For existing object reset - this condition is needed
		rd.Replicate = tgtStatus == ""
		return
	}
	if resetID == "" || resetBeforeDate.Equal(timeSentinel) { // no reset in progress
		return
	}

	// if already replicated, return true if a new reset was requested.
	// NOTE(review): saved value appears to be "<timestamp>;<resetID>" — the
	// second field is compared against the requested reset ID below.
	splits := strings.SplitN(rs, ";", 2)
	if len(splits) != 2 {
		return
	}
	newReset := splits[1] != resetID
	if !newReset && tgtStatus == replication.Completed {
		// already replicated and no reset requested
		return
	}
	// Replicate only objects created before the reset cutoff date.
	rd.Replicate = newReset && oi.ModTime.Before(resetBeforeDate)
	return
}
  2628  
  2629  const resyncTimeInterval = time.Minute * 1
  2630  
// PersistToDisk persists in-memory resync metadata stats to disk at periodic intervals
func (s *replicationResyncer) PersistToDisk(ctx context.Context, objectAPI ObjectLayer) {
	resyncTimer := time.NewTimer(resyncTimeInterval)
	defer resyncTimer.Stop()

	// For each bucket name, store the last timestamp of the
	// successful save of replication status in the backend disks.
	lastResyncStatusSave := make(map[string]time.Time)

	for {
		select {
		case <-resyncTimer.C:
			// Hold the read lock only while scanning/saving; saves happen
			// under the lock, so they should stay quick.
			s.RLock()
			for bucket, brs := range s.statusMap {
				var updt bool
				// Save the replication status if one resync to any bucket target is still not finished
				for _, st := range brs.TargetsMap {
					if st.LastUpdate.Equal(timeSentinel) {
						updt = true
						break
					}
				}
				// Save the replication status if a new stats update is found and not saved in the backend yet
				if brs.LastUpdate.After(lastResyncStatusSave[bucket]) {
					updt = true
				}
				if updt {
					if err := saveResyncStatus(ctx, bucket, brs, objectAPI); err != nil {
						logger.LogIf(ctx, fmt.Errorf("could not save resync metadata to drive for %s - %w", bucket, err))
					} else {
						// Remember what we just persisted so an unchanged
						// bucket is not re-saved on the next tick.
						lastResyncStatusSave[bucket] = brs.LastUpdate
					}
				}
			}
			s.RUnlock()

			resyncTimer.Reset(resyncTimeInterval)
		case <-ctx.Done():
			// server could be restarting - need
			// to exit immediately
			return
		}
	}
}
  2675  
const (
	resyncWorkerCnt        = 10 // limit on the number of bucket resyncs in progress at any given time
	resyncParallelRoutines = 10 // number of parallel resync ops per bucket
)
  2680  
  2681  func newresyncer() *replicationResyncer {
  2682  	rs := replicationResyncer{
  2683  		statusMap:      make(map[string]BucketReplicationResyncStatus),
  2684  		workerSize:     resyncWorkerCnt,
  2685  		resyncCancelCh: make(chan struct{}, resyncWorkerCnt),
  2686  		workerCh:       make(chan struct{}, resyncWorkerCnt),
  2687  	}
  2688  	for i := 0; i < rs.workerSize; i++ {
  2689  		rs.workerCh <- struct{}{}
  2690  	}
  2691  	return &rs
  2692  }
  2693  
  2694  // mark status of replication resync on remote target for the bucket
  2695  func (s *replicationResyncer) markStatus(status ResyncStatusType, opts resyncOpts, objAPI ObjectLayer) {
  2696  	s.Lock()
  2697  	defer s.Unlock()
  2698  
  2699  	m := s.statusMap[opts.bucket]
  2700  	st := m.TargetsMap[opts.arn]
  2701  	st.LastUpdate = UTCNow()
  2702  	st.ResyncStatus = status
  2703  	m.TargetsMap[opts.arn] = st
  2704  	m.LastUpdate = UTCNow()
  2705  	s.statusMap[opts.bucket] = m
  2706  
  2707  	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
  2708  	defer cancel()
  2709  	saveResyncStatus(ctx, opts.bucket, m, objAPI)
  2710  }
  2711  
  2712  // update replication resync stats for bucket's remote target
  2713  func (s *replicationResyncer) incStats(ts TargetReplicationResyncStatus, opts resyncOpts) {
  2714  	s.Lock()
  2715  	defer s.Unlock()
  2716  	m := s.statusMap[opts.bucket]
  2717  	st := m.TargetsMap[opts.arn]
  2718  	st.Object = ts.Object
  2719  	st.ReplicatedCount += ts.ReplicatedCount
  2720  	st.FailedCount += ts.FailedCount
  2721  	st.ReplicatedSize += ts.ReplicatedSize
  2722  	st.FailedSize += ts.FailedSize
  2723  	m.TargetsMap[opts.arn] = st
  2724  	m.LastUpdate = UTCNow()
  2725  	s.statusMap[opts.bucket] = m
  2726  }
  2727  
// resyncBucket resyncs all qualifying objects as per replication rules for the target
// ARN. It acquires a worker token (bounding concurrent resyncs), walks all
// object versions in the bucket, and fans work out to resyncParallelRoutines
// workers sharded by object name hash. heal==true resumes from the last
// checkpointed object instead of starting fresh.
func (s *replicationResyncer) resyncBucket(ctx context.Context, objectAPI ObjectLayer, heal bool, opts resyncOpts) {
	select {
	case <-s.workerCh: // block till a worker is available
	case <-ctx.Done():
		return
	}

	// Final status defaults to failed; overwritten on cancel/completion below.
	resyncStatus := ResyncFailed
	defer func() {
		s.markStatus(resyncStatus, opts, objectAPI)
		globalSiteResyncMetrics.incBucket(opts, resyncStatus)
		s.workerCh <- struct{}{} // return the worker token
	}()
	// Allocate new results channel to receive ObjectInfo.
	objInfoCh := make(chan ObjectInfo)
	cfg, err := getReplicationConfig(ctx, opts.bucket)
	if err != nil {
		logger.LogIf(ctx, fmt.Errorf("replication resync of %s for arn %s failed with %w", opts.bucket, opts.arn, err))
		return
	}
	tgts, err := globalBucketTargetSys.ListBucketTargets(ctx, opts.bucket)
	if err != nil {
		logger.LogIf(ctx, fmt.Errorf("replication resync of %s for arn %s failed  %w", opts.bucket, opts.arn, err))
		return
	}
	rcfg := replicationConfig{
		Config:  cfg,
		remotes: tgts,
	}
	// The requested ARN must match exactly one target in the config.
	tgtArns := cfg.FilterTargetArns(
		replication.ObjectOpts{
			OpType:    replication.ResyncReplicationType,
			TargetArn: opts.arn,
		})
	if len(tgtArns) != 1 {
		logger.LogIf(ctx, fmt.Errorf("replication resync failed for %s - arn specified %s is missing in the replication config", opts.bucket, opts.arn))
		return
	}
	tgt := globalBucketTargetSys.GetRemoteTargetClient(opts.bucket, opts.arn)
	if tgt == nil {
		logger.LogIf(ctx, fmt.Errorf("replication resync failed for %s - target could not be created for arn %s", opts.bucket, opts.arn))
		return
	}
	// mark resync status as resync started
	if !heal {
		s.markStatus(ResyncStarted, opts, objectAPI)
	}

	// Walk through all object versions - Walk() is always in ascending order needed to ensure
	// delete marker replicated to target after object version is first created.
	if err := objectAPI.Walk(ctx, opts.bucket, "", objInfoCh, WalkOptions{}); err != nil {
		logger.LogIf(ctx, err)
		return
	}

	// Pick up the checkpoint (last object recorded) for heal-driven resumption.
	s.RLock()
	m := s.statusMap[opts.bucket]
	st := m.TargetsMap[opts.arn]
	s.RUnlock()
	var lastCheckpoint string
	if st.ResyncStatus == ResyncStarted || st.ResyncStatus == ResyncFailed {
		lastCheckpoint = st.Object
	}
	workers := make([]chan ReplicateObjectInfo, resyncParallelRoutines)
	resultCh := make(chan TargetReplicationResyncStatus, 1)
	defer xioutil.SafeClose(resultCh)
	// Single collector goroutine folds per-object results into stats/metrics.
	go func() {
		for r := range resultCh {
			s.incStats(r, opts)
			globalSiteResyncMetrics.updateMetric(r, opts.resyncID)
		}
	}()

	var wg sync.WaitGroup
	for i := 0; i < resyncParallelRoutines; i++ {
		wg.Add(1)
		workers[i] = make(chan ReplicateObjectInfo, 100)
		i := i
		go func(ctx context.Context, idx int) {
			defer wg.Done()
			for roi := range workers[idx] {
				select {
				case <-ctx.Done():
					return
				// NOTE(review): a receive here consumes a cancel token but
				// falls through and keeps processing; only the select after
				// the replication attempt returns — confirm this is intended.
				case <-s.resyncCancelCh:
				default:
				}
				traceFn := s.trace(tgt.ResetID, fmt.Sprintf("%s/%s (%s)", opts.bucket, roi.Name, roi.VersionID))
				// Delete markers and purged versions replicate via the
				// delete path; everything else via the object path.
				if roi.DeleteMarker || !roi.VersionPurgeStatus.Empty() {
					versionID := ""
					dmVersionID := ""
					if roi.VersionPurgeStatus.Empty() {
						dmVersionID = roi.VersionID
					} else {
						versionID = roi.VersionID
					}

					doi := DeletedObjectReplicationInfo{
						DeletedObject: DeletedObject{
							ObjectName:            roi.Name,
							DeleteMarkerVersionID: dmVersionID,
							VersionID:             versionID,
							ReplicationState:      roi.ReplicationState,
							DeleteMarkerMTime:     DeleteMarkerMTime{roi.ModTime},
							DeleteMarker:          roi.DeleteMarker,
						},
						Bucket:    roi.Bucket,
						OpType:    replication.ExistingObjectReplicationType,
						EventType: ReplicateExistingDelete,
					}
					replicateDelete(ctx, doi, objectAPI)
				} else {
					roi.OpType = replication.ExistingObjectReplicationType
					roi.EventType = ReplicateExisting
					replicateObject(ctx, roi, objectAPI)
				}

				st := TargetReplicationResyncStatus{
					Object: roi.Name,
					Bucket: roi.Bucket,
				}

				// Verify the object actually landed on the target before
				// counting it as replicated.
				_, err := tgt.StatObject(ctx, tgt.Bucket, roi.Name, minio.StatObjectOptions{
					VersionID: roi.VersionID,
					Internal: minio.AdvancedGetOptions{
						ReplicationProxyRequest: "false",
					},
				})
				if err != nil {
					// A MethodNotAllowed on a delete marker means the marker
					// exists on the target, i.e. replication succeeded.
					if roi.DeleteMarker && isErrMethodNotAllowed(ErrorRespToObjectError(err, opts.bucket, roi.Name)) {
						st.ReplicatedCount++
					} else {
						st.FailedCount++
					}
				} else {
					st.ReplicatedCount++
					st.ReplicatedSize += roi.Size
				}
				traceFn(err)
				select {
				case <-ctx.Done():
					return
				case <-s.resyncCancelCh:
					return
				case resultCh <- st:
				}
			}
		}(ctx, i)
	}
	// Producer: feed qualifying versions to workers, sharded by name hash so
	// versions of the same object are processed in walk order by one worker.
	for obj := range objInfoCh {
		select {
		case <-s.resyncCancelCh:
			resyncStatus = ResyncCanceled
			return
		case <-ctx.Done():
			return
		default:
		}
		// When healing, skip everything up to (but not including) the
		// checkpointed object.
		if heal && lastCheckpoint != "" && lastCheckpoint != obj.Name {
			continue
		}
		lastCheckpoint = ""
		roi := getHealReplicateObjectInfo(obj, rcfg)
		if !roi.ExistingObjResync.mustResync() {
			continue
		}
		select {
		case <-s.resyncCancelCh:
			return
		case <-ctx.Done():
			return
		default:
			h := xxh3.HashString(roi.Bucket + roi.Name)
			workers[h%uint64(resyncParallelRoutines)] <- roi
		}
	}
	// Close worker channels and wait for in-flight objects to finish.
	for i := 0; i < resyncParallelRoutines; i++ {
		xioutil.SafeClose(workers[i])
	}
	wg.Wait()
	resyncStatus = ResyncCompleted
}
  2912  
// start replication resync for the remote target ARN specified. It validates
// the request against the bucket's replication config, refuses to start if a
// resync for the same ARN is already pending/running, persists the pending
// status, and launches the resync in the background.
func (s *replicationResyncer) start(ctx context.Context, objAPI ObjectLayer, opts resyncOpts) error {
	if opts.bucket == "" {
		return fmt.Errorf("bucket name is empty")
	}
	if opts.arn == "" {
		return fmt.Errorf("target ARN specified for resync is empty")
	}
	// Ensure the bucket actually has a replication configuration.
	cfg, err := getReplicationConfig(ctx, opts.bucket)
	if err != nil {
		return err
	}
	// The requested ARN must appear in the replication config.
	tgtArns := cfg.FilterTargetArns(
		replication.ObjectOpts{
			OpType:    replication.ResyncReplicationType,
			TargetArn: opts.arn,
		})

	if len(tgtArns) == 0 {
		return fmt.Errorf("arn %s specified for resync not found in replication config", opts.arn)
	}
	// Use the cached status if present; otherwise load it from the backend.
	globalReplicationPool.resyncer.RLock()
	data, ok := globalReplicationPool.resyncer.statusMap[opts.bucket]
	globalReplicationPool.resyncer.RUnlock()
	if !ok {
		data, err = loadBucketResyncMetadata(ctx, opts.bucket, objAPI)
		if err != nil {
			return err
		}
	}
	// validate if resync is in progress for this arn
	for tArn, st := range data.TargetsMap {
		if opts.arn == tArn && (st.ResyncStatus == ResyncStarted || st.ResyncStatus == ResyncPending) {
			return fmt.Errorf("Resync of bucket %s is already in progress for remote bucket %s", opts.bucket, opts.arn)
		}
	}

	// Record the new resync as pending and persist it before kicking off.
	status := TargetReplicationResyncStatus{
		ResyncID:         opts.resyncID,
		ResyncBeforeDate: opts.resyncBefore,
		StartTime:        UTCNow(),
		ResyncStatus:     ResyncPending,
		Bucket:           opts.bucket,
	}
	data.TargetsMap[opts.arn] = status
	if err = saveResyncStatus(ctx, opts.bucket, data, objAPI); err != nil {
		return err
	}

	// Mirror the pending status into the in-memory map under the write lock.
	globalReplicationPool.resyncer.Lock()
	defer globalReplicationPool.resyncer.Unlock()
	brs, ok := globalReplicationPool.resyncer.statusMap[opts.bucket]
	if !ok {
		brs = BucketReplicationResyncStatus{
			Version:    resyncMetaVersion,
			TargetsMap: make(map[string]TargetReplicationResyncStatus),
		}
	}
	brs.TargetsMap[opts.arn] = status
	globalReplicationPool.resyncer.statusMap[opts.bucket] = brs
	// Run the resync with the server-global context so it outlives this request.
	go globalReplicationPool.resyncer.resyncBucket(GlobalContext, objAPI, false, opts)
	return nil
}
  2977  
  2978  func (s *replicationResyncer) trace(resyncID string, path string) func(err error) {
  2979  	startTime := time.Now()
  2980  	return func(err error) {
  2981  		duration := time.Since(startTime)
  2982  		if globalTrace.NumSubscribers(madmin.TraceReplicationResync) > 0 {
  2983  			globalTrace.Publish(replicationResyncTrace(resyncID, startTime, duration, path, err))
  2984  		}
  2985  	}
  2986  }
  2987  
  2988  func replicationResyncTrace(resyncID string, startTime time.Time, duration time.Duration, path string, err error) madmin.TraceInfo {
  2989  	var errStr string
  2990  	if err != nil {
  2991  		errStr = err.Error()
  2992  	}
  2993  	funcName := fmt.Sprintf("replication.(resyncID=%s)", resyncID)
  2994  	return madmin.TraceInfo{
  2995  		TraceType: madmin.TraceReplicationResync,
  2996  		Time:      startTime,
  2997  		NodeName:  globalLocalNodeName,
  2998  		FuncName:  funcName,
  2999  		Duration:  duration,
  3000  		Path:      path,
  3001  		Error:     errStr,
  3002  	}
  3003  }
  3004  
  3005  // delete resync metadata from replication resync state in memory
  3006  func (p *ReplicationPool) deleteResyncMetadata(ctx context.Context, bucket string) {
  3007  	if p == nil {
  3008  		return
  3009  	}
  3010  	p.resyncer.Lock()
  3011  	delete(p.resyncer.statusMap, bucket)
  3012  	defer p.resyncer.Unlock()
  3013  
  3014  	globalSiteResyncMetrics.deleteBucket(bucket)
  3015  }
  3016  
// initResync - initializes bucket replication resync for all buckets.
// The actual loading and restart of interrupted resyncs happens
// asynchronously in startResyncRoutine.
func (p *ReplicationPool) initResync(ctx context.Context, buckets []BucketInfo, objAPI ObjectLayer) error {
	if objAPI == nil {
		return errServerNotInitialized
	}
	// Load bucket metadata sys in background
	go p.startResyncRoutine(ctx, buckets, objAPI)
	return nil
}
  3026  
  3027  func (p *ReplicationPool) startResyncRoutine(ctx context.Context, buckets []BucketInfo, objAPI ObjectLayer) {
  3028  	r := rand.New(rand.NewSource(time.Now().UnixNano()))
  3029  	// Run the replication resync in a loop
  3030  	for {
  3031  		if err := p.loadResync(ctx, buckets, objAPI); err == nil {
  3032  			<-ctx.Done()
  3033  			return
  3034  		}
  3035  		duration := time.Duration(r.Float64() * float64(time.Minute))
  3036  		if duration < time.Second {
  3037  			// Make sure to sleep at least a second to avoid high CPU ticks.
  3038  			duration = time.Second
  3039  		}
  3040  		time.Sleep(duration)
  3041  	}
  3042  }
  3043  
// Loads bucket replication resync statuses into memory and restarts any
// resync that was interrupted (failed, started, or still pending).
func (p *ReplicationPool) loadResync(ctx context.Context, buckets []BucketInfo, objAPI ObjectLayer) error {
	// Make sure only one node running resync on the cluster.
	ctx, cancel := globalLeaderLock.GetLock(ctx)
	defer cancel()

	for index := range buckets {
		bucket := buckets[index].Name

		meta, err := loadBucketResyncMetadata(ctx, bucket, objAPI)
		if err != nil {
			// Missing resync metadata is normal for most buckets; only
			// log unexpected errors.
			if !errors.Is(err, errVolumeNotFound) {
				logger.LogIf(ctx, err)
			}
			continue
		}

		p.resyncer.Lock()
		p.resyncer.statusMap[bucket] = meta
		p.resyncer.Unlock()

		// Restart resyncs that did not complete before the last shutdown.
		tgts := meta.cloneTgtStats()
		for arn, st := range tgts {
			switch st.ResyncStatus {
			case ResyncFailed, ResyncStarted, ResyncPending:
				go p.resyncer.resyncBucket(ctx, objAPI, true, resyncOpts{
					bucket:       bucket,
					arn:          arn,
					resyncID:     st.ResyncID,
					resyncBefore: st.ResyncBeforeDate,
				})
			}
		}
	}
	return nil
}
  3080  
  3081  // load bucket resync metadata from disk
  3082  func loadBucketResyncMetadata(ctx context.Context, bucket string, objAPI ObjectLayer) (brs BucketReplicationResyncStatus, e error) {
  3083  	brs = newBucketResyncStatus(bucket)
  3084  	resyncDirPath := path.Join(bucketMetaPrefix, bucket, replicationDir)
  3085  	data, err := readConfig(GlobalContext, objAPI, pathJoin(resyncDirPath, resyncFileName))
  3086  	if err != nil && err != errConfigNotFound {
  3087  		return brs, err
  3088  	}
  3089  	if len(data) == 0 {
  3090  		// Seems to be empty.
  3091  		return brs, nil
  3092  	}
  3093  	if len(data) <= 4 {
  3094  		return brs, fmt.Errorf("replication resync: no data")
  3095  	}
  3096  	// Read resync meta header
  3097  	switch binary.LittleEndian.Uint16(data[0:2]) {
  3098  	case resyncMetaFormat:
  3099  	default:
  3100  		return brs, fmt.Errorf("resyncMeta: unknown format: %d", binary.LittleEndian.Uint16(data[0:2]))
  3101  	}
  3102  	switch binary.LittleEndian.Uint16(data[2:4]) {
  3103  	case resyncMetaVersion:
  3104  	default:
  3105  		return brs, fmt.Errorf("resyncMeta: unknown version: %d", binary.LittleEndian.Uint16(data[2:4]))
  3106  	}
  3107  	// OK, parse data.
  3108  	if _, err = brs.UnmarshalMsg(data[4:]); err != nil {
  3109  		return brs, err
  3110  	}
  3111  
  3112  	switch brs.Version {
  3113  	case resyncMetaVersionV1:
  3114  	default:
  3115  		return brs, fmt.Errorf("unexpected resync meta version: %d", brs.Version)
  3116  	}
  3117  	return brs, nil
  3118  }
  3119  
  3120  // save resync status to resync.bin
  3121  func saveResyncStatus(ctx context.Context, bucket string, brs BucketReplicationResyncStatus, objectAPI ObjectLayer) error {
  3122  	data := make([]byte, 4, brs.Msgsize()+4)
  3123  
  3124  	// Initialize the resync meta header.
  3125  	binary.LittleEndian.PutUint16(data[0:2], resyncMetaFormat)
  3126  	binary.LittleEndian.PutUint16(data[2:4], resyncMetaVersion)
  3127  
  3128  	buf, err := brs.MarshalMsg(data)
  3129  	if err != nil {
  3130  		return err
  3131  	}
  3132  
  3133  	configFile := path.Join(bucketMetaPrefix, bucket, replicationDir, resyncFileName)
  3134  	return saveConfig(ctx, objectAPI, configFile, buf)
  3135  }
  3136  
// getReplicationDiff returns un-replicated objects in a channel.
// If a non-nil channel is returned it must be consumed fully or
// the provided context must be canceled.
func getReplicationDiff(ctx context.Context, objAPI ObjectLayer, bucket string, opts madmin.ReplDiffOpts) (chan madmin.DiffInfo, error) {
	cfg, err := getReplicationConfig(ctx, bucket)
	if err != nil {
		logger.LogIf(ctx, err)
		return nil, err
	}
	tgts, err := globalBucketTargetSys.ListBucketTargets(ctx, bucket)
	if err != nil {
		logger.LogIf(ctx, err)
		return nil, err
	}

	// Walk feeds object listings into objInfoCh asynchronously; the
	// consumer goroutine below must drain it fully so the walker can exit.
	objInfoCh := make(chan ObjectInfo, 10)
	if err := objAPI.Walk(ctx, bucket, opts.Prefix, objInfoCh, WalkOptions{}); err != nil {
		logger.LogIf(ctx, err)
		return nil, err
	}
	rcfg := replicationConfig{
		Config:  cfg,
		remotes: tgts,
	}
	diffCh := make(chan madmin.DiffInfo, 4000)
	go func() {
		defer xioutil.SafeClose(diffCh)
		for obj := range objInfoCh {
			if contextCanceled(ctx) {
				// Just consume input...
				continue
			}
			// Ignore object prefixes which are excluded
			// from versioning via the MinIO bucket versioning extension.
			if globalBucketVersioningSys.PrefixSuspended(bucket, obj.Name) {
				continue
			}
			roi := getHealReplicateObjectInfo(obj, rcfg)
			switch roi.ReplicationStatus {
			case replication.Completed, replication.Replica:
				// Fully replicated versions are reported only in verbose mode.
				if !opts.Verbose {
					continue
				}
				fallthrough
			default:
				// ignore pre-existing objects that don't satisfy replication rule(s)
				if roi.ReplicationStatus.Empty() && !roi.ExistingObjResync.mustResync() {
					continue
				}
				// Collect per-target replication status, filtered by ARN when given.
				tgtsMap := make(map[string]madmin.TgtDiffInfo)
				for arn, st := range roi.TargetStatuses {
					if opts.ARN == "" || opts.ARN == arn {
						if !opts.Verbose && (st == replication.Completed || st == replication.Replica) {
							continue
						}
						tgtsMap[arn] = madmin.TgtDiffInfo{
							ReplicationStatus: st.String(),
						}
					}
				}
				// Merge per-target delete (purge) replication status into the same map.
				for arn, st := range roi.TargetPurgeStatuses {
					if opts.ARN == "" || opts.ARN == arn {
						if !opts.Verbose && st == Complete {
							continue
						}
						t, ok := tgtsMap[arn]
						if !ok {
							t = madmin.TgtDiffInfo{}
						}
						t.DeleteReplicationStatus = string(st)
						tgtsMap[arn] = t
					}
				}
				select {
				case diffCh <- madmin.DiffInfo{
					Object:                  obj.Name,
					VersionID:               obj.VersionID,
					LastModified:            obj.ModTime,
					IsDeleteMarker:          obj.DeleteMarker,
					ReplicationStatus:       string(roi.ReplicationStatus),
					DeleteReplicationStatus: string(roi.VersionPurgeStatus),
					ReplicationTimestamp:    roi.ReplicationTimestamp,
					Targets:                 tgtsMap,
				}:
				case <-ctx.Done():
					// Keep draining objInfoCh (do not return) so the walker
					// goroutine is not blocked on a full channel.
					continue
				}
			}
		}
	}()
	return diffCh, nil
}
  3229  
  3230  // QueueReplicationHeal is a wrapper for queueReplicationHeal
  3231  func QueueReplicationHeal(ctx context.Context, bucket string, oi ObjectInfo, retryCount int) {
  3232  	// ignore modtime zero objects
  3233  	if oi.ModTime.IsZero() {
  3234  		return
  3235  	}
  3236  	rcfg, _ := getReplicationConfig(ctx, bucket)
  3237  	tgts, _ := globalBucketTargetSys.ListBucketTargets(ctx, bucket)
  3238  	queueReplicationHeal(ctx, bucket, oi, replicationConfig{
  3239  		Config:  rcfg,
  3240  		remotes: tgts,
  3241  	}, retryCount)
  3242  }
  3243  
// queueReplicationHeal enqueues objects that failed replication OR eligible for resyncing through
// an ongoing resync operation or via existing objects replication configuration setting.
func queueReplicationHeal(ctx context.Context, bucket string, oi ObjectInfo, rcfg replicationConfig, retryCount int) (roi ReplicateObjectInfo) {
	// ignore modtime zero objects
	if oi.ModTime.IsZero() {
		return roi
	}

	// Veeam SOS API objects are internal; never queue them for replication.
	if isVeeamSOSAPIObject(oi.Name) {
		return roi
	}
	// Nothing to do without a replication config and resolved remote targets.
	if rcfg.Config == nil || rcfg.remotes == nil {
		return roi
	}
	roi = getHealReplicateObjectInfo(oi, rcfg)
	roi.RetryCount = uint32(retryCount)
	// Skip versions that no replication rule applies to.
	if !roi.Dsc.ReplicateAny() {
		return
	}
	// early return if replication already done, otherwise we need to determine if this
	// version is an existing object that needs healing.
	if oi.ReplicationStatus == replication.Completed && oi.VersionPurgeStatus.Empty() && !roi.ExistingObjResync.mustResync() {
		return
	}

	// Delete markers and versioned deletes follow the delete-replication path.
	if roi.DeleteMarker || !roi.VersionPurgeStatus.Empty() {
		versionID := ""
		dmVersionID := ""
		// An empty purge status means this is a delete marker, so the ID goes
		// under DeleteMarkerVersionID; otherwise it is a versioned delete.
		if roi.VersionPurgeStatus.Empty() {
			dmVersionID = roi.VersionID
		} else {
			versionID = roi.VersionID
		}

		dv := DeletedObjectReplicationInfo{
			DeletedObject: DeletedObject{
				ObjectName:            roi.Name,
				DeleteMarkerVersionID: dmVersionID,
				VersionID:             versionID,
				ReplicationState:      roi.ReplicationState,
				DeleteMarkerMTime:     DeleteMarkerMTime{roi.ModTime},
				DeleteMarker:          roi.DeleteMarker,
			},
			Bucket:    roi.Bucket,
			OpType:    replication.HealReplicationType,
			EventType: ReplicateHealDelete,
		}
		// heal delete marker replication failure or versioned delete replication failure
		if roi.ReplicationStatus == replication.Pending ||
			roi.ReplicationStatus == replication.Failed ||
			roi.VersionPurgeStatus == Failed || roi.VersionPurgeStatus == Pending {
			globalReplicationPool.queueReplicaDeleteTask(dv)
			return
		}
		// if replication status is Complete on DeleteMarker and existing object resync required
		if roi.ExistingObjResync.mustResync() && (roi.ReplicationStatus == replication.Completed || roi.ReplicationStatus.Empty()) {
			queueReplicateDeletesWrapper(dv, roi.ExistingObjResync)
			return
		}
		return
	}
	if roi.ExistingObjResync.mustResync() {
		roi.OpType = replication.ExistingObjectReplicationType
	}
	switch roi.ReplicationStatus {
	case replication.Pending, replication.Failed:
		// Retry failed/pending replication as a heal task.
		roi.EventType = ReplicateHeal
		globalReplicationPool.queueReplicaTask(roi)
		return
	}
	// Existing-object resync for versions with no failed/pending status.
	if roi.ExistingObjResync.mustResync() {
		roi.EventType = ReplicateExisting
		globalReplicationPool.queueReplicaTask(roi)
	}
	return
}
  3320  
const (
	// mrfSaveInterval is how often in-memory MRF (most recently failed
	// replication) entries are flushed to disk.
	mrfSaveInterval  = 5 * time.Minute
	mrfQueueInterval = mrfSaveInterval + time.Minute // A minute higher than save interval

	mrfRetryLimit = 3 // max number of retries before letting scanner catch up on this object version
	// mrfMaxEntries caps the in-memory MRF batch; reaching it forces an
	// early flush to disk.
	mrfMaxEntries = 1000000
)
  3328  
// persistMRF accumulates failed-replication entries from p.mrfSaveCh,
// keyed by version ID, and flushes them to disk every mrfSaveInterval,
// when the batch reaches mrfMaxEntries, or on shutdown.
func (p *ReplicationPool) persistMRF() {
	if !p.initialized() {
		return
	}

	entries := make(map[string]MRFReplicateEntry)
	mTimer := time.NewTimer(mrfSaveInterval)
	defer mTimer.Stop()

	saveMRFToDisk := func() {
		if len(entries) == 0 {
			return
		}

		// queue all entries for healing before overwriting the node mrf file
		if !contextCanceled(p.ctx) {
			p.queueMRFHeal()
		}

		p.saveMRFEntries(p.ctx, entries)

		// Start a fresh batch; the previous map was handed to saveMRFEntries.
		entries = make(map[string]MRFReplicateEntry)
	}
	for {
		select {
		case <-mTimer.C:
			saveMRFToDisk()
			mTimer.Reset(mrfSaveInterval)
		case <-p.ctx.Done():
			// Signal producers to stop, then close the save channel.
			// NOTE(review): queueMRFSave selects on mrfStopCh before sending,
			// but a send racing with this close could in principle panic —
			// confirm the ordering guarantee with the producer side.
			p.mrfStopCh <- struct{}{}
			xioutil.SafeClose(p.mrfSaveCh)
			// We try to save if possible, but we don't care beyond that.
			saveMRFToDisk()
			return
		case e, ok := <-p.mrfSaveCh:
			if !ok {
				return
			}
			entries[e.versionID] = e

			// Force an early flush when the batch grows too large.
			if len(entries) >= mrfMaxEntries {
				saveMRFToDisk()
			}
		}
	}
}
  3375  
// queueMRFSave hands a failed-replication entry to the persistMRF routine
// via p.mrfSaveCh. Entries beyond mrfRetryLimit are dropped so the scanner
// can catch up on them later; entries are also dropped (and counted in
// mrfStats) when the queue is full or the pool is shutting down.
func (p *ReplicationPool) queueMRFSave(entry MRFReplicateEntry) {
	if !p.initialized() {
		return
	}
	if entry.RetryCount > mrfRetryLimit { // let scanner catch up if retry count exceeded
		atomic.AddUint64(&globalReplicationStats.mrfStats.TotalDroppedCount, 1)
		atomic.AddUint64(&globalReplicationStats.mrfStats.TotalDroppedBytes, uint64(entry.sz))
		return
	}

	// Outer select bails out on shutdown signals; inner select performs a
	// non-blocking send so the replication path never stalls on a full queue.
	select {
	case <-GlobalContext.Done():
		return
	case <-p.mrfStopCh:
		return
	default:
		select {
		case p.mrfSaveCh <- entry:
		default:
			atomic.AddUint64(&globalReplicationStats.mrfStats.TotalDroppedCount, 1)
			atomic.AddUint64(&globalReplicationStats.mrfStats.TotalDroppedBytes, uint64(entry.sz))
		}
	}
}
  3400  
  3401  func (p *ReplicationPool) persistToDrive(ctx context.Context, v MRFReplicateEntries) {
  3402  	newReader := func() io.ReadCloser {
  3403  		r, w := io.Pipe()
  3404  		go func() {
  3405  			// Initialize MRF meta header.
  3406  			var data [4]byte
  3407  			binary.LittleEndian.PutUint16(data[0:2], mrfMetaFormat)
  3408  			binary.LittleEndian.PutUint16(data[2:4], mrfMetaVersion)
  3409  			mw := msgp.NewWriter(w)
  3410  			n, err := mw.Write(data[:])
  3411  			if err != nil {
  3412  				w.CloseWithError(err)
  3413  				return
  3414  			}
  3415  			if n != len(data) {
  3416  				w.CloseWithError(io.ErrShortWrite)
  3417  				return
  3418  			}
  3419  			err = v.EncodeMsg(mw)
  3420  			mw.Flush()
  3421  			w.CloseWithError(err)
  3422  		}()
  3423  		return r
  3424  	}
  3425  
  3426  	globalLocalDrivesMu.RLock()
  3427  	localDrives := cloneDrives(globalLocalDrives)
  3428  	globalLocalDrivesMu.RUnlock()
  3429  
  3430  	for _, localDrive := range localDrives {
  3431  		r := newReader()
  3432  		err := localDrive.CreateFile(ctx, "", minioMetaBucket, pathJoin(replicationMRFDir, globalLocalNodeNameHex+".bin"), -1, r)
  3433  		r.Close()
  3434  		if err == nil {
  3435  			break
  3436  		}
  3437  	}
  3438  }
  3439  
  3440  // save mrf entries to nodenamehex.bin
  3441  func (p *ReplicationPool) saveMRFEntries(ctx context.Context, entries map[string]MRFReplicateEntry) {
  3442  	if !p.initialized() {
  3443  		return
  3444  	}
  3445  	atomic.StoreUint64(&globalReplicationStats.mrfStats.LastFailedCount, uint64(len(entries)))
  3446  	if len(entries) == 0 {
  3447  		return
  3448  	}
  3449  
  3450  	v := MRFReplicateEntries{
  3451  		Entries: entries,
  3452  		Version: mrfMetaVersion,
  3453  	}
  3454  
  3455  	p.persistToDrive(ctx, v)
  3456  }
  3457  
  3458  // load mrf entries from disk
  3459  func (p *ReplicationPool) loadMRF() (mrfRec MRFReplicateEntries, err error) {
  3460  	loadMRF := func(rc io.ReadCloser) (re MRFReplicateEntries, err error) {
  3461  		defer rc.Close()
  3462  
  3463  		if !p.initialized() {
  3464  			return re, nil
  3465  		}
  3466  		var data [4]byte
  3467  		n, err := rc.Read(data[:])
  3468  		if err != nil {
  3469  			return re, err
  3470  		}
  3471  		if n != len(data) {
  3472  			return re, errors.New("replication mrf: no data")
  3473  		}
  3474  		// Read resync meta header
  3475  		switch binary.LittleEndian.Uint16(data[0:2]) {
  3476  		case mrfMetaFormat:
  3477  		default:
  3478  			return re, fmt.Errorf("replication mrf: unknown format: %d", binary.LittleEndian.Uint16(data[0:2]))
  3479  		}
  3480  		switch binary.LittleEndian.Uint16(data[2:4]) {
  3481  		case mrfMetaVersion:
  3482  		default:
  3483  			return re, fmt.Errorf("replication mrf: unknown version: %d", binary.LittleEndian.Uint16(data[2:4]))
  3484  		}
  3485  
  3486  		// OK, parse data.
  3487  		// ignore any parsing errors, we do not care this file is generated again anyways.
  3488  		re.DecodeMsg(msgp.NewReader(rc))
  3489  
  3490  		return re, nil
  3491  	}
  3492  
  3493  	globalLocalDrivesMu.RLock()
  3494  	localDrives := cloneDrives(globalLocalDrives)
  3495  	globalLocalDrivesMu.RUnlock()
  3496  
  3497  	for _, localDrive := range localDrives {
  3498  		rc, err := localDrive.ReadFileStream(p.ctx, minioMetaBucket, pathJoin(replicationMRFDir, globalLocalNodeNameHex+".bin"), 0, -1)
  3499  		if err != nil {
  3500  			continue
  3501  		}
  3502  
  3503  		mrfRec, err = loadMRF(rc)
  3504  		if err != nil {
  3505  			continue
  3506  		}
  3507  
  3508  		// finally delete the file after processing mrf entries
  3509  		localDrive.Delete(p.ctx, minioMetaBucket, pathJoin(replicationMRFDir, globalLocalNodeNameHex+".bin"), DeleteOptions{})
  3510  		break
  3511  	}
  3512  
  3513  	return mrfRec, nil
  3514  }
  3515  
  3516  func (p *ReplicationPool) processMRF() {
  3517  	if !p.initialized() {
  3518  		return
  3519  	}
  3520  	pTimer := time.NewTimer(mrfQueueInterval)
  3521  	defer pTimer.Stop()
  3522  	for {
  3523  		select {
  3524  		case <-pTimer.C:
  3525  			// skip healing if all targets are offline
  3526  			var offlineCnt int
  3527  			tgts := globalBucketTargetSys.ListTargets(p.ctx, "", "")
  3528  			for _, tgt := range tgts {
  3529  				if globalBucketTargetSys.isOffline(tgt.URL()) {
  3530  					offlineCnt++
  3531  				}
  3532  			}
  3533  			if len(tgts) == offlineCnt {
  3534  				pTimer.Reset(mrfQueueInterval)
  3535  				continue
  3536  			}
  3537  			if err := p.queueMRFHeal(); err != nil && !osIsNotExist(err) {
  3538  				logger.LogIf(p.ctx, err)
  3539  			}
  3540  			pTimer.Reset(mrfQueueInterval)
  3541  		case <-p.ctx.Done():
  3542  			return
  3543  		}
  3544  	}
  3545  }
  3546  
  3547  // process sends error logs to the heal channel for an attempt to heal replication.
  3548  func (p *ReplicationPool) queueMRFHeal() error {
  3549  	p.mrfMU.Lock()
  3550  	defer p.mrfMU.Unlock()
  3551  
  3552  	if !p.initialized() {
  3553  		return errServerNotInitialized
  3554  	}
  3555  
  3556  	mrfRec, err := p.loadMRF()
  3557  	if err != nil {
  3558  		return err
  3559  	}
  3560  
  3561  	// queue replication heal in a goroutine to avoid holding up mrf save routine
  3562  	go func() {
  3563  		for vID, e := range mrfRec.Entries {
  3564  			ctx, cancel := context.WithTimeout(p.ctx, time.Second) // Do not waste more than a second on this.
  3565  
  3566  			oi, err := p.objLayer.GetObjectInfo(ctx, e.Bucket, e.Object, ObjectOptions{
  3567  				VersionID: vID,
  3568  			})
  3569  			cancel()
  3570  			if err != nil {
  3571  				continue
  3572  			}
  3573  
  3574  			QueueReplicationHeal(p.ctx, e.Bucket, oi, e.RetryCount)
  3575  		}
  3576  	}()
  3577  
  3578  	return nil
  3579  }
  3580  
  3581  func (p *ReplicationPool) initialized() bool {
  3582  	return !(p == nil || p.objLayer == nil)
  3583  }
  3584  
  3585  // getMRF returns MRF entries for this node.
  3586  func (p *ReplicationPool) getMRF(ctx context.Context, bucket string) (ch <-chan madmin.ReplicationMRF, err error) {
  3587  	mrfRec, err := p.loadMRF()
  3588  	if err != nil {
  3589  		return nil, err
  3590  	}
  3591  
  3592  	mrfCh := make(chan madmin.ReplicationMRF, 100)
  3593  	go func() {
  3594  		defer xioutil.SafeClose(mrfCh)
  3595  		for vID, e := range mrfRec.Entries {
  3596  			if bucket != "" && e.Bucket != bucket {
  3597  				continue
  3598  			}
  3599  			select {
  3600  			case mrfCh <- madmin.ReplicationMRF{
  3601  				NodeName:   globalLocalNodeName,
  3602  				Object:     e.Object,
  3603  				VersionID:  vID,
  3604  				Bucket:     e.Bucket,
  3605  				RetryCount: e.RetryCount,
  3606  			}:
  3607  			case <-ctx.Done():
  3608  				return
  3609  			}
  3610  		}
  3611  	}()
  3612  
  3613  	return mrfCh, nil
  3614  }