go.temporal.io/server@v1.23.0/common/persistence/namespace_replication_queue.go (about)

     1  // The MIT License
     2  //
     3  // Copyright (c) 2020 Temporal Technologies Inc.  All rights reserved.
     4  //
     5  // Copyright (c) 2020 Uber Technologies, Inc.
     6  //
     7  // Permission is hereby granted, free of charge, to any person obtaining a copy
     8  // of this software and associated documentation files (the "Software"), to deal
     9  // in the Software without restriction, including without limitation the rights
    10  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    11  // copies of the Software, and to permit persons to whom the Software is
    12  // furnished to do so, subject to the following conditions:
    13  //
    14  // The above copyright notice and this permission notice shall be included in
    15  // all copies or substantial portions of the Software.
    16  //
    17  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    18  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    19  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    20  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    21  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    22  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    23  // THE SOFTWARE.
    24  
    25  //go:generate mockgen -copyright_file ../../LICENSE -package $GOPACKAGE -source $GOFILE -destination namespace_replication_queue_mock.go
    26  
    27  package persistence
    28  
    29  import (
    30  	"context"
    31  	"fmt"
    32  	"sync/atomic"
    33  	"time"
    34  
    35  	commonpb "go.temporal.io/api/common/v1"
    36  	enumspb "go.temporal.io/api/enums/v1"
    37  
    38  	"go.temporal.io/server/api/persistence/v1"
    39  	"go.temporal.io/server/internal/goro"
    40  
    41  	replicationspb "go.temporal.io/server/api/replication/v1"
    42  	"go.temporal.io/server/common"
    43  	"go.temporal.io/server/common/convert"
    44  	"go.temporal.io/server/common/headers"
    45  	"go.temporal.io/server/common/log"
    46  	"go.temporal.io/server/common/log/tag"
    47  	"go.temporal.io/server/common/metrics"
    48  	"go.temporal.io/server/common/persistence/serialization"
    49  )
    50  
    51  const (
    52  	purgeInterval                    = 5 * time.Minute
    53  	localNamespaceReplicationCluster = "namespaceReplication"
    54  )
    55  
// Compile-time assertion that *namespaceReplicationQueueImpl satisfies the
// NamespaceReplicationQueue interface.
var _ NamespaceReplicationQueue = (*namespaceReplicationQueueImpl)(nil)
    57  
    58  // NewNamespaceReplicationQueue creates a new NamespaceReplicationQueue instance
    59  func NewNamespaceReplicationQueue(
    60  	queue Queue,
    61  	serializer serialization.Serializer,
    62  	clusterName string,
    63  	metricsHandler metrics.Handler,
    64  	logger log.Logger,
    65  ) (NamespaceReplicationQueue, error) {
    66  
    67  	blob, err := serializer.QueueMetadataToBlob(
    68  		&persistence.QueueMetadata{
    69  			ClusterAckLevels: make(map[string]int64),
    70  		}, enumspb.ENCODING_TYPE_PROTO3)
    71  	if err != nil {
    72  		return nil, err
    73  	}
    74  	err = queue.Init(context.TODO(), blob)
    75  	if err != nil {
    76  		return nil, err
    77  	}
    78  
    79  	return &namespaceReplicationQueueImpl{
    80  		queue:               queue,
    81  		clusterName:         clusterName,
    82  		metricsHandler:      metricsHandler,
    83  		logger:              logger,
    84  		ackNotificationChan: make(chan bool),
    85  		done:                make(chan bool),
    86  		status:              common.DaemonStatusInitialized,
    87  		serializer:          serializer,
    88  	}, nil
    89  }
    90  
    91  type (
    92  	namespaceReplicationQueueImpl struct {
    93  		queue               Queue
    94  		clusterName         string
    95  		metricsHandler      metrics.Handler
    96  		logger              log.Logger
    97  		ackLevelUpdated     bool
    98  		ackNotificationChan chan bool
    99  		done                chan bool
   100  		status              int32
   101  		gorogrp             goro.Group
   102  		serializer          serialization.Serializer
   103  	}
   104  
   105  	// NamespaceReplicationQueue is used to publish and list namespace replication tasks
   106  	NamespaceReplicationQueue interface {
   107  		Publish(ctx context.Context, task *replicationspb.ReplicationTask) error
   108  		GetReplicationMessages(
   109  			ctx context.Context,
   110  			lastMessageID int64,
   111  			maxCount int,
   112  		) ([]*replicationspb.ReplicationTask, int64, error)
   113  		UpdateAckLevel(ctx context.Context, lastProcessedMessageID int64, clusterName string) error
   114  		GetAckLevels(ctx context.Context) (map[string]int64, error)
   115  
   116  		PublishToDLQ(ctx context.Context, task *replicationspb.ReplicationTask) error
   117  		GetMessagesFromDLQ(
   118  			ctx context.Context,
   119  			firstMessageID int64,
   120  			lastMessageID int64,
   121  			pageSize int,
   122  			pageToken []byte,
   123  		) ([]*replicationspb.ReplicationTask, []byte, error)
   124  		UpdateDLQAckLevel(ctx context.Context, lastProcessedMessageID int64) error
   125  		GetDLQAckLevel(ctx context.Context) (int64, error)
   126  
   127  		RangeDeleteMessagesFromDLQ(ctx context.Context, firstMessageID int64, lastMessageID int64) error
   128  		DeleteMessageFromDLQ(ctx context.Context, messageID int64) error
   129  		Start()
   130  		Stop()
   131  	}
   132  )
   133  
   134  func (q *namespaceReplicationQueueImpl) Start() {
   135  	if !atomic.CompareAndSwapInt32(&q.status, common.DaemonStatusInitialized, common.DaemonStatusStarted) {
   136  		return
   137  	}
   138  
   139  	q.gorogrp.Go(q.purgeProcessor)
   140  }
   141  
   142  func (q *namespaceReplicationQueueImpl) Stop() {
   143  	if !atomic.CompareAndSwapInt32(&q.status, common.DaemonStatusStarted, common.DaemonStatusStopped) {
   144  		return
   145  	}
   146  	close(q.done)
   147  
   148  	q.gorogrp.Cancel()
   149  }
   150  
   151  func (q *namespaceReplicationQueueImpl) Publish(ctx context.Context, task *replicationspb.ReplicationTask) error {
   152  	blob, err := q.serializer.ReplicationTaskToBlob(task, enumspb.ENCODING_TYPE_PROTO3)
   153  	if err != nil {
   154  		return fmt.Errorf("failed to encode message: %v", err)
   155  	}
   156  	return q.queue.EnqueueMessage(ctx, blob)
   157  }
   158  
   159  func (q *namespaceReplicationQueueImpl) PublishToDLQ(ctx context.Context, task *replicationspb.ReplicationTask) error {
   160  	blob, err := q.serializer.ReplicationTaskToBlob(task, enumspb.ENCODING_TYPE_PROTO3)
   161  	if err != nil {
   162  		return fmt.Errorf("failed to encode message: %v", err)
   163  	}
   164  	messageID, err := q.queue.EnqueueMessageToDLQ(ctx, blob)
   165  	if err != nil {
   166  		return err
   167  	}
   168  
   169  	q.metricsHandler.Gauge(metrics.NamespaceReplicationDLQMaxLevelGauge.Name()).
   170  		Record(float64(messageID), metrics.OperationTag(metrics.PersistenceNamespaceReplicationQueueScope))
   171  	return nil
   172  }
   173  
   174  func (q *namespaceReplicationQueueImpl) GetReplicationMessages(
   175  	ctx context.Context,
   176  	lastMessageID int64,
   177  	pageSize int,
   178  ) ([]*replicationspb.ReplicationTask, int64, error) {
   179  
   180  	messages, err := q.queue.ReadMessages(ctx, lastMessageID, pageSize)
   181  	if err != nil {
   182  		return nil, lastMessageID, err
   183  	}
   184  
   185  	replicationTasks := make([]*replicationspb.ReplicationTask, 0, len(messages))
   186  	for _, message := range messages {
   187  		replicationTask, err := q.serializer.ReplicationTaskFromBlob(NewDataBlob(message.Data, message.Encoding))
   188  		if err != nil {
   189  			return nil, lastMessageID, fmt.Errorf("failed to decode task: %v", err)
   190  		}
   191  
   192  		lastMessageID = message.ID
   193  		replicationTasks = append(replicationTasks, replicationTask)
   194  	}
   195  
   196  	return replicationTasks, lastMessageID, nil
   197  }
   198  
   199  func (q *namespaceReplicationQueueImpl) UpdateAckLevel(
   200  	ctx context.Context,
   201  	lastProcessedMessageID int64,
   202  	clusterName string,
   203  ) error {
   204  	return q.updateAckLevelWithRetry(ctx, lastProcessedMessageID, clusterName, false)
   205  }
   206  
   207  func (q *namespaceReplicationQueueImpl) updateAckLevelWithRetry(
   208  	ctx context.Context,
   209  	lastProcessedMessageID int64,
   210  	clusterName string,
   211  	isDLQ bool,
   212  ) error {
   213  conditionFailedRetry:
   214  	for {
   215  		err := q.updateAckLevel(ctx, lastProcessedMessageID, clusterName, isDLQ)
   216  		switch err.(type) {
   217  		case *ConditionFailedError:
   218  			continue conditionFailedRetry
   219  		}
   220  
   221  		return err
   222  	}
   223  }
   224  
   225  func (q *namespaceReplicationQueueImpl) updateAckLevel(
   226  	ctx context.Context,
   227  	lastProcessedMessageID int64,
   228  	clusterName string,
   229  	isDLQ bool,
   230  ) error {
   231  	var ackLevelErr error
   232  	var internalMetadata *InternalQueueMetadata
   233  	if isDLQ {
   234  		internalMetadata, ackLevelErr = q.queue.GetDLQAckLevels(ctx)
   235  	} else {
   236  		internalMetadata, ackLevelErr = q.queue.GetAckLevels(ctx)
   237  	}
   238  
   239  	if ackLevelErr != nil {
   240  		return ackLevelErr
   241  	}
   242  
   243  	ackLevels, err := q.ackLevelsFromBlob(internalMetadata.Blob)
   244  	if err != nil {
   245  		return err
   246  	}
   247  
   248  	// Ignore possibly delayed message
   249  	if ack, ok := ackLevels[clusterName]; ok && ack > lastProcessedMessageID {
   250  		return nil
   251  	}
   252  
   253  	// TODO remove this block in 1.12.x
   254  	delete(ackLevels, "")
   255  	// TODO remove this block in 1.12.x
   256  
   257  	// update ack level
   258  	ackLevels[clusterName] = lastProcessedMessageID
   259  	blob, err := q.serializer.QueueMetadataToBlob(&persistence.QueueMetadata{
   260  		ClusterAckLevels: ackLevels,
   261  	}, enumspb.ENCODING_TYPE_PROTO3)
   262  	if err != nil {
   263  		return err
   264  	}
   265  
   266  	internalMetadata.Blob = blob
   267  	if isDLQ {
   268  		err = q.queue.UpdateDLQAckLevel(ctx, internalMetadata)
   269  	} else {
   270  		err = q.queue.UpdateAckLevel(ctx, internalMetadata)
   271  	}
   272  	if err != nil {
   273  		return fmt.Errorf("failed to update ack level: %v", err)
   274  	}
   275  
   276  	select {
   277  	case q.ackNotificationChan <- true:
   278  	default:
   279  	}
   280  
   281  	return nil
   282  }
   283  
   284  func (q *namespaceReplicationQueueImpl) GetAckLevels(
   285  	ctx context.Context,
   286  ) (map[string]int64, error) {
   287  	metadata, err := q.queue.GetAckLevels(ctx)
   288  	if err != nil {
   289  		return nil, err
   290  	}
   291  	return q.ackLevelsFromBlob(metadata.Blob)
   292  }
   293  
   294  func (q *namespaceReplicationQueueImpl) ackLevelsFromBlob(blob *commonpb.DataBlob) (map[string]int64, error) {
   295  	if blob == nil {
   296  		return make(map[string]int64), nil
   297  	}
   298  
   299  	metadata, err := q.serializer.QueueMetadataFromBlob(blob)
   300  	if err != nil {
   301  		return nil, err
   302  	}
   303  	ackLevels := metadata.ClusterAckLevels
   304  	if ackLevels == nil {
   305  		ackLevels = make(map[string]int64)
   306  	}
   307  	return ackLevels, nil
   308  }
   309  
   310  func (q *namespaceReplicationQueueImpl) GetMessagesFromDLQ(
   311  	ctx context.Context,
   312  	firstMessageID int64,
   313  	lastMessageID int64,
   314  	pageSize int,
   315  	pageToken []byte,
   316  ) ([]*replicationspb.ReplicationTask, []byte, error) {
   317  
   318  	messages, token, err := q.queue.ReadMessagesFromDLQ(ctx, firstMessageID, lastMessageID, pageSize, pageToken)
   319  	if err != nil {
   320  		return nil, nil, err
   321  	}
   322  
   323  	var replicationTasks []*replicationspb.ReplicationTask
   324  	for _, message := range messages {
   325  		replicationTask, err := q.serializer.ReplicationTaskFromBlob(NewDataBlob(message.Data, message.Encoding))
   326  		if err != nil {
   327  			return nil, nil, fmt.Errorf("failed to decode dlq task: %v", err)
   328  		}
   329  
   330  		// Overwrite to local cluster message id
   331  		replicationTask.SourceTaskId = message.ID
   332  		replicationTasks = append(replicationTasks, replicationTask)
   333  	}
   334  
   335  	return replicationTasks, token, nil
   336  }
   337  
   338  func (q *namespaceReplicationQueueImpl) UpdateDLQAckLevel(
   339  	ctx context.Context,
   340  	lastProcessedMessageID int64,
   341  ) error {
   342  	return q.updateAckLevelWithRetry(ctx, lastProcessedMessageID, localNamespaceReplicationCluster, true)
   343  }
   344  
   345  func (q *namespaceReplicationQueueImpl) GetDLQAckLevel(
   346  	ctx context.Context,
   347  ) (int64, error) {
   348  	metadata, err := q.queue.GetDLQAckLevels(ctx)
   349  	if err != nil {
   350  		return EmptyQueueMessageID, err
   351  	}
   352  	dlqMetadata, err := q.ackLevelsFromBlob(metadata.Blob)
   353  	if err != nil {
   354  		return EmptyQueueMessageID, err
   355  	}
   356  
   357  	ackLevel, ok := dlqMetadata[localNamespaceReplicationCluster]
   358  	if !ok {
   359  		return EmptyQueueMessageID, nil
   360  	}
   361  	return ackLevel, nil
   362  }
   363  
   364  func (q *namespaceReplicationQueueImpl) RangeDeleteMessagesFromDLQ(
   365  	ctx context.Context,
   366  	firstMessageID int64,
   367  	lastMessageID int64,
   368  ) error {
   369  
   370  	return q.queue.RangeDeleteMessagesFromDLQ(
   371  		ctx,
   372  		firstMessageID,
   373  		lastMessageID,
   374  	)
   375  }
   376  
   377  func (q *namespaceReplicationQueueImpl) DeleteMessageFromDLQ(
   378  	ctx context.Context,
   379  	messageID int64,
   380  ) error {
   381  
   382  	return q.queue.DeleteMessageFromDLQ(ctx, messageID)
   383  }
   384  
   385  func (q *namespaceReplicationQueueImpl) purgeAckedMessages(
   386  	ctx context.Context,
   387  ) error {
   388  	ackLevelByCluster, err := q.GetAckLevels(ctx)
   389  	if err != nil {
   390  		return fmt.Errorf("failed to purge messages: %v", err)
   391  	}
   392  
   393  	if len(ackLevelByCluster) == 0 {
   394  		return nil
   395  	}
   396  
   397  	var minAckLevel *int64
   398  	for _, ackLevel := range ackLevelByCluster {
   399  		if minAckLevel == nil || ackLevel < *minAckLevel {
   400  			minAckLevel = convert.Int64Ptr(ackLevel)
   401  		}
   402  	}
   403  	if minAckLevel == nil {
   404  		return nil
   405  	}
   406  
   407  	err = q.queue.DeleteMessagesBefore(ctx, *minAckLevel)
   408  	if err != nil {
   409  		return fmt.Errorf("failed to purge messages: %v", err)
   410  	}
   411  	q.metricsHandler.Gauge(metrics.NamespaceReplicationTaskAckLevelGauge.Name()).
   412  		Record(float64(*minAckLevel), metrics.OperationTag(metrics.PersistenceNamespaceReplicationQueueScope))
   413  	return nil
   414  }
   415  
   416  func (q *namespaceReplicationQueueImpl) purgeProcessor(
   417  	ctx context.Context,
   418  ) error {
   419  	ctx = headers.SetCallerInfo(ctx, headers.SystemPreemptableCallerInfo)
   420  
   421  	ticker := time.NewTicker(purgeInterval)
   422  	defer ticker.Stop()
   423  
   424  	for {
   425  		select {
   426  		case <-q.done:
   427  			return nil
   428  		case <-ticker.C:
   429  			if q.ackLevelUpdated {
   430  				err := q.purgeAckedMessages(ctx)
   431  				if err != nil {
   432  					q.logger.Warn("Failed to purge acked namespace replication messages.", tag.Error(err))
   433  				} else {
   434  					q.ackLevelUpdated = false
   435  				}
   436  			}
   437  		case <-q.ackNotificationChan:
   438  			q.ackLevelUpdated = true
   439  		}
   440  	}
   441  }