// go.temporal.io/server@v1.23.0/common/persistence/namespace_replication_queue.go

// The MIT License
//
// Copyright (c) 2020 Temporal Technologies Inc.  All rights reserved.
//
// Copyright (c) 2020 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
24 25 //go:generate mockgen -copyright_file ../../LICENSE -package $GOPACKAGE -source $GOFILE -destination namespace_replication_queue_mock.go 26 27 package persistence 28 29 import ( 30 "context" 31 "fmt" 32 "sync/atomic" 33 "time" 34 35 commonpb "go.temporal.io/api/common/v1" 36 enumspb "go.temporal.io/api/enums/v1" 37 38 "go.temporal.io/server/api/persistence/v1" 39 "go.temporal.io/server/internal/goro" 40 41 replicationspb "go.temporal.io/server/api/replication/v1" 42 "go.temporal.io/server/common" 43 "go.temporal.io/server/common/convert" 44 "go.temporal.io/server/common/headers" 45 "go.temporal.io/server/common/log" 46 "go.temporal.io/server/common/log/tag" 47 "go.temporal.io/server/common/metrics" 48 "go.temporal.io/server/common/persistence/serialization" 49 ) 50 51 const ( 52 purgeInterval = 5 * time.Minute 53 localNamespaceReplicationCluster = "namespaceReplication" 54 ) 55 56 var _ NamespaceReplicationQueue = (*namespaceReplicationQueueImpl)(nil) 57 58 // NewNamespaceReplicationQueue creates a new NamespaceReplicationQueue instance 59 func NewNamespaceReplicationQueue( 60 queue Queue, 61 serializer serialization.Serializer, 62 clusterName string, 63 metricsHandler metrics.Handler, 64 logger log.Logger, 65 ) (NamespaceReplicationQueue, error) { 66 67 blob, err := serializer.QueueMetadataToBlob( 68 &persistence.QueueMetadata{ 69 ClusterAckLevels: make(map[string]int64), 70 }, enumspb.ENCODING_TYPE_PROTO3) 71 if err != nil { 72 return nil, err 73 } 74 err = queue.Init(context.TODO(), blob) 75 if err != nil { 76 return nil, err 77 } 78 79 return &namespaceReplicationQueueImpl{ 80 queue: queue, 81 clusterName: clusterName, 82 metricsHandler: metricsHandler, 83 logger: logger, 84 ackNotificationChan: make(chan bool), 85 done: make(chan bool), 86 status: common.DaemonStatusInitialized, 87 serializer: serializer, 88 }, nil 89 } 90 91 type ( 92 namespaceReplicationQueueImpl struct { 93 queue Queue 94 clusterName string 95 metricsHandler metrics.Handler 96 logger 
log.Logger 97 ackLevelUpdated bool 98 ackNotificationChan chan bool 99 done chan bool 100 status int32 101 gorogrp goro.Group 102 serializer serialization.Serializer 103 } 104 105 // NamespaceReplicationQueue is used to publish and list namespace replication tasks 106 NamespaceReplicationQueue interface { 107 Publish(ctx context.Context, task *replicationspb.ReplicationTask) error 108 GetReplicationMessages( 109 ctx context.Context, 110 lastMessageID int64, 111 maxCount int, 112 ) ([]*replicationspb.ReplicationTask, int64, error) 113 UpdateAckLevel(ctx context.Context, lastProcessedMessageID int64, clusterName string) error 114 GetAckLevels(ctx context.Context) (map[string]int64, error) 115 116 PublishToDLQ(ctx context.Context, task *replicationspb.ReplicationTask) error 117 GetMessagesFromDLQ( 118 ctx context.Context, 119 firstMessageID int64, 120 lastMessageID int64, 121 pageSize int, 122 pageToken []byte, 123 ) ([]*replicationspb.ReplicationTask, []byte, error) 124 UpdateDLQAckLevel(ctx context.Context, lastProcessedMessageID int64) error 125 GetDLQAckLevel(ctx context.Context) (int64, error) 126 127 RangeDeleteMessagesFromDLQ(ctx context.Context, firstMessageID int64, lastMessageID int64) error 128 DeleteMessageFromDLQ(ctx context.Context, messageID int64) error 129 Start() 130 Stop() 131 } 132 ) 133 134 func (q *namespaceReplicationQueueImpl) Start() { 135 if !atomic.CompareAndSwapInt32(&q.status, common.DaemonStatusInitialized, common.DaemonStatusStarted) { 136 return 137 } 138 139 q.gorogrp.Go(q.purgeProcessor) 140 } 141 142 func (q *namespaceReplicationQueueImpl) Stop() { 143 if !atomic.CompareAndSwapInt32(&q.status, common.DaemonStatusStarted, common.DaemonStatusStopped) { 144 return 145 } 146 close(q.done) 147 148 q.gorogrp.Cancel() 149 } 150 151 func (q *namespaceReplicationQueueImpl) Publish(ctx context.Context, task *replicationspb.ReplicationTask) error { 152 blob, err := q.serializer.ReplicationTaskToBlob(task, enumspb.ENCODING_TYPE_PROTO3) 153 if 
err != nil { 154 return fmt.Errorf("failed to encode message: %v", err) 155 } 156 return q.queue.EnqueueMessage(ctx, blob) 157 } 158 159 func (q *namespaceReplicationQueueImpl) PublishToDLQ(ctx context.Context, task *replicationspb.ReplicationTask) error { 160 blob, err := q.serializer.ReplicationTaskToBlob(task, enumspb.ENCODING_TYPE_PROTO3) 161 if err != nil { 162 return fmt.Errorf("failed to encode message: %v", err) 163 } 164 messageID, err := q.queue.EnqueueMessageToDLQ(ctx, blob) 165 if err != nil { 166 return err 167 } 168 169 q.metricsHandler.Gauge(metrics.NamespaceReplicationDLQMaxLevelGauge.Name()). 170 Record(float64(messageID), metrics.OperationTag(metrics.PersistenceNamespaceReplicationQueueScope)) 171 return nil 172 } 173 174 func (q *namespaceReplicationQueueImpl) GetReplicationMessages( 175 ctx context.Context, 176 lastMessageID int64, 177 pageSize int, 178 ) ([]*replicationspb.ReplicationTask, int64, error) { 179 180 messages, err := q.queue.ReadMessages(ctx, lastMessageID, pageSize) 181 if err != nil { 182 return nil, lastMessageID, err 183 } 184 185 replicationTasks := make([]*replicationspb.ReplicationTask, 0, len(messages)) 186 for _, message := range messages { 187 replicationTask, err := q.serializer.ReplicationTaskFromBlob(NewDataBlob(message.Data, message.Encoding)) 188 if err != nil { 189 return nil, lastMessageID, fmt.Errorf("failed to decode task: %v", err) 190 } 191 192 lastMessageID = message.ID 193 replicationTasks = append(replicationTasks, replicationTask) 194 } 195 196 return replicationTasks, lastMessageID, nil 197 } 198 199 func (q *namespaceReplicationQueueImpl) UpdateAckLevel( 200 ctx context.Context, 201 lastProcessedMessageID int64, 202 clusterName string, 203 ) error { 204 return q.updateAckLevelWithRetry(ctx, lastProcessedMessageID, clusterName, false) 205 } 206 207 func (q *namespaceReplicationQueueImpl) updateAckLevelWithRetry( 208 ctx context.Context, 209 lastProcessedMessageID int64, 210 clusterName string, 211 isDLQ 
bool, 212 ) error { 213 conditionFailedRetry: 214 for { 215 err := q.updateAckLevel(ctx, lastProcessedMessageID, clusterName, isDLQ) 216 switch err.(type) { 217 case *ConditionFailedError: 218 continue conditionFailedRetry 219 } 220 221 return err 222 } 223 } 224 225 func (q *namespaceReplicationQueueImpl) updateAckLevel( 226 ctx context.Context, 227 lastProcessedMessageID int64, 228 clusterName string, 229 isDLQ bool, 230 ) error { 231 var ackLevelErr error 232 var internalMetadata *InternalQueueMetadata 233 if isDLQ { 234 internalMetadata, ackLevelErr = q.queue.GetDLQAckLevels(ctx) 235 } else { 236 internalMetadata, ackLevelErr = q.queue.GetAckLevels(ctx) 237 } 238 239 if ackLevelErr != nil { 240 return ackLevelErr 241 } 242 243 ackLevels, err := q.ackLevelsFromBlob(internalMetadata.Blob) 244 if err != nil { 245 return err 246 } 247 248 // Ignore possibly delayed message 249 if ack, ok := ackLevels[clusterName]; ok && ack > lastProcessedMessageID { 250 return nil 251 } 252 253 // TODO remove this block in 1.12.x 254 delete(ackLevels, "") 255 // TODO remove this block in 1.12.x 256 257 // update ack level 258 ackLevels[clusterName] = lastProcessedMessageID 259 blob, err := q.serializer.QueueMetadataToBlob(&persistence.QueueMetadata{ 260 ClusterAckLevels: ackLevels, 261 }, enumspb.ENCODING_TYPE_PROTO3) 262 if err != nil { 263 return err 264 } 265 266 internalMetadata.Blob = blob 267 if isDLQ { 268 err = q.queue.UpdateDLQAckLevel(ctx, internalMetadata) 269 } else { 270 err = q.queue.UpdateAckLevel(ctx, internalMetadata) 271 } 272 if err != nil { 273 return fmt.Errorf("failed to update ack level: %v", err) 274 } 275 276 select { 277 case q.ackNotificationChan <- true: 278 default: 279 } 280 281 return nil 282 } 283 284 func (q *namespaceReplicationQueueImpl) GetAckLevels( 285 ctx context.Context, 286 ) (map[string]int64, error) { 287 metadata, err := q.queue.GetAckLevels(ctx) 288 if err != nil { 289 return nil, err 290 } 291 return q.ackLevelsFromBlob(metadata.Blob) 
292 } 293 294 func (q *namespaceReplicationQueueImpl) ackLevelsFromBlob(blob *commonpb.DataBlob) (map[string]int64, error) { 295 if blob == nil { 296 return make(map[string]int64), nil 297 } 298 299 metadata, err := q.serializer.QueueMetadataFromBlob(blob) 300 if err != nil { 301 return nil, err 302 } 303 ackLevels := metadata.ClusterAckLevels 304 if ackLevels == nil { 305 ackLevels = make(map[string]int64) 306 } 307 return ackLevels, nil 308 } 309 310 func (q *namespaceReplicationQueueImpl) GetMessagesFromDLQ( 311 ctx context.Context, 312 firstMessageID int64, 313 lastMessageID int64, 314 pageSize int, 315 pageToken []byte, 316 ) ([]*replicationspb.ReplicationTask, []byte, error) { 317 318 messages, token, err := q.queue.ReadMessagesFromDLQ(ctx, firstMessageID, lastMessageID, pageSize, pageToken) 319 if err != nil { 320 return nil, nil, err 321 } 322 323 var replicationTasks []*replicationspb.ReplicationTask 324 for _, message := range messages { 325 replicationTask, err := q.serializer.ReplicationTaskFromBlob(NewDataBlob(message.Data, message.Encoding)) 326 if err != nil { 327 return nil, nil, fmt.Errorf("failed to decode dlq task: %v", err) 328 } 329 330 // Overwrite to local cluster message id 331 replicationTask.SourceTaskId = message.ID 332 replicationTasks = append(replicationTasks, replicationTask) 333 } 334 335 return replicationTasks, token, nil 336 } 337 338 func (q *namespaceReplicationQueueImpl) UpdateDLQAckLevel( 339 ctx context.Context, 340 lastProcessedMessageID int64, 341 ) error { 342 return q.updateAckLevelWithRetry(ctx, lastProcessedMessageID, localNamespaceReplicationCluster, true) 343 } 344 345 func (q *namespaceReplicationQueueImpl) GetDLQAckLevel( 346 ctx context.Context, 347 ) (int64, error) { 348 metadata, err := q.queue.GetDLQAckLevels(ctx) 349 if err != nil { 350 return EmptyQueueMessageID, err 351 } 352 dlqMetadata, err := q.ackLevelsFromBlob(metadata.Blob) 353 if err != nil { 354 return EmptyQueueMessageID, err 355 } 356 357 ackLevel, 
ok := dlqMetadata[localNamespaceReplicationCluster] 358 if !ok { 359 return EmptyQueueMessageID, nil 360 } 361 return ackLevel, nil 362 } 363 364 func (q *namespaceReplicationQueueImpl) RangeDeleteMessagesFromDLQ( 365 ctx context.Context, 366 firstMessageID int64, 367 lastMessageID int64, 368 ) error { 369 370 return q.queue.RangeDeleteMessagesFromDLQ( 371 ctx, 372 firstMessageID, 373 lastMessageID, 374 ) 375 } 376 377 func (q *namespaceReplicationQueueImpl) DeleteMessageFromDLQ( 378 ctx context.Context, 379 messageID int64, 380 ) error { 381 382 return q.queue.DeleteMessageFromDLQ(ctx, messageID) 383 } 384 385 func (q *namespaceReplicationQueueImpl) purgeAckedMessages( 386 ctx context.Context, 387 ) error { 388 ackLevelByCluster, err := q.GetAckLevels(ctx) 389 if err != nil { 390 return fmt.Errorf("failed to purge messages: %v", err) 391 } 392 393 if len(ackLevelByCluster) == 0 { 394 return nil 395 } 396 397 var minAckLevel *int64 398 for _, ackLevel := range ackLevelByCluster { 399 if minAckLevel == nil || ackLevel < *minAckLevel { 400 minAckLevel = convert.Int64Ptr(ackLevel) 401 } 402 } 403 if minAckLevel == nil { 404 return nil 405 } 406 407 err = q.queue.DeleteMessagesBefore(ctx, *minAckLevel) 408 if err != nil { 409 return fmt.Errorf("failed to purge messages: %v", err) 410 } 411 q.metricsHandler.Gauge(metrics.NamespaceReplicationTaskAckLevelGauge.Name()). 
412 Record(float64(*minAckLevel), metrics.OperationTag(metrics.PersistenceNamespaceReplicationQueueScope)) 413 return nil 414 } 415 416 func (q *namespaceReplicationQueueImpl) purgeProcessor( 417 ctx context.Context, 418 ) error { 419 ctx = headers.SetCallerInfo(ctx, headers.SystemPreemptableCallerInfo) 420 421 ticker := time.NewTicker(purgeInterval) 422 defer ticker.Stop() 423 424 for { 425 select { 426 case <-q.done: 427 return nil 428 case <-ticker.C: 429 if q.ackLevelUpdated { 430 err := q.purgeAckedMessages(ctx) 431 if err != nil { 432 q.logger.Warn("Failed to purge acked namespace replication messages.", tag.Error(err)) 433 } else { 434 q.ackLevelUpdated = false 435 } 436 } 437 case <-q.ackNotificationChan: 438 q.ackLevelUpdated = true 439 } 440 } 441 }