github.com/matrixorigin/matrixone@v0.7.0/pkg/txn/service/service.go

// Copyright 2021 - 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package service

import (
	"bytes"
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/fagongzi/util/hack"
	"github.com/matrixorigin/matrixone/pkg/common/log"
	"github.com/matrixorigin/matrixone/pkg/common/runtime"
	"github.com/matrixorigin/matrixone/pkg/common/stopper"
	"github.com/matrixorigin/matrixone/pkg/pb/metadata"
	"github.com/matrixorigin/matrixone/pkg/pb/txn"
	"github.com/matrixorigin/matrixone/pkg/txn/rpc"
	"github.com/matrixorigin/matrixone/pkg/txn/storage"
	"github.com/matrixorigin/matrixone/pkg/txn/util"
	"go.uber.org/multierr"
	"go.uber.org/zap"
)

var _ TxnService = (*service)(nil)

type service struct {
	rt      runtime.Runtime
	logger  *log.MOLogger
	shard   metadata.DNShard
	storage storage.TxnStorage
	sender  rpc.TxnSender
	stopper *stopper.Stopper

	// TxnService maintains an in-memory sync.Map that records all running transactions. The metadata for a write
	// transaction is initialized and written to the map when the TxnService receives the transaction's first write
	// operation, and the entry is removed after the transaction has been committed or aborted.
	//
	// When a Read operation encounters data written by a transaction in the Committing or Prepared state, it
	// cannot yet tell whether that data is visible to the reading transaction, so it must wait for the writing
	// transaction to reach its final state. This is why all running write transactions are tracked here: the
	// blocked Read operation is notified once the writing transaction is committed or aborted (see the
	// illustrative sketch after this struct).
	//
	// In some cases a delayed write request arrives over the network after its transaction has already been
	// committed or rolled back, causing the transaction's metadata to be written back into the map. The
	// zombieTimeout setting handles this: any transaction that stays in the map longer than zombieTimeout is
	// cleaned up.
	transactions  sync.Map // string(txn.id) -> txnContext
	zombieTimeout time.Duration
	pool          sync.Pool
	recoveryC     chan struct{}
	txnC          chan txn.TxnMeta
}

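// A minimal sketch (not part of the original source) of how a blocked read can
// use the waiter/notifier mechanism described above: it registers a waiter on
// the writing transaction's context and blocks until that transaction reaches a
// final state. acquireWaiter and w.wait are hypothetical names used only for
// illustration; getTxnContext and addWaiter are defined in this file.
//
//	txnCtx := s.getTxnContext(writeTxnID)
//	if txnCtx != nil {
//		w := acquireWaiter() // hypothetical waiter constructor
//		if txnCtx.addWaiter(writeTxnID, w, txn.TxnStatus_Committed) {
//			// hypothetical blocking call; returns once the writing txn
//			// is committed or aborted, or the context is canceled.
//			status, _ := w.wait(ctx)
//			_ = status // Committed -> data visible, Aborted -> data ignored
//		}
//	}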

// NewTxnService creates a TxnService for the given DNShard.
func NewTxnService(
	rt runtime.Runtime,
	shard metadata.DNShard,
	storage storage.TxnStorage,
	sender rpc.TxnSender,
	zombieTimeout time.Duration) TxnService {
	logger := rt.Logger().With(util.TxnDNShardField(shard))
	s := &service{
		rt:      rt,
		logger:  logger,
		shard:   shard,
		sender:  sender,
		storage: storage,
		pool: sync.Pool{
			New: func() any {
				return &txnContext{
					logger: logger,
				}
			}},
		stopper: stopper.NewStopper(fmt.Sprintf("txn-service-%d-%d",
			shard.ShardID,
			shard.ReplicaID),
			stopper.WithLogger(logger.RawLogger())),
		zombieTimeout: zombieTimeout,
		recoveryC:     make(chan struct{}),
		txnC:          make(chan txn.TxnMeta, 16),
	}
	if err := s.stopper.RunTask(s.gcZombieTxn); err != nil {
		s.logger.Fatal("start gc zombie txn failed",
			zap.Error(err))
	}
	return s
}

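// A minimal usage sketch (an assumption, not taken from the original source);
// the runtime, shard, storage, and sender values are obtained elsewhere, and
// time.Minute stands in for a real zombie timeout.
//
//	svc := NewTxnService(rt, shard, txnStorage, txnSender, time.Minute)
//	if err := svc.Start(); err != nil {
//		// handle startup error
//	}
//	defer svc.Close(false)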

func (s *service) Shard() metadata.DNShard {
	return s.shard
}

func (s *service) Start() error {
	if err := s.storage.Start(); err != nil {
		return err
	}
	s.startRecovery()
	return nil
}

func (s *service) Close(destroy bool) error {
	s.waitRecoveryCompleted()
	s.stopper.Stop()
	closer := s.storage.Close
	if destroy {
		closer = s.storage.Destroy
	}
	// FIXME: all context.TODO() calls need to use a tracing context
	if err := closer(context.TODO()); err != nil {
		return multierr.Append(err, s.sender.Close())
	}
	return s.sender.Close()
}

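// gcZombieTxn periodically (every zombieTimeout) scans the transactions map and
// rolls back any transaction for which this DNShard is the coordinator and whose
// context has stayed in the map longer than zombieTimeout.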
func (s *service) gcZombieTxn(ctx context.Context) {
	s.logger.Info("gc zombie txn task started")
	defer s.logger.Info("gc zombie txn task stopped")

	timer := time.NewTicker(s.zombieTimeout)
	defer timer.Stop()

	var cleanTxns []txn.TxnMeta
	for {
		select {
		case <-ctx.Done():
			return
		case <-timer.C:
			s.transactions.Range(func(_, value any) bool {
				txnCtx := value.(*txnContext)
				txnMeta := txnCtx.getTxn()
				// Only the coordinator DNShard (the first entry in DNShards) cleans up a
				// zombie txn; if this shard is not the coordinator, skip the txn and let
				// the coordinator's DNShard handle it.
				if len(txnMeta.DNShards) == 0 ||
					s.shard.ShardID != txnMeta.DNShards[0].ShardID {
					return true
				}

				now := time.Now()
				if now.Sub(txnCtx.createAt) > s.zombieTimeout {
					cleanTxns = append(cleanTxns, txnMeta)
				}
				return true
			})
			for _, txnMeta := range cleanTxns {
				req := &txn.TxnRequest{
					Method:          txn.TxnMethod_Rollback,
					Txn:             txnMeta,
					RollbackRequest: &txn.TxnRollbackRequest{},
				}
				resp := &txn.TxnResponse{}
				if err := s.Rollback(ctx, req, resp); err != nil || resp.TxnError != nil {
					txnError := ""
					if resp.TxnError != nil {
						txnError = resp.TxnError.DebugString()
					}
					s.logger.Error("rollback zombie txn failed",
						util.TxnField(txnMeta),
						zap.String("txn-err", txnError),
						zap.Error(err))
				}
			}
			cleanTxns = cleanTxns[:0]
		}
	}
}

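// maybeAddTxn returns the txnContext for the given transaction, creating and
// registering a new context if one is not already present. The returned bool
// reports whether a new context was created by this call.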
func (s *service) maybeAddTxn(meta txn.TxnMeta) (*txnContext, bool) {
	id := hack.SliceToString(meta.ID)
	if v, ok := s.transactions.Load(id); ok {
		return v.(*txnContext), false
	}

	txnCtx := s.acquireTxnContext()
	v, loaded := s.transactions.LoadOrStore(id, txnCtx)
	if loaded {
		s.releaseTxnContext(txnCtx)
		return v.(*txnContext), false
	}

	// A new context is created in two cases:
	// 1. this is the transaction's first write request at the current DNShard;
	// 2. the transaction has already been committed or aborted, in which case the
	//    stale transaction context will be removed later by gcZombieTxn.
	txnCtx.init(meta, acquireNotifier())
	util.LogTxnCreateOn(s.logger, meta, s.shard)
	return txnCtx, true
}

func (s *service) removeTxn(txnID []byte) {
	s.transactions.Delete(hack.SliceToString(txnID))
}

func (s *service) getTxnContext(txnID []byte) *txnContext {
	id := hack.SliceToString(txnID)
	v, ok := s.transactions.Load(id)
	if !ok {
		return nil
	}
	return v.(*txnContext)
}

func (s *service) validDNShard(dn metadata.DNShard) bool {
	if !s.shard.Equal(dn) {
		// The DNShard does not match, so the CN needs to fetch the latest DNShards from HAKeeper.
		s.logger.Error("DN metadata not match",
			zap.String("request-dn", dn.DebugString()),
			zap.String("local-dn", s.shard.DebugString()))
		return false
	}
	return true
}

func (s *service) acquireTxnContext() *txnContext {
	return s.pool.Get().(*txnContext)
}

func (s *service) releaseTxnContext(txnCtx *txnContext) {
	txnCtx.resetLocked()
	s.pool.Put(txnCtx)
}

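// parallelSendWithRetry sends the given requests and retries until every
// response is free of transaction errors (errors whose codes appear in
// ignoreTxnErrorCodes are tolerated). It returns nil if ctx is canceled
// before a clean result is obtained.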
func (s *service) parallelSendWithRetry(
	ctx context.Context,
	txnMeta txn.TxnMeta,
	requests []txn.TxnRequest,
	ignoreTxnErrorCodes map[uint16]struct{}) *rpc.SendResult {
	for {
		select {
		case <-ctx.Done():
			return nil
		default:
			util.LogTxnSendRequests(s.logger, requests)
			result, err := s.sender.Send(ctx, requests)
			if err != nil {
				util.LogTxnSendRequestsFailed(s.logger, requests, err)
				continue
			}
			util.LogTxnReceivedResponses(s.logger, result.Responses)
			hasError := false
			for _, resp := range result.Responses {
				if resp.TxnError != nil {
					_, ok := ignoreTxnErrorCodes[uint16(resp.TxnError.Code)]
					if !ok {
						hasError = true
					}
				}
			}
			if !hasError {
				return result
			}
			result.Release()
		}
	}
}

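// txnContext is the per-transaction state kept in service.transactions: the
// transaction metadata, the write requests received so far, and a notifier used
// to wake readers that are waiting for this transaction to reach a final state.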
type txnContext struct {
	logger   *log.MOLogger
	nt       *notifier
	createAt time.Time

	mu struct {
		sync.RWMutex
		requests []txn.TxnRequest
		txn      txn.TxnMeta
	}
}

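// addWaiter registers w to be notified when this transaction reaches waitStatus.
// It returns false if the context no longer belongs to txnID (for example, it was
// reset and reused for another transaction), in which case the caller must not wait.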
func (c *txnContext) addWaiter(txnID []byte, w *waiter, waitStatus txn.TxnStatus) bool {
	c.mu.Lock()
	defer c.mu.Unlock()

	if !bytes.Equal(c.mu.txn.ID, txnID) {
		return false
	}

	util.LogTxnWaiterAdded(c.logger, c.mu.txn, waitStatus)
	c.nt.addWaiter(w, waitStatus)
	return true
}

func (c *txnContext) init(txn txn.TxnMeta, nt *notifier) {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.mu.txn = txn
	c.nt = nt
	c.createAt = time.Now()
}

func (c *txnContext) getTxn() txn.TxnMeta {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.getTxnLocked()
}

func (c *txnContext) updateTxn(txn txn.TxnMeta) {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.updateTxnLocked(txn)
}

func (c *txnContext) getTxnLocked() txn.TxnMeta {
	return c.mu.txn
}

func (c *txnContext) updateTxnLocked(txn txn.TxnMeta) {
	c.mu.txn = txn
	util.LogTxnUpdated(c.logger, c.mu.txn)
}

func (c *txnContext) resetLocked() {
	c.nt.close(c.mu.txn.Status)
	c.nt = nil
	c.mu.requests = c.mu.requests[:0]
	c.mu.txn = txn.TxnMeta{}
}

func (c *txnContext) changeStatusLocked(status txn.TxnStatus) {
	if c.mu.txn.Status != status {
		c.mu.txn.Status = status
		util.LogTxnUpdated(c.logger, c.mu.txn)
		c.nt.notify(status)
	}
}