github.com/matrixorigin/matrixone@v1.2.0/pkg/tnservice/store.go

     1  // Copyright 2021 - 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tnservice
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"sync"
    21  	"time"
    22  
    23  	"github.com/matrixorigin/matrixone/pkg/clusterservice"
    24  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    25  	"github.com/matrixorigin/matrixone/pkg/common/morpc"
    26  	"github.com/matrixorigin/matrixone/pkg/common/runtime"
    27  	"github.com/matrixorigin/matrixone/pkg/common/stopper"
    28  	"github.com/matrixorigin/matrixone/pkg/defines"
    29  	"github.com/matrixorigin/matrixone/pkg/fileservice"
    30  	"github.com/matrixorigin/matrixone/pkg/lockservice"
    31  	"github.com/matrixorigin/matrixone/pkg/logservice"
    32  	logservicepb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
    33  	"github.com/matrixorigin/matrixone/pkg/pb/metadata"
    34  	"github.com/matrixorigin/matrixone/pkg/pb/query"
    35  	"github.com/matrixorigin/matrixone/pkg/pb/txn"
    36  	"github.com/matrixorigin/matrixone/pkg/perfcounter"
    37  	"github.com/matrixorigin/matrixone/pkg/queryservice"
    38  	"github.com/matrixorigin/matrixone/pkg/taskservice"
    39  	"github.com/matrixorigin/matrixone/pkg/txn/rpc"
    40  	"github.com/matrixorigin/matrixone/pkg/txn/service"
    41  	"github.com/matrixorigin/matrixone/pkg/util"
    42  	"github.com/matrixorigin/matrixone/pkg/util/address"
    43  	"github.com/matrixorigin/matrixone/pkg/util/status"
    44  	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/blockio"
    45  	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/common"
    46  	"go.uber.org/zap"
    47  )
    48  
    49  var (
    50  	retryCreateStorageInterval = time.Second * 5
    51  )
    52  
     53  // WithConfigAdjust sets the config adjustment function.
    54  func WithConfigAdjust(adjustConfigFunc func(c *Config)) Option {
    55  	return func(s *store) {
    56  		s.options.adjustConfigFunc = adjustConfigFunc
    57  	}
    58  }
    59  
     60  // WithBackendFilter sets a filter for txn.TxnRequest messages sent to other TNShards.
    61  func WithBackendFilter(filter func(morpc.Message, string) bool) Option {
    62  	return func(s *store) {
    63  		s.options.backendFilter = filter
    64  	}
    65  }
    66  
     67  // WithHAKeeperClientFactory sets the HAKeeper client factory.
    68  func WithHAKeeperClientFactory(factory func() (logservice.TNHAKeeperClient, error)) Option {
    69  	return func(s *store) {
    70  		s.options.hakeekerClientFactory = factory
    71  	}
    72  }
    73  
     74  // WithLogServiceClientFactory sets the log service client factory.
    75  func WithLogServiceClientFactory(factory func(metadata.TNShard) (logservice.Client, error)) Option {
    76  	return func(s *store) {
    77  		s.options.logServiceClientFactory = factory
    78  	}
    79  }
    80  
     81  // WithTaskStorageFactory sets up a custom task storage factory.
    82  func WithTaskStorageFactory(factory taskservice.TaskStorageFactory) Option {
    83  	return func(s *store) {
    84  		s.task.storageFactory = factory
    85  	}
    86  }
    87  
    88  // WithConfigData saves the data from the config file
    89  func WithConfigData(data map[string]*logservicepb.ConfigItem) Option {
    90  	return func(s *store) {
    91  		if s.config == nil {
    92  			s.config = util.NewConfigData(data)
    93  		} else {
    94  			util.MergeConfig(s.config, data)
    95  		}
    96  	}
    97  }
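
         // Illustrative sketch of how the options above compose (cfg, rt, fs and
         // shutdownC are placeholders, not defined in this file):
         //
         //	svc, err := NewService(cfg, rt, fs, shutdownC,
         //		WithConfigAdjust(func(c *Config) {
         //			// tweak the validated config before the store's components are initialized
         //		}),
         //		WithBackendFilter(func(m morpc.Message, backendAddr string) bool {
         //			return true // forward every request
         //		}),
         //	)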
    98  
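         // store is the TN service implementation returned by NewService. replicas maps
         // a TN shard ID (uint64) to its *replica, mu guards the store metadata, and
         // task guards the task service holder and its creation state.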
    99  type store struct {
   100  	cfg                 *Config
   101  	rt                  runtime.Runtime
   102  	sender              rpc.TxnSender
   103  	server              rpc.TxnServer
   104  	hakeeperClient      logservice.TNHAKeeperClient
   105  	fileService         fileservice.FileService
   106  	metadataFileService fileservice.ReplaceableFileService
   107  	lockTableAllocator  lockservice.LockTableAllocator
   108  	moCluster           clusterservice.MOCluster
   109  	replicas            *sync.Map
   110  	stopper             *stopper.Stopper
   111  	shutdownC           chan struct{}
   112  
   113  	options struct {
   114  		logServiceClientFactory func(metadata.TNShard) (logservice.Client, error)
   115  		hakeekerClientFactory   func() (logservice.TNHAKeeperClient, error)
   116  		backendFilter           func(msg morpc.Message, backendAddr string) bool
   117  		adjustConfigFunc        func(c *Config)
   118  	}
   119  
   120  	mu struct {
   121  		sync.RWMutex
   122  		metadata metadata.TNStore
   123  	}
   124  
   125  	task struct {
   126  		sync.RWMutex
   127  		serviceCreated bool
   128  		serviceHolder  taskservice.TaskServiceHolder
   129  		storageFactory taskservice.TaskStorageFactory
   130  	}
   131  
   132  	addressMgr address.AddressManager
   133  
   134  	config *util.ConfigData
    135  	// queryService serves queries (cache info, latest lock binds) against this tnservice
   136  	queryService queryservice.QueryService
   137  }
   138  
    139  // NewService creates a TN service.
   140  func NewService(
   141  	cfg *Config,
   142  	rt runtime.Runtime,
   143  	fileService fileservice.FileService,
   144  	shutdownC chan struct{},
   145  	opts ...Option) (Service, error) {
   146  	if err := cfg.Validate(); err != nil {
   147  		return nil, err
   148  	}
   149  
   150  	configKVMap, _ := dumpTnConfig(*cfg)
   151  	opts = append(opts, WithConfigData(configKVMap))
   152  
    153  	// initialize the shared TAE memory pool
   154  	common.InitTAEMPool()
   155  
   156  	// get metadata fs
   157  	metadataFS, err := fileservice.Get[fileservice.ReplaceableFileService](fileService, defines.LocalFileServiceName)
   158  	if err != nil {
   159  		return nil, err
   160  	}
   161  
   162  	// start I/O pipeline
   163  	blockio.Start()
   164  
   165  	s := &store{
   166  		cfg:                 cfg,
   167  		rt:                  rt,
   168  		fileService:         fileService,
   169  		metadataFileService: metadataFS,
   170  		shutdownC:           shutdownC,
   171  		addressMgr:          address.NewAddressManager(cfg.ServiceHost, cfg.PortBase),
   172  	}
   173  	for _, opt := range opts {
   174  		opt(s)
   175  	}
   176  	s.registerServices()
   177  	s.replicas = &sync.Map{}
   178  	s.stopper = stopper.NewStopper("dn-store",
   179  		stopper.WithLogger(s.rt.Logger().RawLogger()))
   180  	s.mu.metadata = metadata.TNStore{UUID: cfg.UUID}
   181  	if s.options.adjustConfigFunc != nil {
   182  		s.options.adjustConfigFunc(s.cfg)
   183  	}
   184  
   185  	if err := s.initClocker(); err != nil {
   186  		return nil, err
   187  	}
   188  	if err := s.initHAKeeperClient(); err != nil {
   189  		return nil, err
   190  	}
   191  	if err := s.initLockTableAllocator(); err != nil {
   192  		return nil, err
   193  	}
   194  	if err := s.initTxnSender(); err != nil {
   195  		return nil, err
   196  	}
   197  	if err := s.initTxnServer(); err != nil {
   198  		return nil, err
   199  	}
   200  	if err := s.initMetadata(); err != nil {
   201  		return nil, err
   202  	}
   203  
   204  	s.initQueryService(cfg.InStandalone)
   205  
   206  	s.initTaskHolder()
   207  	s.initSqlWriterFactory()
   208  	s.setupStatusServer()
   209  
   210  	return s, nil
   211  }
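
         // A minimal usage sketch, assuming the returned Service exposes the Start and
         // Close methods implemented below:
         //
         //	svc, err := NewService(cfg, rt, fs, shutdownC)
         //	if err != nil {
         //		return err
         //	}
         //	if err := svc.Start(); err != nil {
         //		return err
         //	}
         //	defer svc.Close()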
   212  
   213  func (s *store) Start() error {
   214  	if err := s.startTNShards(); err != nil {
   215  		return err
   216  	}
   217  	if err := s.server.Start(); err != nil {
   218  		return err
   219  	}
   220  	if s.queryService != nil {
   221  		if err := s.queryService.Start(); err != nil {
   222  			return err
   223  		}
   224  	}
   225  	s.rt.SubLogger(runtime.SystemInit).Info("dn heartbeat task started")
   226  	return s.stopper.RunTask(s.heartbeatTask)
   227  }
   228  
   229  func (s *store) Close() error {
   230  	s.stopper.Stop()
   231  	s.moCluster.Close()
   232  	err := errors.Join(
   233  		s.hakeeperClient.Close(),
   234  		s.sender.Close(),
   235  		s.server.Close(),
   236  		s.lockTableAllocator.Close(),
   237  	)
   238  	s.replicas.Range(func(_, value any) bool {
   239  		r := value.(*replica)
   240  		if e := r.close(false); e != nil {
   241  			err = errors.Join(e, err)
   242  		}
   243  		return true
   244  	})
   245  	s.task.RLock()
   246  	ts := s.task.serviceHolder
   247  	s.task.RUnlock()
   248  	if ts != nil {
   249  		err = errors.Join(err, ts.Close())
   250  	}
   251  	// stop I/O pipeline
   252  	blockio.Stop()
   253  	return err
   254  }
   255  
   256  func (s *store) StartTNReplica(shard metadata.TNShard) error {
   257  	return s.createReplica(shard)
   258  }
   259  
   260  func (s *store) CloseTNReplica(shard metadata.TNShard) error {
   261  	return s.removeReplica(shard.ShardID)
   262  }
   263  
   264  func (s *store) startTNShards() error {
   265  	s.mu.Lock()
   266  	defer s.mu.Unlock()
   267  
   268  	for _, shard := range s.mu.metadata.Shards {
   269  		if err := s.createReplica(shard); err != nil {
   270  			return err
   271  		}
   272  	}
   273  	return nil
   274  }
   275  
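         // getTNShardInfo snapshots the replicas currently hosted by this store;
         // presumably this is what the heartbeat task (defined elsewhere in this
         // package) reports to the HAKeeper.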
   276  func (s *store) getTNShardInfo() []logservicepb.TNShardInfo {
   277  	var shards []logservicepb.TNShardInfo
   278  	s.replicas.Range(func(_, value any) bool {
   279  		r := value.(*replica)
   280  		shards = append(shards, logservicepb.TNShardInfo{
   281  			ShardID:   r.shard.ShardID,
   282  			ReplicaID: r.shard.ReplicaID,
   283  		})
   284  		return true
   285  	})
   286  	return shards
   287  }
   288  
   289  func (s *store) createReplica(shard metadata.TNShard) error {
   290  	r := newReplica(shard, s.rt)
   291  	v, ok := s.replicas.LoadOrStore(shard.ShardID, r)
   292  	if ok {
   293  		s.rt.Logger().Debug("DNShard already created",
   294  			zap.String("new", shard.DebugString()),
   295  			zap.String("exist", v.(*replica).shard.DebugString()))
   296  		return nil
   297  	}
   298  
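         	// Storage creation runs asynchronously on the stopper: it is retried every
         	// retryCreateStorageInterval until it succeeds or the stopper's context is
         	// canceled, while a failure to start the txn service itself is fatal.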
   299  	err := s.stopper.RunTask(func(ctx context.Context) {
   300  		for {
   301  			select {
   302  			case <-ctx.Done():
   303  				return
   304  			default:
   305  				storage, err := s.createTxnStorage(ctx, shard)
   306  				if err != nil {
   307  					r.logger.Error("start DNShard failed",
   308  						zap.Error(err))
   309  					time.Sleep(retryCreateStorageInterval)
   310  					continue
   311  				}
   312  
   313  				err = r.start(service.NewTxnService(shard, storage, s.sender, s.cfg.Txn.ZombieTimeout.Duration, s.lockTableAllocator))
   314  				if err != nil {
   315  					r.logger.Fatal("start DNShard failed",
   316  						zap.Error(err))
   317  				}
   318  				return
   319  			}
   320  		}
   321  	})
   322  	if err != nil {
   323  		return err
   324  	}
   325  
   326  	s.addTNShardLocked(shard)
   327  	return nil
   328  }
   329  
   330  func (s *store) removeReplica(tnShardID uint64) error {
   331  	if r := s.getReplica(tnShardID); r != nil {
   332  		err := r.close(true)
   333  		s.replicas.Delete(tnShardID)
   334  		s.removeTNShard(tnShardID)
   335  		return err
   336  	}
   337  	return nil
   338  }
   339  
   340  func (s *store) getReplica(id uint64) *replica {
   341  	v, ok := s.replicas.Load(id)
   342  	if !ok {
   343  		return nil
   344  	}
   345  	return v.(*replica)
   346  }
   347  
   348  func (s *store) initTxnSender() error {
   349  	s.cfg.RPC.BackendOptions = append(s.cfg.RPC.BackendOptions,
   350  		morpc.WithBackendFilter(func(m morpc.Message, backendAddr string) bool {
   351  			return s.options.backendFilter == nil || s.options.backendFilter(m.(*txn.TxnRequest), backendAddr)
   352  		}))
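         	// The wrapper above asserts every outgoing message to *txn.TxnRequest before
         	// handing it to the user-supplied filter. WithSenderLocalDispatch presumably
         	// routes requests that target shards hosted by this store to
         	// dispatchLocalRequest rather than a network backend.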
   353  	sender, err := rpc.NewSender(
   354  		s.cfg.RPC,
   355  		s.rt,
   356  		rpc.WithSenderLocalDispatch(s.dispatchLocalRequest))
   357  	if err != nil {
   358  		return err
   359  	}
   360  	s.sender = sender
   361  	return nil
   362  }
   363  
   364  func (s *store) initTxnServer() error {
   365  	server, err := rpc.NewTxnServer(
   366  		s.txnServiceListenAddr(),
   367  		s.rt,
   368  		rpc.WithServerQueueBufferSize(s.cfg.RPC.ServerBufferQueueSize),
   369  		rpc.WithServerQueueWorkers(s.cfg.RPC.ServerWorkers),
   370  		rpc.WithServerMaxMessageSize(int(s.cfg.RPC.MaxMessageSize)),
   371  		rpc.WithServerEnableCompress(s.cfg.RPC.EnableCompress))
   372  	if err != nil {
   373  		return err
   374  	}
   375  	s.server = server
   376  	s.registerRPCHandlers()
   377  	return nil
   378  }
   379  
   380  func (s *store) initClocker() error {
   381  	if s.rt.Clock() == nil {
   382  		return moerr.NewBadConfigNoCtx("missing txn clock")
   383  	}
   384  	return nil
   385  }
   386  
   387  func (s *store) initLockTableAllocator() error {
   388  	s.lockTableAllocator = lockservice.NewLockTableAllocator(
   389  		s.lockServiceListenAddr(),
   390  		s.cfg.LockService.KeepBindTimeout.Duration,
   391  		s.cfg.RPC)
   392  	return nil
   393  }
   394  
   395  func (s *store) initHAKeeperClient() error {
   396  	if s.options.hakeekerClientFactory != nil {
   397  		client, err := s.options.hakeekerClientFactory()
   398  		if err != nil {
   399  			return err
   400  		}
   401  		s.hakeeperClient = client
   402  		s.initClusterService()
   403  		return nil
   404  	}
   405  
   406  	ctx, cancel := context.WithTimeout(context.Background(), s.cfg.HAKeeper.DiscoveryTimeout.Duration)
   407  	defer cancel()
   408  	client, err := logservice.NewTNHAKeeperClient(ctx, s.cfg.HAKeeper.ClientConfig)
   409  	if err != nil {
   410  		return err
   411  	}
   412  	s.hakeeperClient = client
   413  	s.initClusterService()
   414  	return nil
   415  }
   416  
   417  func (s *store) initClusterService() {
   418  	s.moCluster = clusterservice.NewMOCluster(s.hakeeperClient,
   419  		s.cfg.Cluster.RefreshInterval.Duration)
   420  	runtime.ProcessLevelRuntime().SetGlobalVariables(runtime.ClusterService, s.moCluster)
   421  }
   422  
    423  // initQueryService initializes the query service.
    424  // inStandalone:
    425  //
    426  //	true: the TN is booted in a standalone cluster; the CN already has a queryservice.
    427  //	false: the TN is booted as an independent process; the TN needs its own queryservice.
   428  func (s *store) initQueryService(inStandalone bool) {
   429  	if inStandalone {
   430  		s.queryService = nil
   431  		return
   432  	}
   433  	var err error
   434  	s.queryService, err = queryservice.NewQueryService(s.cfg.UUID,
   435  		s.queryServiceListenAddr(), s.cfg.RPC)
   436  	if err != nil {
   437  		panic(err)
   438  	}
   439  	s.initQueryCommandHandler()
   440  }
   441  
   442  func (s *store) initQueryCommandHandler() {
   443  	s.queryService.AddHandleFunc(query.CmdMethod_GetCacheInfo, s.handleGetCacheInfo, false)
   444  	s.queryService.AddHandleFunc(query.CmdMethod_GetLatestBind, s.handleGetLatestBind, false)
   445  }
   446  
   447  func (s *store) handleGetCacheInfo(ctx context.Context, req *query.Request, resp *query.Response) error {
   448  	resp.GetCacheInfoResponse = new(query.GetCacheInfoResponse)
   449  	perfcounter.GetCacheStats(func(infos []*query.CacheInfo) {
   450  		for _, info := range infos {
   451  			if info != nil {
   452  				resp.GetCacheInfoResponse.CacheInfoList = append(resp.GetCacheInfoResponse.CacheInfoList, info)
   453  			}
   454  		}
   455  	})
   456  
   457  	return nil
   458  }
   459  
   460  func (s *store) handleGetLatestBind(ctx context.Context, req *query.Request, resp *query.Response) error {
   461  	resp.GetLatestBind = &query.GetLatestBindResponse{
   462  		Bind: s.lockTableAllocator.GetLatest(
   463  			req.GetLatestBind.GroupID,
   464  			req.GetLatestBind.TableID).
   465  			DebugString(),
   466  	}
   467  	return nil
   468  }
   469  
   470  func (s *store) setupStatusServer() {
   471  	ss, ok := runtime.ProcessLevelRuntime().GetGlobalVariables(runtime.StatusServer)
   472  	if ok {
   473  		ss.(*status.Server).SetHAKeeperClient(s.hakeeperClient)
   474  	}
    475  }