github.com/matrixorigin/matrixone@v1.2.0/pkg/tnservice/store.go (about) 1 // Copyright 2021 - 2022 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tnservice 16 17 import ( 18 "context" 19 "errors" 20 "sync" 21 "time" 22 23 "github.com/matrixorigin/matrixone/pkg/clusterservice" 24 "github.com/matrixorigin/matrixone/pkg/common/moerr" 25 "github.com/matrixorigin/matrixone/pkg/common/morpc" 26 "github.com/matrixorigin/matrixone/pkg/common/runtime" 27 "github.com/matrixorigin/matrixone/pkg/common/stopper" 28 "github.com/matrixorigin/matrixone/pkg/defines" 29 "github.com/matrixorigin/matrixone/pkg/fileservice" 30 "github.com/matrixorigin/matrixone/pkg/lockservice" 31 "github.com/matrixorigin/matrixone/pkg/logservice" 32 logservicepb "github.com/matrixorigin/matrixone/pkg/pb/logservice" 33 "github.com/matrixorigin/matrixone/pkg/pb/metadata" 34 "github.com/matrixorigin/matrixone/pkg/pb/query" 35 "github.com/matrixorigin/matrixone/pkg/pb/txn" 36 "github.com/matrixorigin/matrixone/pkg/perfcounter" 37 "github.com/matrixorigin/matrixone/pkg/queryservice" 38 "github.com/matrixorigin/matrixone/pkg/taskservice" 39 "github.com/matrixorigin/matrixone/pkg/txn/rpc" 40 "github.com/matrixorigin/matrixone/pkg/txn/service" 41 "github.com/matrixorigin/matrixone/pkg/util" 42 "github.com/matrixorigin/matrixone/pkg/util/address" 43 "github.com/matrixorigin/matrixone/pkg/util/status" 44 "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/blockio" 45 "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/common" 46 "go.uber.org/zap" 47 ) 48 49 var ( 50 retryCreateStorageInterval = time.Second * 5 51 ) 52 53 // WithConfigAdjust set adjust config func 54 func WithConfigAdjust(adjustConfigFunc func(c *Config)) Option { 55 return func(s *store) { 56 s.options.adjustConfigFunc = adjustConfigFunc 57 } 58 } 59 60 // WithBackendFilter set filtering txn.TxnRequest sent to other DNShard 61 func WithBackendFilter(filter func(morpc.Message, string) bool) Option { 62 return func(s *store) { 63 s.options.backendFilter = filter 64 } 65 } 66 67 // WithHAKeeperClientFactory set hakeeper client factory 68 func WithHAKeeperClientFactory(factory func() (logservice.TNHAKeeperClient, error)) Option { 69 return func(s *store) { 70 s.options.hakeekerClientFactory = factory 71 } 72 } 73 74 // WithLogServiceClientFactory set log service client factory 75 func WithLogServiceClientFactory(factory func(metadata.TNShard) (logservice.Client, error)) Option { 76 return func(s *store) { 77 s.options.logServiceClientFactory = factory 78 } 79 } 80 81 // WithTaskStorageFactory setup the special task strorage factory 82 func WithTaskStorageFactory(factory taskservice.TaskStorageFactory) Option { 83 return func(s *store) { 84 s.task.storageFactory = factory 85 } 86 } 87 88 // WithConfigData saves the data from the config file 89 func WithConfigData(data map[string]*logservicepb.ConfigItem) Option { 90 return func(s *store) { 91 if s.config == nil { 92 s.config = util.NewConfigData(data) 93 } else { 94 util.MergeConfig(s.config, data) 95 } 96 } 97 } 98 99 type store struct { 100 cfg *Config 101 rt runtime.Runtime 102 sender rpc.TxnSender 103 server rpc.TxnServer 104 hakeeperClient logservice.TNHAKeeperClient 105 fileService fileservice.FileService 106 metadataFileService fileservice.ReplaceableFileService 107 lockTableAllocator lockservice.LockTableAllocator 108 moCluster clusterservice.MOCluster 109 replicas *sync.Map 110 stopper *stopper.Stopper 111 shutdownC chan struct{} 112 113 options struct { 114 logServiceClientFactory func(metadata.TNShard) (logservice.Client, error) 115 hakeekerClientFactory func() (logservice.TNHAKeeperClient, error) 116 backendFilter func(msg morpc.Message, backendAddr string) bool 117 adjustConfigFunc func(c *Config) 118 } 119 120 mu struct { 121 sync.RWMutex 122 metadata metadata.TNStore 123 } 124 125 task struct { 126 sync.RWMutex 127 serviceCreated bool 128 serviceHolder taskservice.TaskServiceHolder 129 storageFactory taskservice.TaskStorageFactory 130 } 131 132 addressMgr address.AddressManager 133 134 config *util.ConfigData 135 // queryService for getting cache info from tnservice 136 queryService queryservice.QueryService 137 } 138 139 // NewService create TN Service 140 func NewService( 141 cfg *Config, 142 rt runtime.Runtime, 143 fileService fileservice.FileService, 144 shutdownC chan struct{}, 145 opts ...Option) (Service, error) { 146 if err := cfg.Validate(); err != nil { 147 return nil, err 148 } 149 150 configKVMap, _ := dumpTnConfig(*cfg) 151 opts = append(opts, WithConfigData(configKVMap)) 152 153 // start common stuff 154 common.InitTAEMPool() 155 156 // get metadata fs 157 metadataFS, err := fileservice.Get[fileservice.ReplaceableFileService](fileService, defines.LocalFileServiceName) 158 if err != nil { 159 return nil, err 160 } 161 162 // start I/O pipeline 163 blockio.Start() 164 165 s := &store{ 166 cfg: cfg, 167 rt: rt, 168 fileService: fileService, 169 metadataFileService: metadataFS, 170 shutdownC: shutdownC, 171 addressMgr: address.NewAddressManager(cfg.ServiceHost, cfg.PortBase), 172 } 173 for _, opt := range opts { 174 opt(s) 175 } 176 s.registerServices() 177 s.replicas = &sync.Map{} 178 s.stopper = stopper.NewStopper("dn-store", 179 stopper.WithLogger(s.rt.Logger().RawLogger())) 180 s.mu.metadata = metadata.TNStore{UUID: cfg.UUID} 181 if s.options.adjustConfigFunc != nil { 182 s.options.adjustConfigFunc(s.cfg) 183 } 184 185 if err := s.initClocker(); err != nil { 186 return nil, err 187 } 188 if err := s.initHAKeeperClient(); err != nil { 189 return nil, err 190 } 191 if err := s.initLockTableAllocator(); err != nil { 192 return nil, err 193 } 194 if err := s.initTxnSender(); err != nil { 195 return nil, err 196 } 197 if err := s.initTxnServer(); err != nil { 198 return nil, err 199 } 200 if err := s.initMetadata(); err != nil { 201 return nil, err 202 } 203 204 s.initQueryService(cfg.InStandalone) 205 206 s.initTaskHolder() 207 s.initSqlWriterFactory() 208 s.setupStatusServer() 209 210 return s, nil 211 } 212 213 func (s *store) Start() error { 214 if err := s.startTNShards(); err != nil { 215 return err 216 } 217 if err := s.server.Start(); err != nil { 218 return err 219 } 220 if s.queryService != nil { 221 if err := s.queryService.Start(); err != nil { 222 return err 223 } 224 } 225 s.rt.SubLogger(runtime.SystemInit).Info("dn heartbeat task started") 226 return s.stopper.RunTask(s.heartbeatTask) 227 } 228 229 func (s *store) Close() error { 230 s.stopper.Stop() 231 s.moCluster.Close() 232 err := errors.Join( 233 s.hakeeperClient.Close(), 234 s.sender.Close(), 235 s.server.Close(), 236 s.lockTableAllocator.Close(), 237 ) 238 s.replicas.Range(func(_, value any) bool { 239 r := value.(*replica) 240 if e := r.close(false); e != nil { 241 err = errors.Join(e, err) 242 } 243 return true 244 }) 245 s.task.RLock() 246 ts := s.task.serviceHolder 247 s.task.RUnlock() 248 if ts != nil { 249 err = errors.Join(err, ts.Close()) 250 } 251 // stop I/O pipeline 252 blockio.Stop() 253 return err 254 } 255 256 func (s *store) StartTNReplica(shard metadata.TNShard) error { 257 return s.createReplica(shard) 258 } 259 260 func (s *store) CloseTNReplica(shard metadata.TNShard) error { 261 return s.removeReplica(shard.ShardID) 262 } 263 264 func (s *store) startTNShards() error { 265 s.mu.Lock() 266 defer s.mu.Unlock() 267 268 for _, shard := range s.mu.metadata.Shards { 269 if err := s.createReplica(shard); err != nil { 270 return err 271 } 272 } 273 return nil 274 } 275 276 func (s *store) getTNShardInfo() []logservicepb.TNShardInfo { 277 var shards []logservicepb.TNShardInfo 278 s.replicas.Range(func(_, value any) bool { 279 r := value.(*replica) 280 shards = append(shards, logservicepb.TNShardInfo{ 281 ShardID: r.shard.ShardID, 282 ReplicaID: r.shard.ReplicaID, 283 }) 284 return true 285 }) 286 return shards 287 } 288 289 func (s *store) createReplica(shard metadata.TNShard) error { 290 r := newReplica(shard, s.rt) 291 v, ok := s.replicas.LoadOrStore(shard.ShardID, r) 292 if ok { 293 s.rt.Logger().Debug("DNShard already created", 294 zap.String("new", shard.DebugString()), 295 zap.String("exist", v.(*replica).shard.DebugString())) 296 return nil 297 } 298 299 err := s.stopper.RunTask(func(ctx context.Context) { 300 for { 301 select { 302 case <-ctx.Done(): 303 return 304 default: 305 storage, err := s.createTxnStorage(ctx, shard) 306 if err != nil { 307 r.logger.Error("start DNShard failed", 308 zap.Error(err)) 309 time.Sleep(retryCreateStorageInterval) 310 continue 311 } 312 313 err = r.start(service.NewTxnService(shard, storage, s.sender, s.cfg.Txn.ZombieTimeout.Duration, s.lockTableAllocator)) 314 if err != nil { 315 r.logger.Fatal("start DNShard failed", 316 zap.Error(err)) 317 } 318 return 319 } 320 } 321 }) 322 if err != nil { 323 return err 324 } 325 326 s.addTNShardLocked(shard) 327 return nil 328 } 329 330 func (s *store) removeReplica(tnShardID uint64) error { 331 if r := s.getReplica(tnShardID); r != nil { 332 err := r.close(true) 333 s.replicas.Delete(tnShardID) 334 s.removeTNShard(tnShardID) 335 return err 336 } 337 return nil 338 } 339 340 func (s *store) getReplica(id uint64) *replica { 341 v, ok := s.replicas.Load(id) 342 if !ok { 343 return nil 344 } 345 return v.(*replica) 346 } 347 348 func (s *store) initTxnSender() error { 349 s.cfg.RPC.BackendOptions = append(s.cfg.RPC.BackendOptions, 350 morpc.WithBackendFilter(func(m morpc.Message, backendAddr string) bool { 351 return s.options.backendFilter == nil || s.options.backendFilter(m.(*txn.TxnRequest), backendAddr) 352 })) 353 sender, err := rpc.NewSender( 354 s.cfg.RPC, 355 s.rt, 356 rpc.WithSenderLocalDispatch(s.dispatchLocalRequest)) 357 if err != nil { 358 return err 359 } 360 s.sender = sender 361 return nil 362 } 363 364 func (s *store) initTxnServer() error { 365 server, err := rpc.NewTxnServer( 366 s.txnServiceListenAddr(), 367 s.rt, 368 rpc.WithServerQueueBufferSize(s.cfg.RPC.ServerBufferQueueSize), 369 rpc.WithServerQueueWorkers(s.cfg.RPC.ServerWorkers), 370 rpc.WithServerMaxMessageSize(int(s.cfg.RPC.MaxMessageSize)), 371 rpc.WithServerEnableCompress(s.cfg.RPC.EnableCompress)) 372 if err != nil { 373 return err 374 } 375 s.server = server 376 s.registerRPCHandlers() 377 return nil 378 } 379 380 func (s *store) initClocker() error { 381 if s.rt.Clock() == nil { 382 return moerr.NewBadConfigNoCtx("missing txn clock") 383 } 384 return nil 385 } 386 387 func (s *store) initLockTableAllocator() error { 388 s.lockTableAllocator = lockservice.NewLockTableAllocator( 389 s.lockServiceListenAddr(), 390 s.cfg.LockService.KeepBindTimeout.Duration, 391 s.cfg.RPC) 392 return nil 393 } 394 395 func (s *store) initHAKeeperClient() error { 396 if s.options.hakeekerClientFactory != nil { 397 client, err := s.options.hakeekerClientFactory() 398 if err != nil { 399 return err 400 } 401 s.hakeeperClient = client 402 s.initClusterService() 403 return nil 404 } 405 406 ctx, cancel := context.WithTimeout(context.Background(), s.cfg.HAKeeper.DiscoveryTimeout.Duration) 407 defer cancel() 408 client, err := logservice.NewTNHAKeeperClient(ctx, s.cfg.HAKeeper.ClientConfig) 409 if err != nil { 410 return err 411 } 412 s.hakeeperClient = client 413 s.initClusterService() 414 return nil 415 } 416 417 func (s *store) initClusterService() { 418 s.moCluster = clusterservice.NewMOCluster(s.hakeeperClient, 419 s.cfg.Cluster.RefreshInterval.Duration) 420 runtime.ProcessLevelRuntime().SetGlobalVariables(runtime.ClusterService, s.moCluster) 421 } 422 423 // initQueryService 424 // inStandalone: 425 // 426 // true: tn is boosted in a standalone cluster. cn has a queryservice already. 427 // false: tn is boosted in an independent process. tn needs a queryservice. 428 func (s *store) initQueryService(inStandalone bool) { 429 if inStandalone { 430 s.queryService = nil 431 return 432 } 433 var err error 434 s.queryService, err = queryservice.NewQueryService(s.cfg.UUID, 435 s.queryServiceListenAddr(), s.cfg.RPC) 436 if err != nil { 437 panic(err) 438 } 439 s.initQueryCommandHandler() 440 } 441 442 func (s *store) initQueryCommandHandler() { 443 s.queryService.AddHandleFunc(query.CmdMethod_GetCacheInfo, s.handleGetCacheInfo, false) 444 s.queryService.AddHandleFunc(query.CmdMethod_GetLatestBind, s.handleGetLatestBind, false) 445 } 446 447 func (s *store) handleGetCacheInfo(ctx context.Context, req *query.Request, resp *query.Response) error { 448 resp.GetCacheInfoResponse = new(query.GetCacheInfoResponse) 449 perfcounter.GetCacheStats(func(infos []*query.CacheInfo) { 450 for _, info := range infos { 451 if info != nil { 452 resp.GetCacheInfoResponse.CacheInfoList = append(resp.GetCacheInfoResponse.CacheInfoList, info) 453 } 454 } 455 }) 456 457 return nil 458 } 459 460 func (s *store) handleGetLatestBind(ctx context.Context, req *query.Request, resp *query.Response) error { 461 resp.GetLatestBind = &query.GetLatestBindResponse{ 462 Bind: s.lockTableAllocator.GetLatest( 463 req.GetLatestBind.GroupID, 464 req.GetLatestBind.TableID). 465 DebugString(), 466 } 467 return nil 468 } 469 470 func (s *store) setupStatusServer() { 471 ss, ok := runtime.ProcessLevelRuntime().GetGlobalVariables(runtime.StatusServer) 472 if ok { 473 ss.(*status.Server).SetHAKeeperClient(s.hakeeperClient) 474 } 475 476 }