github.com/matrixorigin/matrixone@v1.2.0/pkg/logservice/store_hakeeper_check.go (about)

     1  // Copyright 2021 - 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package logservice
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"os"
    21  	"sync/atomic"
    22  	"time"
    23  
    24  	"go.uber.org/zap"
    25  
    26  	"github.com/google/uuid"
    27  	"github.com/matrixorigin/matrixone/pkg/hakeeper"
    28  	"github.com/matrixorigin/matrixone/pkg/hakeeper/bootstrap"
    29  	pb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
    30  )
    31  
    32  const (
    33  	minIDAllocCapacity   uint64 = 1024
    34  	defaultIDBatchSize   uint64 = 1024 * 10
    35  	checkBootstrapCycles        = 100
    36  )
    37  
    38  var (
    39  	hakeeperDefaultTimeout = 2 * time.Second
    40  )
    41  
    42  type idAllocator struct {
    43  	// [nextID, lastID] is the range of IDs that can be assigned.
    44  	// the next ID to be assigned is nextID
    45  	nextID uint64
    46  	lastID uint64
    47  }
    48  
    49  var _ hakeeper.IDAllocator = (*idAllocator)(nil)
    50  
    51  func newIDAllocator() hakeeper.IDAllocator {
    52  	return &idAllocator{nextID: 1, lastID: 0}
    53  }
    54  
    55  func (a *idAllocator) Next() (uint64, bool) {
    56  	if a.nextID <= a.lastID {
    57  		v := a.nextID
    58  		a.nextID++
    59  		return v, true
    60  	}
    61  	return 0, false
    62  }
    63  
    64  func (a *idAllocator) Set(next uint64, last uint64) {
    65  	// make sure that this id allocator never emit any id smaller than
    66  	// K8SIDRangeEnd
    67  	if next < hakeeper.K8SIDRangeEnd {
    68  		panic("invalid id allocator range")
    69  	}
    70  	a.nextID = next
    71  	a.lastID = last
    72  }
    73  
    74  func (a *idAllocator) Capacity() uint64 {
    75  	if a.nextID <= a.lastID {
    76  		return (a.lastID - a.nextID) + 1
    77  	}
    78  	return 0
    79  }
    80  
    81  func (l *store) setInitialClusterInfo(numOfLogShards uint64,
    82  	numOfTNShards uint64, numOfLogReplicas uint64, nextID uint64, nextIDByKey map[string]uint64) error {
    83  	cmd := hakeeper.GetInitialClusterRequestCmd(numOfLogShards,
    84  		numOfTNShards, numOfLogReplicas, nextID, nextIDByKey)
    85  	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
    86  	defer cancel()
    87  	session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID)
    88  	result, err := l.propose(ctx, session, cmd)
    89  	if err != nil {
    90  		l.runtime.Logger().Error("failed to propose initial cluster info", zap.Error(err))
    91  		return err
    92  	}
    93  	if result.Value == uint64(pb.HAKeeperBootstrapFailed) {
    94  		panic("bootstrap failed")
    95  	}
    96  	if result.Value != uint64(pb.HAKeeperCreated) {
    97  		l.runtime.Logger().Error("initial cluster info already set")
    98  	}
    99  	return nil
   100  }
   101  
   102  func (l *store) updateIDAlloc(count uint64) error {
   103  	cmd := hakeeper.GetAllocateIDCmd(pb.CNAllocateID{Batch: count})
   104  	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
   105  	defer cancel()
   106  	session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID)
   107  	result, err := l.propose(ctx, session, cmd)
   108  	if err != nil {
   109  		l.runtime.Logger().Error("propose get id failed", zap.Error(err))
   110  		return err
   111  	}
   112  	// TODO: add a test for this
   113  	l.alloc.Set(result.Value, result.Value+count-1)
   114  	return nil
   115  }
   116  
   117  func (l *store) getCheckerStateFromLeader() (*pb.CheckerState, uint64) {
   118  	isLeader, term, err := l.isLeaderHAKeeper()
   119  	if err != nil {
   120  		l.runtime.Logger().Error("failed to get HAKeeper Leader ID", zap.Error(err))
   121  		return nil, term
   122  	}
   123  
   124  	if !isLeader {
   125  		l.taskScheduler.StopScheduleCronTask()
   126  		return nil, term
   127  	}
   128  	state, err := l.getCheckerState()
   129  	if err != nil {
   130  		// TODO: check whether this is temp error
   131  		l.runtime.Logger().Error("failed to get checker state", zap.Error(err))
   132  		return nil, term
   133  	}
   134  
   135  	return state, term
   136  }
   137  
   138  var debugPrintHAKeeperState atomic.Bool
   139  
   140  func (l *store) hakeeperCheck() {
   141  	state, term := l.getCheckerStateFromLeader()
   142  	if state == nil {
   143  		return
   144  	}
   145  
   146  	switch state.State {
   147  	case pb.HAKeeperCreated:
   148  		l.runtime.Logger().Warn("waiting for initial cluster info to be set, check skipped")
   149  		return
   150  	case pb.HAKeeperBootstrapping:
   151  		l.bootstrap(term, state)
   152  	case pb.HAKeeperBootstrapCommandsReceived:
   153  		l.checkBootstrap(state)
   154  	case pb.HAKeeperBootstrapFailed:
   155  		l.handleBootstrapFailure()
   156  	case pb.HAKeeperRunning:
   157  		if debugPrintHAKeeperState.CompareAndSwap(false, true) {
   158  			l.runtime.Logger().Info("HAKeeper is running",
   159  				zap.Uint64("next id", state.NextId))
   160  		}
   161  		l.healthCheck(term, state)
   162  	default:
   163  		panic("unknown HAKeeper state")
   164  	}
   165  }
   166  
   167  func (l *store) assertHAKeeperState(s pb.HAKeeperState) {
   168  	state, err := l.getCheckerState()
   169  	if err != nil {
   170  		// TODO: check whether this is temp error
   171  		l.runtime.Logger().Error("failed to get checker state", zap.Error(err))
   172  		return
   173  	}
   174  	if state.State != s {
   175  		l.runtime.Logger().Panic("unexpected state",
   176  			zap.String("expected", s.String()),
   177  			zap.String("got", state.State.String()))
   178  	}
   179  }
   180  
   181  func (l *store) handleBootstrapFailure() {
   182  	panic("failed to bootstrap the cluster")
   183  }
   184  
   185  func (l *store) healthCheck(term uint64, state *pb.CheckerState) {
   186  	l.assertHAKeeperState(pb.HAKeeperRunning)
   187  	defer l.assertHAKeeperState(pb.HAKeeperRunning)
   188  	cmds, err := l.getScheduleCommand(true, term, state)
   189  	if err != nil {
   190  		l.runtime.Logger().Error("failed to get check schedule commands", zap.Error(err))
   191  		return
   192  	}
   193  	l.runtime.Logger().Debug(fmt.Sprintf("cluster health check generated %d schedule commands", len(cmds)))
   194  	if len(cmds) > 0 {
   195  		ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
   196  		defer cancel()
   197  		for _, cmd := range cmds {
   198  			l.runtime.Logger().Debug("adding schedule command to hakeeper", zap.String("command", cmd.LogString()))
   199  		}
   200  		if err := l.addScheduleCommands(ctx, term, cmds); err != nil {
   201  			// TODO: check whether this is temp error
   202  			l.runtime.Logger().Debug("failed to add schedule commands", zap.Error(err))
   203  			return
   204  		}
   205  	}
   206  }
   207  
   208  func (l *store) taskSchedule(state *pb.CheckerState) {
   209  	l.assertHAKeeperState(pb.HAKeeperRunning)
   210  	defer l.assertHAKeeperState(pb.HAKeeperRunning)
   211  
   212  	switch state.TaskSchedulerState {
   213  	case pb.TaskSchedulerCreated:
   214  		l.registerTaskUser()
   215  	case pb.TaskSchedulerRunning:
   216  		l.taskScheduler.StartScheduleCronTask()
   217  		l.taskScheduler.Schedule(state.CNState, state.Tick)
   218  	case pb.TaskSchedulerStopped:
   219  	default:
   220  		panic("unknown TaskScheduler state")
   221  	}
   222  }
   223  
   224  func (l *store) registerTaskUser() {
   225  	user, ok := getTaskTableUserFromEnv()
   226  	if !ok {
   227  		user = randomUser()
   228  	}
   229  
   230  	// TODO: rename TaskTableUser to moadmin
   231  	if err := l.setTaskTableUser(user); err != nil {
   232  		l.runtime.Logger().Error("failed to set task table user", zap.Error(err))
   233  	}
   234  }
   235  
   236  func (l *store) bootstrap(term uint64, state *pb.CheckerState) {
   237  	cmds, err := l.getScheduleCommand(false, term, state)
   238  	if err != nil {
   239  		l.runtime.Logger().Error("failed to get bootstrap schedule commands", zap.Error(err))
   240  		return
   241  	}
   242  	if len(cmds) > 0 {
   243  		for _, c := range cmds {
   244  			l.runtime.Logger().Debug("bootstrap cmd", zap.String("cmd", c.LogString()))
   245  		}
   246  		ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
   247  		defer cancel()
   248  		if err := l.addScheduleCommands(ctx, term, cmds); err != nil {
   249  			// TODO: check whether this is temp error
   250  			l.runtime.Logger().Debug("failed to add schedule commands", zap.Error(err))
   251  			return
   252  		}
   253  		l.bootstrapCheckCycles = checkBootstrapCycles
   254  		l.bootstrapMgr = bootstrap.NewBootstrapManager(state.ClusterInfo)
   255  		l.assertHAKeeperState(pb.HAKeeperBootstrapCommandsReceived)
   256  	}
   257  }
   258  
   259  func (l *store) checkBootstrap(state *pb.CheckerState) {
   260  	if l.bootstrapCheckCycles == 0 {
   261  		if err := l.setBootstrapState(false); err != nil {
   262  			panic(err)
   263  		}
   264  		l.assertHAKeeperState(pb.HAKeeperBootstrapFailed)
   265  	}
   266  
   267  	if l.bootstrapMgr == nil {
   268  		l.bootstrapMgr = bootstrap.NewBootstrapManager(state.ClusterInfo)
   269  	}
   270  	if !l.bootstrapMgr.CheckBootstrap(state.LogState) {
   271  		l.bootstrapCheckCycles--
   272  	} else {
   273  		if err := l.setBootstrapState(true); err != nil {
   274  			panic(err)
   275  		}
   276  		l.assertHAKeeperState(pb.HAKeeperRunning)
   277  	}
   278  }
   279  
   280  func (l *store) setBootstrapState(success bool) error {
   281  	state := pb.HAKeeperRunning
   282  	if !success {
   283  		state = pb.HAKeeperBootstrapFailed
   284  	}
   285  	cmd := hakeeper.GetSetStateCmd(state)
   286  	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
   287  	defer cancel()
   288  	session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID)
   289  	_, err := l.propose(ctx, session, cmd)
   290  	return err
   291  }
   292  
   293  func (l *store) getCheckerState() (*pb.CheckerState, error) {
   294  	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
   295  	defer cancel()
   296  	s, err := l.read(ctx, hakeeper.DefaultHAKeeperShardID, &hakeeper.StateQuery{})
   297  	if err != nil {
   298  		return &pb.CheckerState{}, err
   299  	}
   300  	return s.(*pb.CheckerState), nil
   301  }
   302  
   303  func (l *store) getScheduleCommand(check bool,
   304  	term uint64, state *pb.CheckerState) ([]pb.ScheduleCommand, error) {
   305  	if l.alloc.Capacity() < minIDAllocCapacity {
   306  		if err := l.updateIDAlloc(defaultIDBatchSize); err != nil {
   307  			return nil, err
   308  		}
   309  	}
   310  
   311  	if check {
   312  		return l.checker.Check(l.alloc, *state), nil
   313  	}
   314  	m := bootstrap.NewBootstrapManager(state.ClusterInfo)
   315  	return m.Bootstrap(l.alloc, state.TNState, state.LogState)
   316  }
   317  
   318  func (l *store) setTaskTableUser(user pb.TaskTableUser) error {
   319  	cmd := hakeeper.GetTaskTableUserCmd(user)
   320  	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
   321  	defer cancel()
   322  	session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID)
   323  	result, err := l.propose(ctx, session, cmd)
   324  	if err != nil {
   325  		l.runtime.Logger().Error("failed to propose task user info", zap.Error(err))
   326  		return err
   327  	}
   328  	if result.Value == uint64(pb.TaskSchedulerStopped) {
   329  		panic("failed to set task user")
   330  	}
   331  	if result.Value != uint64(pb.TaskSchedulerCreated) {
   332  		l.runtime.Logger().Error("task user info already set")
   333  	}
   334  	return nil
   335  }
   336  
   337  const (
   338  	moAdminUser     = "mo_admin_user"
   339  	moAdminPassword = "mo_admin_password"
   340  )
   341  
   342  func getTaskTableUserFromEnv() (pb.TaskTableUser, bool) {
   343  	username, ok := os.LookupEnv(moAdminUser)
   344  	if !ok {
   345  		return pb.TaskTableUser{}, false
   346  	}
   347  	password, ok := os.LookupEnv(moAdminPassword)
   348  	if !ok {
   349  		return pb.TaskTableUser{}, false
   350  	}
   351  	if username == "" || password == "" {
   352  		return pb.TaskTableUser{}, false
   353  	}
   354  	return pb.TaskTableUser{Username: username, Password: password}, true
   355  }
   356  
   357  func randomUser() pb.TaskTableUser {
   358  	return pb.TaskTableUser{
   359  		Username: uuid.NewString(),
   360  		Password: uuid.NewString(),
   361  	}
   362  }