github.com/matrixorigin/matrixone@v0.7.0/pkg/logservice/store_hakeeper_check.go (about)

     1  // Copyright 2021 - 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package logservice
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"os"
    21  	"time"
    22  
    23  	"go.uber.org/zap"
    24  
    25  	"github.com/google/uuid"
    26  	"github.com/matrixorigin/matrixone/pkg/hakeeper"
    27  	"github.com/matrixorigin/matrixone/pkg/hakeeper/bootstrap"
    28  	pb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
    29  )
    30  
    31  const (
    32  	minIDAllocCapacity uint64 = 1024
    33  	defaultIDBatchSize uint64 = 1024 * 10
    34  
    35  	hakeeperDefaultTimeout = 2 * time.Second
    36  	checkBootstrapCycles   = 100
    37  )
    38  
    39  type idAllocator struct {
    40  	// [nextID, lastID] is the range of IDs that can be assigned.
    41  	// the next ID to be assigned is nextID
    42  	nextID uint64
    43  	lastID uint64
    44  }
    45  
    46  var _ hakeeper.IDAllocator = (*idAllocator)(nil)
    47  
    48  func newIDAllocator() hakeeper.IDAllocator {
    49  	return &idAllocator{nextID: 1, lastID: 0}
    50  }
    51  
    52  func (a *idAllocator) Next() (uint64, bool) {
    53  	if a.nextID <= a.lastID {
    54  		v := a.nextID
    55  		a.nextID++
    56  		return v, true
    57  	}
    58  	return 0, false
    59  }
    60  
    61  func (a *idAllocator) Set(next uint64, last uint64) {
    62  	// make sure that this id allocator never emit any id smaller than
    63  	// K8SIDRangeEnd
    64  	if next < hakeeper.K8SIDRangeEnd {
    65  		panic("invalid id allocator range")
    66  	}
    67  	a.nextID = next
    68  	a.lastID = last
    69  }
    70  
    71  func (a *idAllocator) Capacity() uint64 {
    72  	if a.nextID <= a.lastID {
    73  		return (a.lastID - a.nextID) + 1
    74  	}
    75  	return 0
    76  }
    77  
    78  func (l *store) setInitialClusterInfo(numOfLogShards uint64,
    79  	numOfDNShards uint64, numOfLogReplicas uint64) error {
    80  	cmd := hakeeper.GetInitialClusterRequestCmd(numOfLogShards,
    81  		numOfDNShards, numOfLogReplicas)
    82  	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
    83  	defer cancel()
    84  	session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID)
    85  	result, err := l.propose(ctx, session, cmd)
    86  	if err != nil {
    87  		l.runtime.Logger().Error("failed to propose initial cluster info", zap.Error(err))
    88  		return err
    89  	}
    90  	if result.Value == uint64(pb.HAKeeperBootstrapFailed) {
    91  		panic("bootstrap failed")
    92  	}
    93  	if result.Value != uint64(pb.HAKeeperCreated) {
    94  		l.runtime.Logger().Error("initial cluster info already set")
    95  	}
    96  	return nil
    97  }
    98  
    99  func (l *store) updateIDAlloc(count uint64) error {
   100  	cmd := hakeeper.GetGetIDCmd(count)
   101  	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
   102  	defer cancel()
   103  	session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID)
   104  	result, err := l.propose(ctx, session, cmd)
   105  	if err != nil {
   106  		l.runtime.Logger().Error("propose get id failed", zap.Error(err))
   107  		return err
   108  	}
   109  	// TODO: add a test for this
   110  	l.alloc.Set(result.Value, result.Value+count-1)
   111  	return nil
   112  }
   113  
   114  func (l *store) hakeeperCheck() {
   115  	isLeader, term, err := l.isLeaderHAKeeper()
   116  	if err != nil {
   117  		l.runtime.Logger().Error("failed to get HAKeeper Leader ID", zap.Error(err))
   118  		return
   119  	}
   120  
   121  	if !isLeader {
   122  		l.taskScheduler.StopScheduleCronTask()
   123  		return
   124  	}
   125  	state, err := l.getCheckerState()
   126  	if err != nil {
   127  		// TODO: check whether this is temp error
   128  		l.runtime.Logger().Error("failed to get checker state", zap.Error(err))
   129  		return
   130  	}
   131  	switch state.State {
   132  	case pb.HAKeeperCreated:
   133  		l.runtime.Logger().Warn("waiting for initial cluster info to be set, check skipped")
   134  		return
   135  	case pb.HAKeeperBootstrapping:
   136  		l.bootstrap(term, state)
   137  	case pb.HAKeeperBootstrapCommandsReceived:
   138  		l.checkBootstrap(state)
   139  	case pb.HAKeeperBootstrapFailed:
   140  		l.handleBootstrapFailure()
   141  	case pb.HAKeeperRunning:
   142  		l.healthCheck(term, state)
   143  		l.taskSchedule(state)
   144  	default:
   145  		panic("unknown HAKeeper state")
   146  	}
   147  }
   148  
   149  func (l *store) assertHAKeeperState(s pb.HAKeeperState) {
   150  	state, err := l.getCheckerState()
   151  	if err != nil {
   152  		// TODO: check whether this is temp error
   153  		l.runtime.Logger().Error("failed to get checker state", zap.Error(err))
   154  		return
   155  	}
   156  	if state.State != s {
   157  		l.runtime.Logger().Panic("unexpected state",
   158  			zap.String("expected", s.String()),
   159  			zap.String("got", state.State.String()))
   160  	}
   161  }
   162  
   163  func (l *store) handleBootstrapFailure() {
   164  	panic("failed to bootstrap the cluster")
   165  }
   166  
   167  func (l *store) healthCheck(term uint64, state *pb.CheckerState) {
   168  	l.assertHAKeeperState(pb.HAKeeperRunning)
   169  	defer l.assertHAKeeperState(pb.HAKeeperRunning)
   170  	cmds, err := l.getScheduleCommand(true, term, state)
   171  	if err != nil {
   172  		l.runtime.Logger().Error("failed to get check schedule commands", zap.Error(err))
   173  		return
   174  	}
   175  	l.runtime.Logger().Debug(fmt.Sprintf("cluster health check generated %d schedule commands", len(cmds)))
   176  	if len(cmds) > 0 {
   177  		ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
   178  		defer cancel()
   179  		for _, cmd := range cmds {
   180  			l.runtime.Logger().Debug("adding schedule command to hakeeper", zap.String("command", cmd.LogString()))
   181  		}
   182  		if err := l.addScheduleCommands(ctx, term, cmds); err != nil {
   183  			// TODO: check whether this is temp error
   184  			l.runtime.Logger().Debug("failed to add schedule commands", zap.Error(err))
   185  			return
   186  		}
   187  	}
   188  }
   189  
   190  func (l *store) taskSchedule(state *pb.CheckerState) {
   191  	l.assertHAKeeperState(pb.HAKeeperRunning)
   192  	defer l.assertHAKeeperState(pb.HAKeeperRunning)
   193  
   194  	switch state.TaskSchedulerState {
   195  	case pb.TaskSchedulerCreated:
   196  		l.registerTaskUser()
   197  	case pb.TaskSchedulerRunning:
   198  		l.taskScheduler.StartScheduleCronTask()
   199  		l.taskScheduler.Schedule(state.CNState, state.Tick)
   200  	case pb.TaskSchedulerStopped:
   201  	default:
   202  		panic("unknown TaskScheduler state")
   203  	}
   204  }
   205  
   206  func (l *store) registerTaskUser() {
   207  	user, ok := getTaskTableUserFromEnv()
   208  	if !ok {
   209  		user = randomUser()
   210  	}
   211  
   212  	// TODO: rename TaskTableUser to moadmin
   213  	if err := l.setTaskTableUser(user); err != nil {
   214  		l.runtime.Logger().Error("failed to set task table user", zap.Error(err))
   215  	}
   216  }
   217  
   218  func (l *store) bootstrap(term uint64, state *pb.CheckerState) {
   219  	cmds, err := l.getScheduleCommand(false, term, state)
   220  	if err != nil {
   221  		l.runtime.Logger().Error("failed to get bootstrap schedule commands", zap.Error(err))
   222  		return
   223  	}
   224  	if len(cmds) > 0 {
   225  		for _, c := range cmds {
   226  			l.runtime.Logger().Debug("bootstrap cmd", zap.String("cmd", c.LogString()))
   227  		}
   228  		ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
   229  		defer cancel()
   230  		if err := l.addScheduleCommands(ctx, term, cmds); err != nil {
   231  			// TODO: check whether this is temp error
   232  			l.runtime.Logger().Debug("failed to add schedule commands", zap.Error(err))
   233  			return
   234  		}
   235  		l.bootstrapCheckCycles = checkBootstrapCycles
   236  		l.bootstrapMgr = bootstrap.NewBootstrapManager(state.ClusterInfo)
   237  		l.assertHAKeeperState(pb.HAKeeperBootstrapCommandsReceived)
   238  	}
   239  }
   240  
   241  func (l *store) checkBootstrap(state *pb.CheckerState) {
   242  	if l.bootstrapCheckCycles == 0 {
   243  		if err := l.setBootstrapState(false); err != nil {
   244  			panic(err)
   245  		}
   246  		l.assertHAKeeperState(pb.HAKeeperBootstrapFailed)
   247  	}
   248  
   249  	if l.bootstrapMgr == nil {
   250  		l.bootstrapMgr = bootstrap.NewBootstrapManager(state.ClusterInfo)
   251  	}
   252  	if !l.bootstrapMgr.CheckBootstrap(state.LogState) {
   253  		l.bootstrapCheckCycles--
   254  	} else {
   255  		if err := l.setBootstrapState(true); err != nil {
   256  			panic(err)
   257  		}
   258  		l.assertHAKeeperState(pb.HAKeeperRunning)
   259  	}
   260  }
   261  
   262  func (l *store) setBootstrapState(success bool) error {
   263  	state := pb.HAKeeperRunning
   264  	if !success {
   265  		state = pb.HAKeeperBootstrapFailed
   266  	}
   267  	cmd := hakeeper.GetSetStateCmd(state)
   268  	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
   269  	defer cancel()
   270  	session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID)
   271  	_, err := l.propose(ctx, session, cmd)
   272  	return err
   273  }
   274  
   275  func (l *store) getCheckerState() (*pb.CheckerState, error) {
   276  	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
   277  	defer cancel()
   278  	s, err := l.read(ctx, hakeeper.DefaultHAKeeperShardID, &hakeeper.StateQuery{})
   279  	if err != nil {
   280  		return &pb.CheckerState{}, err
   281  	}
   282  	return s.(*pb.CheckerState), nil
   283  }
   284  
   285  func (l *store) getScheduleCommand(check bool,
   286  	term uint64, state *pb.CheckerState) ([]pb.ScheduleCommand, error) {
   287  	if l.alloc.Capacity() < minIDAllocCapacity {
   288  		if err := l.updateIDAlloc(defaultIDBatchSize); err != nil {
   289  			return nil, err
   290  		}
   291  	}
   292  
   293  	if check {
   294  		return l.checker.Check(l.alloc, *state), nil
   295  	}
   296  	m := bootstrap.NewBootstrapManager(state.ClusterInfo)
   297  	return m.Bootstrap(l.alloc, state.DNState, state.LogState)
   298  }
   299  
   300  func (l *store) setTaskTableUser(user pb.TaskTableUser) error {
   301  	cmd := hakeeper.GetTaskTableUserCmd(user)
   302  	ctx, cancel := context.WithTimeout(context.Background(), hakeeperDefaultTimeout)
   303  	defer cancel()
   304  	session := l.nh.GetNoOPSession(hakeeper.DefaultHAKeeperShardID)
   305  	result, err := l.propose(ctx, session, cmd)
   306  	if err != nil {
   307  		l.runtime.Logger().Error("failed to propose task user info", zap.Error(err))
   308  		return err
   309  	}
   310  	if result.Value == uint64(pb.TaskSchedulerStopped) {
   311  		panic("failed to set task user")
   312  	}
   313  	if result.Value != uint64(pb.TaskSchedulerCreated) {
   314  		l.runtime.Logger().Error("task user info already set")
   315  	}
   316  	return nil
   317  }
   318  
   319  const (
   320  	moAdminUser     = "mo_admin_user"
   321  	moAdminPassword = "mo_admin_password"
   322  )
   323  
   324  func getTaskTableUserFromEnv() (pb.TaskTableUser, bool) {
   325  	username, ok := os.LookupEnv(moAdminUser)
   326  	if !ok {
   327  		return pb.TaskTableUser{}, false
   328  	}
   329  	password, ok := os.LookupEnv(moAdminPassword)
   330  	if !ok {
   331  		return pb.TaskTableUser{}, false
   332  	}
   333  	if username == "" || password == "" {
   334  		return pb.TaskTableUser{}, false
   335  	}
   336  	return pb.TaskTableUser{Username: username, Password: password}, true
   337  }
   338  
   339  func randomUser() pb.TaskTableUser {
   340  	return pb.TaskTableUser{
   341  		Username: uuid.NewString(),
   342  		Password: uuid.NewString(),
   343  	}
   344  }