github.com/matrixorigin/matrixone@v1.2.0/pkg/bootstrap/service_upgrade.go

     1  // Copyright 2023 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package bootstrap
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"time"
    21  
    22  	"github.com/matrixorigin/matrixone/pkg/bootstrap/versions"
    23  	"github.com/matrixorigin/matrixone/pkg/catalog"
    24  	"github.com/matrixorigin/matrixone/pkg/util/executor"
    25  	"go.uber.org/zap"
    26  )
    27  
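        // Defaults used by adjustUpgrade when the corresponding upgrade option is not set.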
    28  var (
    29  	defaultUpgradeTenantBatch         = 16
    30  	defaultCheckUpgradeDuration       = time.Second * 5
    31  	defaultCheckUpgradeTenantDuration = time.Second * 10
    32  	defaultUpgradeTenantTasks         = 4
    33  )
    34  
    35  func (s *service) BootstrapUpgrade(ctx context.Context) error {
    36  	getUpgradeLogger().Info("start bootstrap upgrade")
    37  	s.adjustUpgrade()
    38  	// MO's upgrade framework is automated: it requires no manual execution of any
    39  	// upgrade commands and supports cross-version upgrades. All upgrade processes
    40  	// are executed on the CN node. Rolling back an upgrade is currently not supported.
    41  	//
    42  	// When a CN node with a new version starts, it first reads the version the
    43  	// cluster is currently running and determines the upgrade route from that
    44  	// version to the CN's new version. When upgrading across versions, this route
    45  	// passes through the upgrades of every intermediate version and finally
    46  	// reaches the current version of the CN.
    47  	//
    48  	// Each single-version upgrade consists of 2 parts: one upgrades the cluster
    49  	// metadata and the other upgrades the tenant metadata.
    50  	//
    51  	// Upgrading cluster metadata is usually fast; it typically creates some new
    52  	// metadata tables or updates the structure of existing metadata tables, and
    53  	// this process is performed on a single CN.
    54  	//
    55  	// Upgrading tenant metadata takes time proportional to the number of tenants,
    56  	// and since MO is a multi-tenant database the number of tenants can be huge.
    57  	// So the tenant upgrade as a whole is asynchronous: all tenants are split into
    58  	// groups that are upgraded concurrently on multiple CNs at the same
    59  	// time.
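        	//
        	// For example (illustrative only): a cluster running v1.0.0 that is joined by a
        	// v1.2.0 CN could follow the route v1.0.0 -> v1.1.0 -> v1.2.0, with one
        	// VersionUpgrade record created for each hop of the route.
        	//
        	// The bootstrap below does three things: record the upgrade route, run the
        	// cluster upgrade task asynchronously, and start the tenant upgrade workers.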
    60  	if err := retryRun(ctx, "doCheckUpgrade", s.doCheckUpgrade); err != nil {
    61  		getUpgradeLogger().Error("check upgrade failed", zap.Error(err))
    62  		return err
    63  	}
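        	// Run the cluster upgrade steps asynchronously in the background.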
    64  	if err := s.stopper.RunTask(s.asyncUpgradeTask); err != nil {
    65  		return err
    66  	}
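        	// Start several workers that upgrade tenant metadata in batches.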
    67  	for i := 0; i < s.upgrade.upgradeTenantTasks; i++ {
    68  		if err := s.stopper.RunTask(s.asyncUpgradeTenantTask); err != nil {
    69  			return err
    70  		}
    71  	}
    72  	return nil
    73  }
    74  
    75  // doCheckUpgrade gets the version the cluster is currently running and determines the
    76  // upgrade route from that version to the new version of this CN.
    77  //
    78  // Note that this logic may execute concurrently if more than one CN starts at the same
    79  // time, but that is fine: we use select for update so that only one CN can create the
    80  // upgrade steps.
    81  func (s *service) doCheckUpgrade(ctx context.Context) error {
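        	// Run the check in a transaction that reads no earlier than the current timestamp
        	// and waits for committed logs to be applied, so the latest cluster metadata is visible.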
    82  	opts := executor.Options{}.
    83  		WithDatabase(catalog.MO_CATALOG).
    84  		WithMinCommittedTS(s.now()).
    85  		WithWaitCommittedLogApplied().
    86  		WithTimeZone(time.Local)
    87  	return s.exec.ExecTxn(
    88  		ctx,
    89  		func(txn executor.TxnExecutor) error {
    90  			final := s.getFinalVersionHandle().Metadata()
    91  
    92  			// If MO is deployed for the first time, or comes from a version without the
    93  			// framework (before 1.2.0), init the framework first and upgrade to the current version.
    94  			created, err := versions.IsFrameworkTablesCreated(txn)
    95  			if err != nil {
    96  				getUpgradeLogger().Error("failed to check upgrade framework",
    97  					zap.Error(err))
    98  				return err
    99  			}
   100  
    101  			// The first version acts as a genesis version and always needs to be prepared,
    102  			// because the first version needs to init the upgrade framework tables.
   103  			if !created {
   104  				getUpgradeLogger().Info("init upgrade framework",
   105  					zap.String("final-version", final.Version))
   106  
    107  				// Create the upgrade framework tables for the first time,
    108  				// which means v1.2.0 is being used for the first time.
   109  				err = s.getFinalVersionHandle().HandleCreateFrameworkDeps(txn)
   110  				if err != nil {
   111  					getLogger().Error("execute pre dependencies error when creating a new upgrade framework", zap.Error(err))
   112  					return err
   113  				}
   114  
    115  				// Multiple CNs may try to create the framework tables in parallel, but only one can succeed.
    116  				// The others just return an error, and the upgrade framework will retry.
   117  				err = createFrameworkTables(txn, final)
   118  				if err != nil {
   119  					getLogger().Error("create upgrade framework tables error", zap.Error(err))
   120  					return err
   121  				}
   122  				getLogger().Info("create upgrade framework tables success")
   123  			}
   124  
   125  			// lock version table
   126  			if err := txn.LockTable(catalog.MOVersionTable); err != nil {
   127  				getUpgradeLogger().Error("failed to lock table",
   128  					zap.String("table", catalog.MOVersionTable),
   129  					zap.Error(err))
   130  				return err
   131  			}
   132  
   133  			v, err := versions.GetLatestVersion(txn)
   134  			if err != nil {
   135  				getUpgradeLogger().Error("failed to get latest version",
   136  					zap.Error(err))
   137  				return err
   138  			}
   139  
   140  			getUpgradeLogger().Info("get current mo cluster latest version",
   141  				zap.String("latest", v.Version),
   142  				zap.String("final", final.Version))
   143  
    144  			// The cluster is upgrading to version v1; only a CN running v1 can start up.
   145  			if !v.IsReady() && v.Version != final.Version {
   146  				panic(fmt.Sprintf("cannot upgrade to version %s, because version %s is in upgrading",
   147  					final.Version,
   148  					v.Version))
   149  			}
    150  			// The cluster is running at version v1; a CN running an older version cannot start up and join the cluster.
   151  			if v.IsReady() && versions.Compare(final.Version, v.Version) < 0 {
   152  				panic(fmt.Sprintf("cannot startup a old version %s to join cluster, current version is %s",
   153  					final.Version,
   154  					v.Version))
   155  			}
   156  
    157  			// Checking the upgrade has 2 steps:
    158  			// 1: already checked, the final version exists
    159  			// 2: add upgrades from the latest version to the final version
   160  			checker := func() (bool, error) {
   161  				if v.Version == final.Version && v.VersionOffset >= final.VersionOffset {
   162  					return true, nil
   163  				}
   164  
   165  				state, ok, err := versions.GetVersionState(final.Version, final.VersionOffset, txn, false)
   166  				if err == nil && ok && state == versions.StateReady {
   167  					s.upgrade.finalVersionCompleted.Store(true)
   168  				}
   169  				if err != nil {
   170  					getUpgradeLogger().Error("failed to get final version state",
   171  						zap.String("final", final.Version),
   172  						zap.Error(err))
   173  				}
   174  				return ok, err
   175  			}
   176  
   177  			addUpgradesToFinalVersion := func() error {
   178  				if err := versions.AddVersion(final.Version, final.VersionOffset, versions.StateCreated, txn); err != nil {
   179  					getUpgradeLogger().Error("failed to add final version",
   180  						zap.String("final", final.Version),
   181  						zap.Error(err))
   182  					return err
   183  				}
   184  
    185  			getUpgradeLogger().Info("final version added",
   186  					zap.String("final", final.Version))
   187  
   188  				latest, err := versions.MustGetLatestReadyVersion(txn)
   189  				if err != nil {
   190  					getUpgradeLogger().Error("failed to get latest ready version",
   191  						zap.String("latest", latest),
   192  						zap.Error(err))
   193  					return err
   194  				}
   195  
   196  				getUpgradeLogger().Info("current latest ready version loaded",
   197  					zap.String("latest", latest),
   198  					zap.String("final", final.Version),
   199  					zap.Int32("versionOffset", int32(final.VersionOffset)))
   200  
   201  				var upgrades []versions.VersionUpgrade
   202  				from := latest
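        				// append records one hop of the upgrade route, from the previous version to v.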
   203  				append := func(v versions.Version) {
   204  					order := int32(len(upgrades))
   205  					u := versions.VersionUpgrade{
   206  						FromVersion:        from,
   207  						ToVersion:          v.Version,
   208  						FinalVersion:       final.Version,
   209  						FinalVersionOffset: final.VersionOffset,
   210  						State:              versions.StateCreated,
   211  						UpgradeOrder:       order,
   212  						UpgradeCluster:     v.UpgradeCluster,
   213  						UpgradeTenant:      v.UpgradeTenant,
   214  					}
   215  					upgrades = append(upgrades, u)
   216  
   217  					getUpgradeLogger().Info("version upgrade added",
   218  						zap.String("upgrade", u.String()),
   219  						zap.String("final", final.Version))
   220  				}
   221  
   222  				// can upgrade to final version directly.
   223  				if final.CanDirectUpgrade(latest) {
   224  					append(final)
   225  				} else {
   226  					for _, v := range s.handles {
   227  						if versions.Compare(v.Metadata().Version, from) > 0 &&
   228  							v.Metadata().CanDirectUpgrade(from) {
   229  							append(v.Metadata())
   230  							from = v.Metadata().Version
   231  						}
   232  					}
   233  				}
   234  				return versions.AddVersionUpgrades(upgrades, txn)
   235  			}
   236  
   237  			// step 1
   238  			if versionAdded, err := checker(); err != nil || versionAdded {
   239  				return err
   240  			}
   241  
   242  			// step 2
   243  			return addUpgradesToFinalVersion()
   244  		},
   245  		opts)
   246  }
   247  
   248  // asyncUpgradeTask is a task that executes the upgrade logic step by step
   249  // according to the created upgrade steps
   250  func (s *service) asyncUpgradeTask(ctx context.Context) {
   251  	fn := func() (bool, error) {
   252  		ctx, cancel := context.WithTimeout(ctx, time.Hour*24)
   253  		defer cancel()
   254  
   255  		var err error
   256  		var completed bool
   257  		opts := executor.Options{}.
   258  			WithDatabase(catalog.MO_CATALOG).
   259  			WithMinCommittedTS(s.now()).
   260  			WithWaitCommittedLogApplied().
   261  			WithTimeZone(time.Local)
   262  		err = s.exec.ExecTxn(
   263  			ctx,
   264  			func(txn executor.TxnExecutor) error {
   265  				completed, err = s.performUpgrade(ctx, txn)
   266  				return err
   267  			},
   268  			opts)
   269  		return completed, err
   270  	}
   271  
   272  	timer := time.NewTimer(s.upgrade.checkUpgradeDuration)
   273  	defer timer.Stop()
   274  
   275  	defer func() {
   276  		getUpgradeLogger().Info("upgrade task exit",
   277  			zap.String("final", s.getFinalVersionHandle().Metadata().Version))
   278  	}()
   279  
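        	// Poll until the final version becomes Ready or the context is cancelled.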
   280  	for {
   281  		select {
   282  		case <-ctx.Done():
   283  			return
   284  		case <-timer.C:
   285  			if s.upgrade.finalVersionCompleted.Load() {
   286  				return
   287  			}
   288  
   289  			completed, err := fn()
   290  			if err == nil && completed {
   291  				s.upgrade.finalVersionCompleted.Store(true)
   292  				return
   293  			}
   294  			timer.Reset(s.upgrade.checkUpgradeDuration)
   295  		}
   296  	}
   297  }
   298  
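        // performUpgrade runs the pending upgrade steps toward the final version within the given
        // transaction and reports whether the final version has reached the Ready state.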
   299  func (s *service) performUpgrade(
   300  	ctx context.Context,
   301  	txn executor.TxnExecutor) (bool, error) {
   302  	final := s.getFinalVersionHandle().Metadata()
   303  
    304  	// make sure only one CN can execute the upgrade logic
   305  	state, ok, err := versions.GetVersionState(final.Version, final.VersionOffset, txn, true)
   306  	if err != nil {
   307  		getUpgradeLogger().Error("failed to load final version state",
   308  			zap.String("final", final.Version),
   309  			zap.Int32("versionOffset", int32(final.VersionOffset)),
   310  			zap.Error(err))
   311  		return false, err
   312  	}
   313  	if !ok {
   314  		getUpgradeLogger().Info("final version not found, retry later",
   315  			zap.String("final", final.Version),
   316  			zap.Int32("versionOffset", int32(final.VersionOffset)))
   317  		return false, nil
   318  	}
   319  
   320  	getUpgradeLogger().Info("final version state loaded",
   321  		zap.String("final", final.Version),
   322  		zap.Int32("versionOffset", int32(final.VersionOffset)),
   323  		zap.Int32("state", state))
   324  
   325  	if state == versions.StateReady {
   326  		return true, nil
   327  	}
   328  
   329  	// get upgrade steps, and perform upgrade one by one
   330  	upgrades, err := versions.GetUpgradeVersions(final.Version, final.VersionOffset, txn, true, true)
   331  	if err != nil {
   332  		getUpgradeLogger().Error("failed to load upgrades",
   333  			zap.String("final", final.Version),
   334  			zap.Error(err))
   335  		return false, err
   336  	}
   337  
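        	// Walk the upgrade route in order; stop early if a step is still waiting for its
        	// tenant upgrades to finish.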
   338  	for _, u := range upgrades {
   339  		getUpgradeLogger().Info("handle version upgrade",
   340  			zap.String("upgrade", u.String()))
   341  
   342  		state, err := s.doUpgrade(ctx, u, txn)
   343  		if err != nil {
   344  			getUpgradeLogger().Error("failed to handle version upgrade",
   345  				zap.String("upgrade", u.String()),
   346  				zap.String("final", final.Version),
   347  				zap.Error(err))
   348  			return false, err
   349  		}
   350  
   351  		switch state {
   352  		case versions.StateReady:
   353  			// upgrade was completed
   354  			getUpgradeLogger().Info("upgrade version completed",
   355  				zap.String("upgrade", u.String()),
   356  				zap.String("final", final.Version))
   357  		case versions.StateUpgradingTenant:
    358  			// we must wait until all tenant upgrades are completed, and then upgrade to
    359  			// the next version
   360  			getUpgradeLogger().Info("upgrade version in tenant upgrading",
   361  				zap.String("upgrade", u.String()),
   362  				zap.String("final", final.Version))
   363  			return false, nil
   364  		default:
   365  			panic(fmt.Sprintf("BUG: invalid state %d", state))
   366  		}
   367  	}
   368  
   369  	// all upgrades completed, update final version to ready state.
   370  	if err := versions.UpdateVersionState(final.Version, final.VersionOffset, versions.StateReady, txn); err != nil {
   371  		getUpgradeLogger().Error("failed to update state",
   372  			zap.String("final", final.Version),
   373  			zap.Error(err))
   374  
   375  		return false, err
   376  	}
   377  
   378  	getUpgradeLogger().Info("upgrade to final version completed",
   379  		zap.String("final", final.Version))
   380  	return true, nil
   381  }
   382  
    383  // doUpgrade corresponds to one upgrade step in a version upgrade.
   384  func (s *service) doUpgrade(
   385  	ctx context.Context,
   386  	upgrade versions.VersionUpgrade,
   387  	txn executor.TxnExecutor) (int32, error) {
   388  	if upgrade.State == versions.StateReady {
   389  		return upgrade.State, nil
   390  	}
   391  
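        	// This step needs no cluster or tenant upgrade, or all of its tenants have already
        	// been upgraded: mark the step Ready.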
   392  	if (upgrade.UpgradeCluster == versions.No && upgrade.UpgradeTenant == versions.No) ||
   393  		(upgrade.State == versions.StateUpgradingTenant && upgrade.TotalTenant == upgrade.ReadyTenant) {
   394  		if err := versions.UpdateVersionUpgradeState(upgrade, versions.StateReady, txn); err != nil {
   395  			return 0, err
   396  		}
   397  		return versions.StateReady, nil
   398  	}
   399  
   400  	if upgrade.State == versions.StateUpgradingTenant {
   401  		return upgrade.State, nil
   402  	}
   403  
   404  	state := versions.StateReady
   405  	h := s.getVersionHandle(upgrade.ToVersion)
   406  
   407  	getUpgradeLogger().Info("execute upgrade prepare",
   408  		zap.String("upgrade", upgrade.String()))
   409  	if err := h.Prepare(ctx, txn, h.Metadata().Version == s.getFinalVersionHandle().Metadata().Version); err != nil {
   410  		return 0, err
   411  	}
   412  	getUpgradeLogger().Info("execute upgrade prepare completed",
   413  		zap.String("upgrade", upgrade.String()))
   414  
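        	// Upgrade the cluster metadata within the current transaction.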
   415  	if upgrade.UpgradeCluster == versions.Yes {
   416  		getUpgradeLogger().Info("execute upgrade cluster",
   417  			zap.String("upgrade", upgrade.String()))
   418  		if err := h.HandleClusterUpgrade(ctx, txn); err != nil {
   419  			return 0, err
   420  		}
   421  		getUpgradeLogger().Info("execute upgrade cluster completed",
   422  			zap.String("upgrade", upgrade.String()))
   423  	}
   424  
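        	// Tenant upgrades are only recorded as tasks here; they are executed asynchronously
        	// by the tenant upgrade workers (asyncUpgradeTenantTask).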
   425  	if upgrade.UpgradeTenant == versions.Yes {
   426  		state = versions.StateUpgradingTenant
   427  		err := fetchTenants(
   428  			s.upgrade.upgradeTenantBatch,
   429  			func(ids []int32) error {
   430  				upgrade.TotalTenant += int32(len(ids))
   431  				getUpgradeLogger().Info("add tenants to upgrade",
   432  					zap.String("upgrade", upgrade.String()),
   433  					zap.Int32("from", ids[0]),
   434  					zap.Int32("to", ids[len(ids)-1]))
   435  				return versions.AddUpgradeTenantTask(upgrade.ID, upgrade.ToVersion, ids[0], ids[len(ids)-1], txn)
   436  			},
   437  			txn)
   438  		if err != nil {
   439  			return 0, err
   440  		}
   441  		if err := versions.UpdateVersionUpgradeTasks(upgrade, txn); err != nil {
   442  			return 0, err
   443  		}
   444  		getUpgradeLogger().Info("upgrade tenants task updated",
   445  			zap.String("upgrade", upgrade.String()))
   446  		if upgrade.TotalTenant == upgrade.ReadyTenant {
   447  			state = versions.StateReady
   448  		}
   449  	}
   450  
   451  	getUpgradeLogger().Info("upgrade update state",
   452  		zap.String("upgrade", upgrade.String()),
   453  		zap.Int32("state", state))
   454  	return state, versions.UpdateVersionUpgradeState(upgrade, state, txn)
   455  }
   456  
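        // retryRun executes fn until it succeeds, backing off exponentially up to a maximum
        // wait, and returns early if the context is cancelled.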
   457  func retryRun(
   458  	ctx context.Context,
   459  	name string,
   460  	fn func(ctx context.Context) error) error {
   461  	wait := time.Second
   462  	maxWait := time.Second * 10
   463  	for {
   464  		err := fn(ctx)
   465  		if err == nil {
   466  			return nil
   467  		}
   468  		getUpgradeLogger().Error("execute task failed, retry later",
   469  			zap.String("task", name),
   470  			zap.Duration("wait", wait),
   471  			zap.Error(err))
   472  		time.Sleep(wait)
   473  		wait *= 2
   474  		if wait > maxWait {
   475  			wait = maxWait
   476  		}
   477  		select {
   478  		case <-ctx.Done():
   479  			return ctx.Err()
   480  		default:
   481  		}
   482  	}
   483  }
   484  
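        // adjustUpgrade fills in defaults for any unset upgrade options and logs the
        // effective configuration.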
   485  func (s *service) adjustUpgrade() {
   486  	if s.upgrade.upgradeTenantBatch == 0 {
   487  		s.upgrade.upgradeTenantBatch = defaultUpgradeTenantBatch
   488  	}
   489  	if s.upgrade.checkUpgradeDuration == 0 {
   490  		s.upgrade.checkUpgradeDuration = defaultCheckUpgradeDuration
   491  	}
   492  	if s.upgrade.checkUpgradeTenantDuration == 0 {
   493  		s.upgrade.checkUpgradeTenantDuration = defaultCheckUpgradeTenantDuration
   494  	}
   495  	if s.upgrade.upgradeTenantTasks == 0 {
   496  		s.upgrade.upgradeTenantTasks = defaultUpgradeTenantTasks
   497  	}
   498  	getUpgradeLogger().Info("upgrade config",
   499  		zap.Duration("check-upgrade-duration", s.upgrade.checkUpgradeDuration),
   500  		zap.Duration("check-upgrade-tenant-duration", s.upgrade.checkUpgradeTenantDuration),
   501  		zap.Int("upgrade-tenant-tasks", s.upgrade.upgradeTenantTasks),
   502  		zap.Int("tenant-batch", s.upgrade.upgradeTenantBatch))
   503  }
   504  
    505  // createFrameworkTables creates the tables that the upgrade framework depends on
    506  // when the upgrade framework is initialized for the first time.
   507  func createFrameworkTables(
   508  	txn executor.TxnExecutor,
   509  	final versions.Version) error {
   510  	values := versions.FrameworkInitSQLs
   511  	values = append(values, final.GetInitVersionSQL(versions.StateReady))
   512  
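        	// Execute every framework init statement inside the current transaction.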
   513  	for _, sql := range values {
   514  		r, err := txn.Exec(sql, executor.StatementOption{})
   515  		if err != nil {
   516  			return err
   517  		}
   518  		r.Close()
   519  	}
   520  	return nil
   521  }