github.com/matrixorigin/matrixone@v1.2.0/pkg/bootstrap/service_upgrade.go (about) 1 // Copyright 2023 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package bootstrap 16 17 import ( 18 "context" 19 "fmt" 20 "time" 21 22 "github.com/matrixorigin/matrixone/pkg/bootstrap/versions" 23 "github.com/matrixorigin/matrixone/pkg/catalog" 24 "github.com/matrixorigin/matrixone/pkg/util/executor" 25 "go.uber.org/zap" 26 ) 27 28 var ( 29 defaultUpgradeTenantBatch = 16 30 defaultCheckUpgradeDuration = time.Second * 5 31 defaultCheckUpgradeTenantDuration = time.Second * 10 32 defaultUpgradeTenantTasks = 4 33 ) 34 35 func (s *service) BootstrapUpgrade(ctx context.Context) error { 36 getUpgradeLogger().Info("start bootstrap upgrade") 37 s.adjustUpgrade() 38 // MO's upgrade framework is automated, requiring no manual execution of any 39 // upgrade commands, and supports cross-version upgrades. All upgrade processes 40 // are executed at the CN node. Currently, rollback upgrade is not supported. 41 // 42 // When a new version of the CN node is started, it will first get the current 43 // version of the cluster running, and determine the upgrade route before this 44 // version and the current new version of the CN. When upgrading across versions, 45 // this upgrade route will go through multiple versions of upgrades, and finally 46 // upgrade to the current version of the CN. 47 // 48 // Each version upgrade, for the previous version, contains 2 parts, one is to 49 // upgrade the cluster metadata and the other is to upgrade the tenant metadata. 50 // 51 // For upgrading cluster metadata, it is usually very fast, usually it is creating 52 // some new metadata tables or updating the structure of some metadata tables, and 53 // this process is performed on one CN. 54 // 55 // For upgrading tenant metadata, the time consuming upgrade depends on the number 56 // of tenants, and since MO is a meta-native multi-tenant database, our default 57 // number of tenants is huge. So the whole tenant upgrade is asynchronous and will 58 // be grouped for all tenants and concurrently executed on multiple CNs at the same 59 // time. 60 if err := retryRun(ctx, "doCheckUpgrade", s.doCheckUpgrade); err != nil { 61 getUpgradeLogger().Error("check upgrade failed", zap.Error(err)) 62 return err 63 } 64 if err := s.stopper.RunTask(s.asyncUpgradeTask); err != nil { 65 return err 66 } 67 for i := 0; i < s.upgrade.upgradeTenantTasks; i++ { 68 if err := s.stopper.RunTask(s.asyncUpgradeTenantTask); err != nil { 69 return err 70 } 71 } 72 return nil 73 } 74 75 // doCheckUpgrade get the current version of the cluster running, and determine the upgrade 76 // route before this version and the current new version of the CN. 77 // 78 // Note that this logic will execute concurrently if more than one CN starts at the same 79 // time, but it doesn't matter, we use select for update to make it so that only one CN can 80 // create the upgrade step. 81 func (s *service) doCheckUpgrade(ctx context.Context) error { 82 opts := executor.Options{}. 83 WithDatabase(catalog.MO_CATALOG). 84 WithMinCommittedTS(s.now()). 85 WithWaitCommittedLogApplied(). 86 WithTimeZone(time.Local) 87 return s.exec.ExecTxn( 88 ctx, 89 func(txn executor.TxnExecutor) error { 90 final := s.getFinalVersionHandle().Metadata() 91 92 // Deploy mo first time without 1.2.0, init framework first. 93 // And upgrade to current version. 94 created, err := versions.IsFrameworkTablesCreated(txn) 95 if err != nil { 96 getUpgradeLogger().Error("failed to check upgrade framework", 97 zap.Error(err)) 98 return err 99 } 100 101 // First version as a genesis version, always need to be PREPARE. 102 // Because the first version need to init upgrade framework tables. 103 if !created { 104 getUpgradeLogger().Info("init upgrade framework", 105 zap.String("final-version", final.Version)) 106 107 // create new upgrade framework tables for the first time, 108 // which means using v1.2.0 for the first time 109 err = s.getFinalVersionHandle().HandleCreateFrameworkDeps(txn) 110 if err != nil { 111 getLogger().Error("execute pre dependencies error when creating a new upgrade framework", zap.Error(err)) 112 return err 113 } 114 115 // Many cn maybe create framework tables parallel, only one can create success. 116 // Just return error, and upgrade framework will retry. 117 err = createFrameworkTables(txn, final) 118 if err != nil { 119 getLogger().Error("create upgrade framework tables error", zap.Error(err)) 120 return err 121 } 122 getLogger().Info("create upgrade framework tables success") 123 } 124 125 // lock version table 126 if err := txn.LockTable(catalog.MOVersionTable); err != nil { 127 getUpgradeLogger().Error("failed to lock table", 128 zap.String("table", catalog.MOVersionTable), 129 zap.Error(err)) 130 return err 131 } 132 133 v, err := versions.GetLatestVersion(txn) 134 if err != nil { 135 getUpgradeLogger().Error("failed to get latest version", 136 zap.Error(err)) 137 return err 138 } 139 140 getUpgradeLogger().Info("get current mo cluster latest version", 141 zap.String("latest", v.Version), 142 zap.String("final", final.Version)) 143 144 // cluster is upgrading to v1, only v1's cn can start up. 145 if !v.IsReady() && v.Version != final.Version { 146 panic(fmt.Sprintf("cannot upgrade to version %s, because version %s is in upgrading", 147 final.Version, 148 v.Version)) 149 } 150 // cluster is running at v1, cannot startup a old version to join cluster. 151 if v.IsReady() && versions.Compare(final.Version, v.Version) < 0 { 152 panic(fmt.Sprintf("cannot startup a old version %s to join cluster, current version is %s", 153 final.Version, 154 v.Version)) 155 } 156 157 // check upgrade has 2 step: 158 // 1: already checked, version exists 159 // 2: add upgrades from latest version to final version 160 checker := func() (bool, error) { 161 if v.Version == final.Version && v.VersionOffset >= final.VersionOffset { 162 return true, nil 163 } 164 165 state, ok, err := versions.GetVersionState(final.Version, final.VersionOffset, txn, false) 166 if err == nil && ok && state == versions.StateReady { 167 s.upgrade.finalVersionCompleted.Store(true) 168 } 169 if err != nil { 170 getUpgradeLogger().Error("failed to get final version state", 171 zap.String("final", final.Version), 172 zap.Error(err)) 173 } 174 return ok, err 175 } 176 177 addUpgradesToFinalVersion := func() error { 178 if err := versions.AddVersion(final.Version, final.VersionOffset, versions.StateCreated, txn); err != nil { 179 getUpgradeLogger().Error("failed to add final version", 180 zap.String("final", final.Version), 181 zap.Error(err)) 182 return err 183 } 184 185 getUpgradeLogger().Error("final version added", 186 zap.String("final", final.Version)) 187 188 latest, err := versions.MustGetLatestReadyVersion(txn) 189 if err != nil { 190 getUpgradeLogger().Error("failed to get latest ready version", 191 zap.String("latest", latest), 192 zap.Error(err)) 193 return err 194 } 195 196 getUpgradeLogger().Info("current latest ready version loaded", 197 zap.String("latest", latest), 198 zap.String("final", final.Version), 199 zap.Int32("versionOffset", int32(final.VersionOffset))) 200 201 var upgrades []versions.VersionUpgrade 202 from := latest 203 append := func(v versions.Version) { 204 order := int32(len(upgrades)) 205 u := versions.VersionUpgrade{ 206 FromVersion: from, 207 ToVersion: v.Version, 208 FinalVersion: final.Version, 209 FinalVersionOffset: final.VersionOffset, 210 State: versions.StateCreated, 211 UpgradeOrder: order, 212 UpgradeCluster: v.UpgradeCluster, 213 UpgradeTenant: v.UpgradeTenant, 214 } 215 upgrades = append(upgrades, u) 216 217 getUpgradeLogger().Info("version upgrade added", 218 zap.String("upgrade", u.String()), 219 zap.String("final", final.Version)) 220 } 221 222 // can upgrade to final version directly. 223 if final.CanDirectUpgrade(latest) { 224 append(final) 225 } else { 226 for _, v := range s.handles { 227 if versions.Compare(v.Metadata().Version, from) > 0 && 228 v.Metadata().CanDirectUpgrade(from) { 229 append(v.Metadata()) 230 from = v.Metadata().Version 231 } 232 } 233 } 234 return versions.AddVersionUpgrades(upgrades, txn) 235 } 236 237 // step 1 238 if versionAdded, err := checker(); err != nil || versionAdded { 239 return err 240 } 241 242 // step 2 243 return addUpgradesToFinalVersion() 244 }, 245 opts) 246 } 247 248 // asyncUpgradeTask is a task that executes the upgrade logic step by step 249 // according to the created upgrade steps 250 func (s *service) asyncUpgradeTask(ctx context.Context) { 251 fn := func() (bool, error) { 252 ctx, cancel := context.WithTimeout(ctx, time.Hour*24) 253 defer cancel() 254 255 var err error 256 var completed bool 257 opts := executor.Options{}. 258 WithDatabase(catalog.MO_CATALOG). 259 WithMinCommittedTS(s.now()). 260 WithWaitCommittedLogApplied(). 261 WithTimeZone(time.Local) 262 err = s.exec.ExecTxn( 263 ctx, 264 func(txn executor.TxnExecutor) error { 265 completed, err = s.performUpgrade(ctx, txn) 266 return err 267 }, 268 opts) 269 return completed, err 270 } 271 272 timer := time.NewTimer(s.upgrade.checkUpgradeDuration) 273 defer timer.Stop() 274 275 defer func() { 276 getUpgradeLogger().Info("upgrade task exit", 277 zap.String("final", s.getFinalVersionHandle().Metadata().Version)) 278 }() 279 280 for { 281 select { 282 case <-ctx.Done(): 283 return 284 case <-timer.C: 285 if s.upgrade.finalVersionCompleted.Load() { 286 return 287 } 288 289 completed, err := fn() 290 if err == nil && completed { 291 s.upgrade.finalVersionCompleted.Store(true) 292 return 293 } 294 timer.Reset(s.upgrade.checkUpgradeDuration) 295 } 296 } 297 } 298 299 func (s *service) performUpgrade( 300 ctx context.Context, 301 txn executor.TxnExecutor) (bool, error) { 302 final := s.getFinalVersionHandle().Metadata() 303 304 // make sure only one cn can execute upgrade logic 305 state, ok, err := versions.GetVersionState(final.Version, final.VersionOffset, txn, true) 306 if err != nil { 307 getUpgradeLogger().Error("failed to load final version state", 308 zap.String("final", final.Version), 309 zap.Int32("versionOffset", int32(final.VersionOffset)), 310 zap.Error(err)) 311 return false, err 312 } 313 if !ok { 314 getUpgradeLogger().Info("final version not found, retry later", 315 zap.String("final", final.Version), 316 zap.Int32("versionOffset", int32(final.VersionOffset))) 317 return false, nil 318 } 319 320 getUpgradeLogger().Info("final version state loaded", 321 zap.String("final", final.Version), 322 zap.Int32("versionOffset", int32(final.VersionOffset)), 323 zap.Int32("state", state)) 324 325 if state == versions.StateReady { 326 return true, nil 327 } 328 329 // get upgrade steps, and perform upgrade one by one 330 upgrades, err := versions.GetUpgradeVersions(final.Version, final.VersionOffset, txn, true, true) 331 if err != nil { 332 getUpgradeLogger().Error("failed to load upgrades", 333 zap.String("final", final.Version), 334 zap.Error(err)) 335 return false, err 336 } 337 338 for _, u := range upgrades { 339 getUpgradeLogger().Info("handle version upgrade", 340 zap.String("upgrade", u.String())) 341 342 state, err := s.doUpgrade(ctx, u, txn) 343 if err != nil { 344 getUpgradeLogger().Error("failed to handle version upgrade", 345 zap.String("upgrade", u.String()), 346 zap.String("final", final.Version), 347 zap.Error(err)) 348 return false, err 349 } 350 351 switch state { 352 case versions.StateReady: 353 // upgrade was completed 354 getUpgradeLogger().Info("upgrade version completed", 355 zap.String("upgrade", u.String()), 356 zap.String("final", final.Version)) 357 case versions.StateUpgradingTenant: 358 // we must wait all tenant upgrade completed, and then upgrade to 359 // next version 360 getUpgradeLogger().Info("upgrade version in tenant upgrading", 361 zap.String("upgrade", u.String()), 362 zap.String("final", final.Version)) 363 return false, nil 364 default: 365 panic(fmt.Sprintf("BUG: invalid state %d", state)) 366 } 367 } 368 369 // all upgrades completed, update final version to ready state. 370 if err := versions.UpdateVersionState(final.Version, final.VersionOffset, versions.StateReady, txn); err != nil { 371 getUpgradeLogger().Error("failed to update state", 372 zap.String("final", final.Version), 373 zap.Error(err)) 374 375 return false, err 376 } 377 378 getUpgradeLogger().Info("upgrade to final version completed", 379 zap.String("final", final.Version)) 380 return true, nil 381 } 382 383 // doUpgrade Corresponding to one upgrade step in a version upgrade 384 func (s *service) doUpgrade( 385 ctx context.Context, 386 upgrade versions.VersionUpgrade, 387 txn executor.TxnExecutor) (int32, error) { 388 if upgrade.State == versions.StateReady { 389 return upgrade.State, nil 390 } 391 392 if (upgrade.UpgradeCluster == versions.No && upgrade.UpgradeTenant == versions.No) || 393 (upgrade.State == versions.StateUpgradingTenant && upgrade.TotalTenant == upgrade.ReadyTenant) { 394 if err := versions.UpdateVersionUpgradeState(upgrade, versions.StateReady, txn); err != nil { 395 return 0, err 396 } 397 return versions.StateReady, nil 398 } 399 400 if upgrade.State == versions.StateUpgradingTenant { 401 return upgrade.State, nil 402 } 403 404 state := versions.StateReady 405 h := s.getVersionHandle(upgrade.ToVersion) 406 407 getUpgradeLogger().Info("execute upgrade prepare", 408 zap.String("upgrade", upgrade.String())) 409 if err := h.Prepare(ctx, txn, h.Metadata().Version == s.getFinalVersionHandle().Metadata().Version); err != nil { 410 return 0, err 411 } 412 getUpgradeLogger().Info("execute upgrade prepare completed", 413 zap.String("upgrade", upgrade.String())) 414 415 if upgrade.UpgradeCluster == versions.Yes { 416 getUpgradeLogger().Info("execute upgrade cluster", 417 zap.String("upgrade", upgrade.String())) 418 if err := h.HandleClusterUpgrade(ctx, txn); err != nil { 419 return 0, err 420 } 421 getUpgradeLogger().Info("execute upgrade cluster completed", 422 zap.String("upgrade", upgrade.String())) 423 } 424 425 if upgrade.UpgradeTenant == versions.Yes { 426 state = versions.StateUpgradingTenant 427 err := fetchTenants( 428 s.upgrade.upgradeTenantBatch, 429 func(ids []int32) error { 430 upgrade.TotalTenant += int32(len(ids)) 431 getUpgradeLogger().Info("add tenants to upgrade", 432 zap.String("upgrade", upgrade.String()), 433 zap.Int32("from", ids[0]), 434 zap.Int32("to", ids[len(ids)-1])) 435 return versions.AddUpgradeTenantTask(upgrade.ID, upgrade.ToVersion, ids[0], ids[len(ids)-1], txn) 436 }, 437 txn) 438 if err != nil { 439 return 0, err 440 } 441 if err := versions.UpdateVersionUpgradeTasks(upgrade, txn); err != nil { 442 return 0, err 443 } 444 getUpgradeLogger().Info("upgrade tenants task updated", 445 zap.String("upgrade", upgrade.String())) 446 if upgrade.TotalTenant == upgrade.ReadyTenant { 447 state = versions.StateReady 448 } 449 } 450 451 getUpgradeLogger().Info("upgrade update state", 452 zap.String("upgrade", upgrade.String()), 453 zap.Int32("state", state)) 454 return state, versions.UpdateVersionUpgradeState(upgrade, state, txn) 455 } 456 457 func retryRun( 458 ctx context.Context, 459 name string, 460 fn func(ctx context.Context) error) error { 461 wait := time.Second 462 maxWait := time.Second * 10 463 for { 464 err := fn(ctx) 465 if err == nil { 466 return nil 467 } 468 getUpgradeLogger().Error("execute task failed, retry later", 469 zap.String("task", name), 470 zap.Duration("wait", wait), 471 zap.Error(err)) 472 time.Sleep(wait) 473 wait *= 2 474 if wait > maxWait { 475 wait = maxWait 476 } 477 select { 478 case <-ctx.Done(): 479 return ctx.Err() 480 default: 481 } 482 } 483 } 484 485 func (s *service) adjustUpgrade() { 486 if s.upgrade.upgradeTenantBatch == 0 { 487 s.upgrade.upgradeTenantBatch = defaultUpgradeTenantBatch 488 } 489 if s.upgrade.checkUpgradeDuration == 0 { 490 s.upgrade.checkUpgradeDuration = defaultCheckUpgradeDuration 491 } 492 if s.upgrade.checkUpgradeTenantDuration == 0 { 493 s.upgrade.checkUpgradeTenantDuration = defaultCheckUpgradeTenantDuration 494 } 495 if s.upgrade.upgradeTenantTasks == 0 { 496 s.upgrade.upgradeTenantTasks = defaultUpgradeTenantTasks 497 } 498 getUpgradeLogger().Info("upgrade config", 499 zap.Duration("check-upgrade-duration", s.upgrade.checkUpgradeDuration), 500 zap.Duration("check-upgrade-tenant-duration", s.upgrade.checkUpgradeTenantDuration), 501 zap.Int("upgrade-tenant-tasks", s.upgrade.upgradeTenantTasks), 502 zap.Int("tenant-batch", s.upgrade.upgradeTenantBatch)) 503 } 504 505 // createFrameworkTables When init upgrade framework for the first time, 506 // create the tables that the upgrade framework depends on 507 func createFrameworkTables( 508 txn executor.TxnExecutor, 509 final versions.Version) error { 510 values := versions.FrameworkInitSQLs 511 values = append(values, final.GetInitVersionSQL(versions.StateReady)) 512 513 for _, sql := range values { 514 r, err := txn.Exec(sql, executor.StatementOption{}) 515 if err != nil { 516 return err 517 } 518 r.Close() 519 } 520 return nil 521 }