github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sqlmigrations/migrations.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package sqlmigrations

import (
	"context"
	"fmt"
	"sort"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/clusterversion"
	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/security"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql"
	"github.com/cockroachdb/cockroach/pkg/sql/catalog/catalogkv"
	"github.com/cockroachdb/cockroach/pkg/sql/catalog/descs"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sqlmigrations/leasemanager"
	"github.com/cockroachdb/cockroach/pkg/util/encoding"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/logtags"
)

var (
	leaseDuration        = time.Minute
	leaseRefreshInterval = leaseDuration / 5
)

// MigrationManagerTestingKnobs contains testing knobs.
type MigrationManagerTestingKnobs struct {
	// DisableBackfillMigrations stops applying migrations once
	// a migration with 'doesBackfill == true' is encountered.
	// TODO(mberhault): we could skip only backfill migrations and dependencies
	// if we had some concept of migration dependencies.
	DisableBackfillMigrations bool
	AfterJobMigration         func()
	// AlwaysRunJobMigration controls whether to always run the schema change job
	// migration regardless of whether it has been marked as complete.
	AlwaysRunJobMigration bool
}

// ModuleTestingKnobs is part of the base.ModuleTestingKnobs interface.
func (*MigrationManagerTestingKnobs) ModuleTestingKnobs() {}

var _ base.ModuleTestingKnobs = &MigrationManagerTestingKnobs{}

// backwardCompatibleMigrations is a hard-coded list of migrations to be run on
// startup. They will always be run from top-to-bottom, and because they are
// assumed to be backward-compatible, they will be run regardless of what other
// node versions are currently running within the cluster.
// Migrations must be idempotent: a migration may run successfully but not be
// recorded as completed, causing a second run.
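// Completion is recorded as a KV entry under the codec's migration key prefix
// (see migrationKey and getCompletedMigrations below).
//
// As a purely hypothetical illustration (the identifiers below do not exist in
// this package), a migration that creates a new system table is registered as
// an entry of the form:
//
//	{
//		// Introduced in v20.2.
//		name:                "create system.foo table",
//		workFn:              createFooTable,
//		includedInBootstrap: clusterversion.VersionByKey(clusterversion.VersionFooTable),
//		newDescriptorIDs:    staticIDs(keys.FooTableID),
//	}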
//
// Attention: If a migration is creating new tables, it should also be added to
// the metadata schema written by bootstrap (see addSystemDatabaseToSchema())
// and it should have the includedInBootstrap field set (see comments on that
// field too).
var backwardCompatibleMigrations = []migrationDescriptor{
	{
		// Introduced in v1.0. Baked into v2.0.
		name: "default UniqueID to uuid_v4 in system.eventlog",
	},
	{
		// Introduced in v1.0. Baked into v2.0.
		name: "create system.jobs table",
	},
	{
		// Introduced in v1.0. Baked into v2.0.
		name: "create system.settings table",
	},
	{
		// Introduced in v1.0. Permanent migration.
		name:        "enable diagnostics reporting",
		workFn:      optInToDiagnosticsStatReporting,
		clusterWide: true,
	},
	{
		// Introduced in v1.1. Baked into v2.0.
		name: "establish conservative dependencies for views #17280 #17269 #17306",
	},
	{
		// Introduced in v1.1. Baked into v2.0.
		name: "create system.sessions table",
	},
	{
		// Introduced in v1.1. Permanent migration.
		name:        "populate initial version cluster setting table entry",
		workFn:      populateVersionSetting,
		clusterWide: true,
	},
	{
		// Introduced in v2.0. Baked into v2.1.
		name:             "create system.table_statistics table",
		newDescriptorIDs: staticIDs(keys.TableStatisticsTableID),
	},
	{
		// Introduced in v2.0. Permanent migration.
		name:   "add root user",
		workFn: addRootUser,
	},
	{
		// Introduced in v2.0. Baked into v2.1.
		name:             "create system.locations table",
		newDescriptorIDs: staticIDs(keys.LocationsTableID),
	},
	{
		// Introduced in v2.0. Baked into v2.1.
		name: "add default .meta and .liveness zone configs",
	},
	{
		// Introduced in v2.0. Baked into v2.1.
		name:             "create system.role_members table",
		newDescriptorIDs: staticIDs(keys.RoleMembersTableID),
	},
	{
		// Introduced in v2.0. Permanent migration.
		name:   "add system.users isRole column and create admin role",
		workFn: addAdminRole,
	},
	{
		// Introduced in v2.0, replaced by "ensure admin role privileges in all descriptors"
		name: "grant superuser privileges on all objects to the admin role",
	},
	{
		// Introduced in v2.0. Permanent migration.
		name:   "make root a member of the admin role",
		workFn: addRootToAdminRole,
	},
	{
		// Introduced in v2.0. Baked into v2.1.
		name: "upgrade table descs to interleaved format version",
	},
	{
		// Introduced in v2.0 alphas then folded into `retiredSettings`.
		name: "remove cluster setting `kv.gc.batch_size`",
	},
	{
		// Introduced in v2.0 alphas then folded into `retiredSettings`.
		name: "remove cluster setting `kv.transaction.max_intents`",
	},
	{
		// Introduced in v2.0. Baked into v2.1.
		name: "add default system.jobs zone config",
	},
	{
		// Introduced in v2.0. Permanent migration.
		name:        "initialize cluster.secret",
		workFn:      initializeClusterSecret,
		clusterWide: true,
	},
	{
		// Introduced in v2.0. Repeated in v2.1 below.
		name: "ensure admin role privileges in all descriptors",
	},
	{
		// Introduced in v2.1, repeat of 2.0 migration to catch mixed-version issues.
		// TODO(mberhault): bake into v19.1.
		name: "repeat: ensure admin role privileges in all descriptors",
	},
	{
		// Introduced in v2.1.
		// TODO(mberhault): bake into v19.1.
		name:   "disallow public user or role name",
		workFn: disallowPublicUserOrRole,
	},
	{
		// Introduced in v2.1.
		// TODO(knz): bake this migration into v19.1.
		name:             "create default databases",
		workFn:           createDefaultDbs,
		newDescriptorIDs: databaseIDs(sqlbase.DefaultDatabaseName, sqlbase.PgDatabaseName),
	},
	{
		// Introduced in v2.1. Baked into 20.1.
		name: "add progress to system.jobs",
	},
	{
		// Introduced in v19.1.
		// TODO(knz): bake this migration into v19.2
		name:   "create system.comment table",
		workFn: createCommentTable,
		// This migration has been introduced some time before 19.2.
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.Version19_2),
		newDescriptorIDs:    staticIDs(keys.CommentsTableID),
	},
	{
		name:   "create system.replication_constraint_stats table",
		workFn: createReplicationConstraintStatsTable,
		// This migration has been introduced some time before 19.2.
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.Version19_2),
		newDescriptorIDs:    staticIDs(keys.ReplicationConstraintStatsTableID),
	},
	{
		name:   "create system.replication_critical_localities table",
		workFn: createReplicationCriticalLocalitiesTable,
		// This migration has been introduced some time before 19.2.
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.Version19_2),
		newDescriptorIDs:    staticIDs(keys.ReplicationCriticalLocalitiesTableID),
	},
	{
		name:   "create system.reports_meta table",
		workFn: createReportsMetaTable,
		// This migration has been introduced some time before 19.2.
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.Version19_2),
		newDescriptorIDs:    staticIDs(keys.ReportsMetaTableID),
	},
	{
		name:   "create system.replication_stats table",
		workFn: createReplicationStatsTable,
		// This migration has been introduced some time before 19.2.
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.Version19_2),
		newDescriptorIDs:    staticIDs(keys.ReplicationStatsTableID),
	},
	{
		// Introduced in v19.1.
		// TODO(knz): bake this migration into v19.2.
		name:        "propagate the ts purge interval to the new setting names",
		workFn:      retireOldTsPurgeIntervalSettings,
		clusterWide: true,
	},
	{
		// Introduced in v19.2.
		name:   "update system.locations with default location data",
		workFn: updateSystemLocationData,
	},
	{
		// Introduced in v19.2, baked into v20.1.
		name:                "change reports fields from timestamp to timestamptz",
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.Version19_2),
	},
	{
		// Introduced in v20.1.
		// TODO(ajwerner): Bake this migration into v20.2.
		name:                "create system.protected_ts_meta table",
		workFn:              createProtectedTimestampsMetaTable,
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.VersionProtectedTimestamps),
		newDescriptorIDs:    staticIDs(keys.ProtectedTimestampsMetaTableID),
	},
	{
		// Introduced in v20.1.
		// TODO(ajwerner): Bake this migration into v20.2.
		name:                "create system.protected_ts_records table",
		workFn:              createProtectedTimestampsRecordsTable,
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.VersionProtectedTimestamps),
		newDescriptorIDs:    staticIDs(keys.ProtectedTimestampsRecordsTableID),
	},
	{
		// Introduced in v20.1.
		// Note that this migration
		// has v2 appended to it because in 20.1 betas, the migration edited the old
		// system.namespace descriptor to change its Name. This wrought havoc,
		// causing #47167, which caused 19.2 nodes to fail to be able to read
		// system.namespace from SQL queries. However, without changing the old
		// descriptor's Name, backup would fail, since backup requires that no two
		// descriptors have the same Name. So, in v2 of this migration, we edit
		// the name of the new table's Descriptor, calling it
		// namespace2, and re-edit the old descriptor's Name to
		// be just "namespace" again, to try to help clusters that might have
		// upgraded to the 20.1 betas with the problem.
		name:                "create new system.namespace table v2",
		workFn:              createNewSystemNamespaceDescriptor,
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.VersionNamespaceTableWithSchemas),
		newDescriptorIDs:    staticIDs(keys.NamespaceTableID),
	},
	{
		// Introduced in v20.1. Replaced in v20.1.1 and v20.2 by the
		// StartSystemNamespaceMigration post-finalization-style migration.
		name: "migrate system.namespace_deprecated entries into system.namespace",
		// workFn: migrateSystemNamespace,
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.VersionNamespaceTableWithSchemas),
	},
	{
		// Introduced in v20.1.
		name:                "create system.role_options table",
		workFn:              createRoleOptionsTable,
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.VersionCreateRolePrivilege),
		newDescriptorIDs:    staticIDs(keys.RoleOptionsTableID),
	},
	{
		// Introduced in v20.1.
		// TODO(andrei): Bake this migration into v20.2.
		name: "create statement_diagnostics_requests, statement_diagnostics and " +
			"system.statement_bundle_chunks tables",
		workFn:              createStatementInfoSystemTables,
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.VersionStatementDiagnosticsSystemTables),
		newDescriptorIDs: staticIDs(keys.StatementBundleChunksTableID,
			keys.StatementDiagnosticsRequestsTableID, keys.StatementDiagnosticsTableID),
	},
	{
		// Introduced in v20.1.
		name:                "remove public permissions on system.comments",
		includedInBootstrap: clusterversion.VersionByKey(clusterversion.VersionSchemaChangeJob),
		workFn:              depublicizeSystemComments,
	},
	{
		// Introduced in v20.1.
		name:   "add CREATEROLE privilege to admin/root",
		workFn: addCreateRoleToAdminAndRoot,
	},
	{
		// Introduced in v20.2.
		name:   "add created_by columns to system.jobs",
		workFn: alterSystemJobsAddCreatedByColumns,
		includedInBootstrap: clusterversion.VersionByKey(
			clusterversion.VersionAlterSystemJobsAddCreatedByColumns),
	},
}

func staticIDs(
	ids ...sqlbase.ID,
) func(ctx context.Context, db db, codec keys.SQLCodec) ([]sqlbase.ID, error) {
	return func(ctx context.Context, db db, codec keys.SQLCodec) ([]sqlbase.ID, error) { return ids, nil }
}

func databaseIDs(
	names ...string,
) func(ctx context.Context, db db, codec keys.SQLCodec) ([]sqlbase.ID, error) {
	return func(ctx context.Context, db db, codec keys.SQLCodec) ([]sqlbase.ID, error) {
		var ids []sqlbase.ID
		for _, name := range names {
			// This runs as part of an older migration (introduced in 2.1). We use
			// the DeprecatedDatabaseKey, and let the 20.1 migration handle moving
			// from the old namespace table into the new one.
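			// The value stored at the deprecated database key is the database
			// descriptor's ID, which is all we need here.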
			kv, err := db.Get(ctx, sqlbase.NewDeprecatedDatabaseKey(name).Key(codec))
			if err != nil {
				return nil, err
			}
			ids = append(ids, sqlbase.ID(kv.ValueInt()))
		}
		return ids, nil
	}
}

// migrationDescriptor describes a single migration hook that's used to modify
// some part of the cluster state when the CockroachDB version is upgraded.
// See docs/RFCs/cluster_upgrade_tool.md for details.
type migrationDescriptor struct {
	// name must be unique amongst all hard-coded migrations.
	// ATTENTION: A migration's name can never be changed. It is included in a key
	// marking a migration as completed.
	name string
	// workFn must be idempotent so that we can safely re-run it if a node failed
	// while running it. nil if the migration has been "baked in" and is no
	// longer to be performed at cluster startup.
	workFn func(context.Context, runner) error
	// includedInBootstrap is set for migrations that need to be performed for
	// updating old clusters, but are also covered by the MetadataSchema that gets
	// created by hand for a new cluster when it bootstraps itself. This kind of
	// duplication between a migration and the MetadataSchema is useful for
	// migrations that create system descriptors - for new clusters (particularly
	// for tests) we want to create these tables by hand so that a corresponding
	// range is created at bootstrap time. Otherwise, we'd have the split queue
	// asynchronously creating some ranges which is annoying for tests.
	//
	// Generally when setting this field you'll want to introduce a new cluster
	// version.
	includedInBootstrap roachpb.Version
	// doesBackfill should be set to true if the migration triggers a backfill.
	doesBackfill bool
	// clusterWide migrations are only run by the system tenant. All other
	// migrations are run by each individual tenant. clusterWide migrations
	// typically have to do with cluster settings, which is a cluster-wide
	// concept.
	clusterWide bool
	// newDescriptorIDs is a function that returns the IDs of any additional
	// descriptors that were added by this migration. This is needed to automate
	// certain tests, which check the number of ranges/descriptors present on
	// server bootup.
	newDescriptorIDs func(ctx context.Context, db db, codec keys.SQLCodec) ([]sqlbase.ID, error)
}

func init() {
	// Ensure that all migrations have unique names.
	names := make(map[string]struct{}, len(backwardCompatibleMigrations))
	for _, migration := range backwardCompatibleMigrations {
		name := migration.name
		if _, ok := names[name]; ok {
			log.Fatalf(context.Background(), "duplicate sql migration %q", name)
		}
		names[name] = struct{}{}
	}
}

type runner struct {
	db          db
	codec       keys.SQLCodec
	sqlExecutor *sql.InternalExecutor
	settings    *cluster.Settings
}

func (r runner) execAsRoot(ctx context.Context, opName, stmt string, qargs ...interface{}) error {
	_, err := r.sqlExecutor.ExecEx(ctx, opName, nil, /* txn */
		sqlbase.InternalExecutorSessionDataOverride{
			User: security.RootUser,
		},
		stmt, qargs...)
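	// Only the error is surfaced to the caller; the affected-row count returned
	// by ExecEx is discarded above.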
	return err
}

func (r runner) execAsRootWithRetry(
	ctx context.Context, opName string, stmt string, qargs ...interface{},
) error {
	// Retry a limited number of times because returning an error and letting
	// the node kill itself is better than holding the migration lease for an
	// arbitrarily long time.
	var err error
	for retry := retry.Start(retry.Options{MaxRetries: 5}); retry.Next(); {
		err = r.execAsRoot(ctx, opName, stmt, qargs...)
		if err == nil {
			break
		}
		log.Warningf(ctx, "failed to run %s: %v", stmt, err)
	}
	return err
}

// leaseManager is defined just to allow us to use a fake client.LeaseManager
// when testing this package.
type leaseManager interface {
	AcquireLease(ctx context.Context, key roachpb.Key) (*leasemanager.Lease, error)
	ExtendLease(ctx context.Context, l *leasemanager.Lease) error
	ReleaseLease(ctx context.Context, l *leasemanager.Lease) error
	TimeRemaining(l *leasemanager.Lease) time.Duration
}

// db is defined just to allow us to use a fake client.DB when testing this
// package.
type db interface {
	Scan(ctx context.Context, begin, end interface{}, maxRows int64) ([]kv.KeyValue, error)
	Get(ctx context.Context, key interface{}) (kv.KeyValue, error)
	Put(ctx context.Context, key, value interface{}) error
	Txn(ctx context.Context, retryable func(ctx context.Context, txn *kv.Txn) error) error
}

// Manager encapsulates the necessary functionality for handling migrations
// of data in the cluster.
type Manager struct {
	stopper      *stop.Stopper
	leaseManager leaseManager
	db           db
	codec        keys.SQLCodec
	sqlExecutor  *sql.InternalExecutor
	testingKnobs MigrationManagerTestingKnobs
	settings     *cluster.Settings
	jobRegistry  *jobs.Registry
}

// NewManager initializes and returns a new Manager object.
func NewManager(
	stopper *stop.Stopper,
	db *kv.DB,
	codec keys.SQLCodec,
	executor *sql.InternalExecutor,
	clock *hlc.Clock,
	testingKnobs MigrationManagerTestingKnobs,
	clientID string,
	settings *cluster.Settings,
	registry *jobs.Registry,
) *Manager {
	opts := leasemanager.Options{
		ClientID:      clientID,
		LeaseDuration: leaseDuration,
	}
	return &Manager{
		stopper:      stopper,
		leaseManager: leasemanager.New(db, clock, opts),
		db:           db,
		codec:        codec,
		sqlExecutor:  executor,
		testingKnobs: testingKnobs,
		settings:     settings,
		jobRegistry:  registry,
	}
}

// ExpectedDescriptorIDs returns the list of all expected system descriptor IDs,
// including those added by completed migrations. This is needed for certain
// tests, which check the number of ranges and system tables at node startup.
//
// NOTE: This value may be out-of-date if another node is actively running
// migrations, and so should only be used in test code where the migration
// lifecycle is tightly controlled.
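// Migrations that set includedInBootstrap are skipped below because their
// descriptors are already accounted for by the bootstrapped metadata schema.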
func ExpectedDescriptorIDs(
	ctx context.Context,
	db db,
	codec keys.SQLCodec,
	defaultZoneConfig *zonepb.ZoneConfig,
	defaultSystemZoneConfig *zonepb.ZoneConfig,
) (sqlbase.IDs, error) {
	completedMigrations, err := getCompletedMigrations(ctx, db, codec)
	if err != nil {
		return nil, err
	}
	descriptorIDs := sqlbase.MakeMetadataSchema(codec, defaultZoneConfig, defaultSystemZoneConfig).DescriptorIDs()
	for _, migration := range backwardCompatibleMigrations {
		// Is the migration not creating descriptors?
		if migration.newDescriptorIDs == nil ||
			// Is the migration included in the metadata schema considered above?
			(migration.includedInBootstrap != roachpb.Version{}) {
			continue
		}
		if _, ok := completedMigrations[string(migrationKey(codec, migration))]; ok {
			newIDs, err := migration.newDescriptorIDs(ctx, db, codec)
			if err != nil {
				return nil, err
			}
			descriptorIDs = append(descriptorIDs, newIDs...)
		}
	}
	sort.Sort(descriptorIDs)
	return descriptorIDs, nil
}

// EnsureMigrations should be run during node startup to ensure that all
// required migrations have been run (and running all those that are definitely
// safe to run).
func (m *Manager) EnsureMigrations(ctx context.Context, bootstrapVersion roachpb.Version) error {
	// First, check whether there are any migrations that need to be run.
	completedMigrations, err := getCompletedMigrations(ctx, m.db, m.codec)
	if err != nil {
		return err
	}
	allMigrationsCompleted := true
	for _, migration := range backwardCompatibleMigrations {
		if !m.shouldRunMigration(migration, bootstrapVersion) {
			continue
		}
		if m.testingKnobs.DisableBackfillMigrations && migration.doesBackfill {
			log.Infof(ctx, "ignoring migrations after (and including) %s due to testing knob",
				migration.name)
			break
		}
		key := migrationKey(m.codec, migration)
		if _, ok := completedMigrations[string(key)]; !ok {
			allMigrationsCompleted = false
		}
	}
	if allMigrationsCompleted {
		return nil
	}

	// If there are any, grab the migration lease to ensure that only one
	// node is ever doing migrations at a time.
	// Note that we shouldn't ever let client.LeaseNotAvailableErrors cause us
	// to stop trying, because if we return an error the server will be shut down,
	// and this server being down may prevent the leaseholder from finishing.
	var lease *leasemanager.Lease
	if log.V(1) {
		log.Info(ctx, "trying to acquire lease")
	}
	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
		lease, err = m.leaseManager.AcquireLease(ctx, m.codec.MigrationLeaseKey())
		if err == nil {
			break
		}
		log.Infof(ctx, "failed attempt to acquire migration lease: %s", err)
	}
	if err != nil {
		return errors.Wrapf(err, "failed to acquire lease for running necessary migrations")
	}

	// Ensure that we hold the lease throughout the migration process and release
	// it when we're done.
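	// done is buffered so that the deferred send below can never block, even if
	// the lease-watcher task has already returned.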
	done := make(chan interface{}, 1)
	defer func() {
		done <- nil
		if log.V(1) {
			log.Info(ctx, "trying to release the lease")
		}
		if err := m.leaseManager.ReleaseLease(ctx, lease); err != nil {
			log.Errorf(ctx, "failed to release migration lease: %s", err)
		}
	}()
	if err := m.stopper.RunAsyncTask(ctx, "migrations.Manager: lease watcher",
		func(ctx context.Context) {
			select {
			case <-done:
				return
			case <-time.After(leaseRefreshInterval):
				if err := m.leaseManager.ExtendLease(ctx, lease); err != nil {
					log.Warningf(ctx, "unable to extend ownership of expiration lease: %s", err)
				}
				if m.leaseManager.TimeRemaining(lease) < leaseRefreshInterval {
					// Do one last final check of whether we're done - it's possible that
					// ReleaseLease can sneak in and execute ahead of ExtendLease even if
					// the ExtendLease started first (making for an unexpected value error),
					// and doing this final check can avoid unintended shutdowns.
					select {
					case <-done:
						return
					default:
						// Note that we may be able to do better than this by influencing the
						// deadline of migrations' transactions based on the lease expiration
						// time, but simply kill the process for now for the sake of simplicity.
						log.Fatal(ctx, "not enough time left on migration lease, terminating for safety")
					}
				}
			}
		}); err != nil {
		return err
	}

	// Re-get the list of migrations in case any of them were completed between
	// our initial check and our grabbing of the lease.
	completedMigrations, err = getCompletedMigrations(ctx, m.db, m.codec)
	if err != nil {
		return err
	}

	startTime := timeutil.Now().String()
	r := runner{
		db:          m.db,
		codec:       m.codec,
		sqlExecutor: m.sqlExecutor,
		settings:    m.settings,
	}
	for _, migration := range backwardCompatibleMigrations {
		if !m.shouldRunMigration(migration, bootstrapVersion) {
			continue
		}

		key := migrationKey(m.codec, migration)
		if _, ok := completedMigrations[string(key)]; ok {
			continue
		}

		if m.testingKnobs.DisableBackfillMigrations && migration.doesBackfill {
			log.Infof(ctx, "ignoring migrations after (and including) %s due to testing knob",
				migration.name)
			break
		}

		if log.V(1) {
			log.Infof(ctx, "running migration %q", migration.name)
		}
		if err := migration.workFn(ctx, r); err != nil {
			return errors.Wrapf(err, "failed to run migration %q", migration.name)
		}

		log.VEventf(ctx, 1, "persisting record of completing migration %s", migration.name)
		if err := m.db.Put(ctx, key, startTime); err != nil {
			return errors.Wrapf(err, "failed to persist record of completing migration %q",
				migration.name)
		}
	}

	return nil
}

func (m *Manager) shouldRunMigration(
	migration migrationDescriptor, bootstrapVersion roachpb.Version,
) bool {
	if migration.workFn == nil {
		// The migration has been baked in.
		return false
	}
	minVersion := migration.includedInBootstrap
	if minVersion != (roachpb.Version{}) && !bootstrapVersion.Less(minVersion) {
		// The migration is unnecessary.
		return false
	}
	if migration.clusterWide && !m.codec.ForSystemTenant() {
		// The migration is a cluster-wide migration and we are not the
		// system tenant.
		return false
	}
	return true
}

var schemaChangeJobMigrationName = "upgrade schema change job format"

func schemaChangeJobMigrationKey(codec keys.SQLCodec) roachpb.Key {
	return append(codec.MigrationKeyPrefix(), roachpb.RKey(schemaChangeJobMigrationName)...)
}

var systemNamespaceMigrationName = "upgrade system.namespace post-20.1-finalization"

func systemNamespaceMigrationKey(codec keys.SQLCodec) roachpb.Key {
	return append(codec.MigrationKeyPrefix(), roachpb.RKey(systemNamespaceMigrationName)...)
}

// schemaChangeJobMigrationKeyForTable returns a key prefixed with
// schemaChangeJobMigrationKey for a specific table, to store the completion
// status for adding a new job if the table was being added or needed to drain
// names.
func schemaChangeJobMigrationKeyForTable(codec keys.SQLCodec, tableID sqlbase.ID) roachpb.Key {
	return encoding.EncodeUint32Ascending(schemaChangeJobMigrationKey(codec), uint32(tableID))
}

// StartSchemaChangeJobMigration starts an async task to run the migration that
// upgrades 19.2-style jobs to the 20.1 job format, so that the jobs can be
// adopted by the job registry. The task first waits until the upgrade to 20.1
// is finalized before running the migration. The migration is retried until
// it succeeds (on any node).
func (m *Manager) StartSchemaChangeJobMigration(ctx context.Context) error {
	migrationKey := schemaChangeJobMigrationKey(m.codec)
	return m.stopper.RunAsyncTask(ctx, "run-schema-change-job-migration", func(ctx context.Context) {
		log.Info(ctx, "starting wait for upgrade finalization before schema change job migration")
		// First wait for the cluster to finalize the upgrade to 20.1. These values
		// were chosen to be similar to the retry loop for finalizing the cluster
		// upgrade.
		waitRetryOpts := retry.Options{
			InitialBackoff: 10 * time.Second,
			MaxBackoff:     10 * time.Second,
			Closer:         m.stopper.ShouldQuiesce(),
		}
		for retry := retry.StartWithCtx(ctx, waitRetryOpts); retry.Next(); {
			if m.settings.Version.IsActive(ctx, clusterversion.VersionSchemaChangeJob) {
				break
			}
		}
		select {
		case <-m.stopper.ShouldQuiesce():
			return
		default:
		}
		log.VEventf(ctx, 2, "detected upgrade finalization")

		if !m.testingKnobs.AlwaysRunJobMigration {
			// Check whether this migration has already been completed.
			if kv, err := m.db.Get(ctx, migrationKey); err != nil {
				log.Infof(ctx, "error getting record of schema change job migration: %s", err.Error())
			} else if kv.Exists() {
				log.Infof(ctx, "schema change job migration already complete")
				return
			}
		}

		// Now run the migration. This is retried indefinitely until it finishes.
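		// Both retry loops in this task use the stopper's quiescence channel as
		// their Closer, so they exit cleanly on server shutdown.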
		log.Infof(ctx, "starting schema change job migration")
		r := runner{
			db:          m.db,
			codec:       m.codec,
			sqlExecutor: m.sqlExecutor,
			settings:    m.settings,
		}
		migrationRetryOpts := retry.Options{
			InitialBackoff: 1 * time.Minute,
			MaxBackoff:     10 * time.Minute,
			Closer:         m.stopper.ShouldQuiesce(),
		}
		startTime := timeutil.Now().String()
		for migRetry := retry.Start(migrationRetryOpts); migRetry.Next(); {
			migrateCtx, _ := m.stopper.WithCancelOnQuiesce(context.Background())
			migrateCtx = logtags.AddTag(migrateCtx, "schema-change-job-migration", nil)
			if err := migrateSchemaChangeJobs(migrateCtx, r, m.jobRegistry); err != nil {
				log.Errorf(ctx, "error attempting running schema change job migration, will retry: %s %s", err.Error(), startTime)
				continue
			}
			log.Infof(ctx, "schema change job migration completed")
			if err := m.db.Put(ctx, migrationKey, startTime); err != nil {
				log.Warningf(ctx, "error persisting record of schema change job migration, will retry: %s", err.Error())
			}
			break
		}
		if fn := m.testingKnobs.AfterJobMigration; fn != nil {
			fn()
		}
	})
}

var systemNamespaceMigrationEnabled = settings.RegisterBoolSetting(
	"testing.system_namespace_migration.enabled",
	"internal testing only: disable the system namespace migration",
	true,
)

// StartSystemNamespaceMigration starts an async task to run the migration that
// migrates entries from system.namespace (descriptor 2) to system.namespace2
// (descriptor 30). The task first waits until the upgrade to 20.1 is finalized
// before running the migration. The migration is retried until it succeeds (on
// any node).
func (m *Manager) StartSystemNamespaceMigration(
	ctx context.Context, bootstrapVersion roachpb.Version,
) error {
	if !bootstrapVersion.Less(clusterversion.VersionByKey(clusterversion.VersionNamespaceTableWithSchemas)) {
		// Our bootstrap version is equal to or greater than 20.1, where no old
		// namespace table is created: we can skip this migration.
		return nil
	}
	return m.stopper.RunAsyncTask(ctx, "run-system-namespace-migration", func(ctx context.Context) {
		log.Info(ctx, "starting wait for upgrade finalization before system.namespace migration")
		// First wait for the cluster to finalize the upgrade to 20.1. These values
		// were chosen to be similar to the retry loop for finalizing the cluster
		// upgrade.
		waitRetryOpts := retry.Options{
			InitialBackoff: 10 * time.Second,
			MaxBackoff:     10 * time.Second,
			Closer:         m.stopper.ShouldQuiesce(),
		}
		for retry := retry.StartWithCtx(ctx, waitRetryOpts); retry.Next(); {
			if !systemNamespaceMigrationEnabled.Get(&m.settings.SV) {
				continue
			}
			if m.settings.Version.IsActive(ctx, clusterversion.VersionNamespaceTableWithSchemas) {
				break
			}
		}
		select {
		case <-m.stopper.ShouldQuiesce():
			return
		default:
		}
		log.VEventf(ctx, 2, "detected upgrade finalization for system.namespace migration")

		migrationKey := systemNamespaceMigrationKey(m.codec)
		// Check whether this migration has already been completed.
		if kv, err := m.db.Get(ctx, migrationKey); err != nil {
			log.Infof(ctx, "error getting record of system.namespace migration: %s", err.Error())
		} else if kv.Exists() {
			log.Infof(ctx, "system.namespace migration already complete")
			return
		}

		// Now run the migration. This is retried indefinitely until it finishes.
		log.Infof(ctx, "starting system.namespace migration")
		r := runner{
			db:          m.db,
			codec:       m.codec,
			sqlExecutor: m.sqlExecutor,
			settings:    m.settings,
		}
		migrationRetryOpts := retry.Options{
			InitialBackoff: 1 * time.Minute,
			MaxBackoff:     10 * time.Minute,
			Closer:         m.stopper.ShouldQuiesce(),
		}
		startTime := timeutil.Now().String()
		for migRetry := retry.Start(migrationRetryOpts); migRetry.Next(); {
			if err := m.migrateSystemNamespace(ctx, migrationKey, r, startTime); err != nil {
				log.Errorf(ctx, "error attempting running system.namespace migration, will retry: %s %s", err.Error(),
					startTime)
				continue
			}
			break
		}
	})
}

// migrateSystemNamespace migrates entries from the deprecated system.namespace
// table to the new one, which includes a parentSchemaID column. Each database
// entry is copied to the new table along with a corresponding entry for the
// 'public' schema. Each table entry is copied over with the public schema as
// its parentSchemaID.
//
// Only entries that do not exist in the new table are copied.
//
// New database and table entries continue to be written to the deprecated
// namespace table until VersionNamespaceTableWithSchemas is active. This means
// that an additional migration will be necessary in 20.2 to catch any new
// entries which may have been missed by this one. In the meantime, namespace
// lookups fall back to the deprecated table if a name is not found in the new
// one.
func (m *Manager) migrateSystemNamespace(
	ctx context.Context, migrationKey roachpb.Key, r runner, startTime string,
) error {
	migrateCtx, cancel := m.stopper.WithCancelOnQuiesce(ctx)
	defer cancel()
	migrateCtx = logtags.AddTag(migrateCtx, "system-namespace-migration", nil)
	// Loop until there's no more work to be done.
	workLeft := true
	for workLeft {
		if err := m.db.Txn(migrateCtx, func(ctx context.Context, txn *kv.Txn) error {
			// Check again to see if someone else wrote the migration key.
			if kv, err := txn.Get(ctx, migrationKey); err != nil {
				log.Infof(ctx, "error getting record of system.namespace migration: %s", err.Error())
				// Retry the migration.
				return err
			} else if kv.Exists() {
				// Give up, no work to be done.
				log.Infof(ctx, "system.namespace migration already complete")
				return nil
			}
			// Fetch all entries that are not present in the new namespace table. Each
			// of these entries will be copied to the new table.
			//
			// Note that we are very careful to always delete from both namespace tables
			// in 20.1, so there's no possibility that we'll be overwriting a deleted
			// table that existed in the old table and the new table but was deleted
			// from only the new table.
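			// Copy the entries over in batches: fetching batchSize+1 rows lets the
			// loop below detect whether another batch remains after this one.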
			const batchSize = 1000
			q := fmt.Sprintf(
				`SELECT "parentID", name, id FROM [%d AS namespace_deprecated]
              WHERE id NOT IN (SELECT id FROM [%d AS namespace]) LIMIT %d`,
				sqlbase.DeprecatedNamespaceTable.ID, sqlbase.NamespaceTable.ID, batchSize+1)
			rows, err := r.sqlExecutor.QueryEx(
				ctx, "read-deprecated-namespace-table", txn,
				sqlbase.InternalExecutorSessionDataOverride{
					User: security.RootUser,
				},
				q)
			if err != nil {
				return err
			}
			log.Infof(ctx, "Migrating system.namespace chunk with %d rows", len(rows))
			for i, row := range rows {
				workLeft = false
				// We found some rows from the query, which means that we can't quit
				// just yet.
				if i >= batchSize {
					workLeft = true
					// Just process 1000 rows at a time.
					break
				}
				parentID := sqlbase.ID(tree.MustBeDInt(row[0]))
				name := string(tree.MustBeDString(row[1]))
				id := sqlbase.ID(tree.MustBeDInt(row[2]))
				if parentID == keys.RootNamespaceID {
					// This row represents a database. Add it to the new namespace table.
					databaseKey := sqlbase.NewDatabaseKey(name)
					if err := txn.Put(ctx, databaseKey.Key(r.codec), id); err != nil {
						return err
					}
					// Also create a 'public' schema for this database.
					schemaKey := sqlbase.NewSchemaKey(id, "public")
					log.VEventf(ctx, 2, "Migrating system.namespace entry for database %s", name)
					if err := txn.Put(ctx, schemaKey.Key(r.codec), keys.PublicSchemaID); err != nil {
						return err
					}
				} else {
					// This row represents a table. Add it to the new namespace table with the
					// schema set to 'public'.
					if id == keys.DeprecatedNamespaceTableID {
						// The namespace table itself was already handled in
						// createNewSystemNamespaceDescriptor. Do not overwrite it with the
						// deprecated ID.
						continue
					}
					tableKey := sqlbase.NewTableKey(parentID, keys.PublicSchemaID, name)
					log.VEventf(ctx, 2, "Migrating system.namespace entry for table %s", name)
					if err := txn.Put(ctx, tableKey.Key(r.codec), id); err != nil {
						return err
					}
				}
			}
			return nil
		}); err != nil {
			return err
		}
	}
	// No more work to be done.
	log.Infof(migrateCtx, "system.namespace migration completed")
	if err := m.db.Put(migrateCtx, migrationKey, startTime); err != nil {
		log.Warningf(migrateCtx, "error persisting record of system.namespace migration, will retry: %s", err.Error())
		return err
	}
	return nil
}

// migrateSchemaChangeJobs runs the schema change job migration. The migration
// has two steps. In the first step, we scan the jobs table for all
// non-Succeeded jobs; for each job, we look up the associated table and use
// the table descriptor state to update the job payload appropriately. For jobs
// that are waiting for GC for dropped tables, indexes, etc., we mark the
// existing job as completed and create a new GC job. In the second step, we
// get all the descriptors and all running jobs, and create a new job for all
// tables that are either in the ADD state or have draining names but which
// have no running jobs, since tables in those states in 19.2 would have been
// processed by the schema changer.
func migrateSchemaChangeJobs(ctx context.Context, r runner, registry *jobs.Registry) error {
	// Get all jobs that aren't Succeeded and evaluate whether they need a
	// migration. (Jobs that are canceled in 19.2 could still have in-progress
	// schema changes.)
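	// Each row's payload column is unmarshaled below; jobs whose payload is not
	// a schema change, or whose format version is already past
	// BaseFormatVersion, are skipped.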
	rows, err := r.sqlExecutor.QueryEx(
		ctx, "jobs-for-migration", nil, /* txn */
		sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
		"SELECT id, payload FROM system.jobs WHERE status != $1", jobs.StatusSucceeded,
	)
	if err != nil {
		return err
	}
	for _, row := range rows {
		jobID := int64(tree.MustBeDInt(row[0]))
		log.VEventf(ctx, 2, "job %d: evaluating for schema change job migration", jobID)

		payload, err := jobs.UnmarshalPayload(row[1])
		if err != nil {
			return err
		}
		if details := payload.GetSchemaChange(); details == nil ||
			details.FormatVersion > jobspb.BaseFormatVersion {
			continue
		}

		log.Infof(ctx, "job %d: undergoing schema change job migration", jobID)

		if err := r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
			// Read the job again inside the transaction. If the job was already
			// upgraded, we don't have to do anything else.
			job, err := registry.LoadJobWithTxn(ctx, jobID, txn)
			if err != nil {
				// The job could have been GC'ed in the meantime.
				if jobs.HasJobNotFoundError(err) {
					return nil
				}
				return err
			}
			payload := job.Payload()
			details := payload.GetSchemaChange()
			if details.FormatVersion > jobspb.BaseFormatVersion {
				return nil
			}

			// Determine whether the job is for dropping a database/table. Note that
			// DroppedTables is always populated in 19.2 for all jobs that drop
			// tables.
			if len(details.DroppedTables) > 0 {
				return migrateDropTablesOrDatabaseJob(ctx, txn, r.codec, registry, job)
			}

			descIDs := job.Payload().DescriptorIDs
			// All other jobs have exactly 1 associated descriptor ID (for a table),
			// and correspond to a schema change with a mutation.
			if len(descIDs) != 1 {
				return errors.AssertionFailedf(
					"job %d: could not be migrated due to unexpected descriptor IDs %v", *job.ID(), descIDs)
			}
			descID := descIDs[0]
			tableDesc, err := sqlbase.GetTableDescFromID(ctx, txn, r.codec, descID)
			if err != nil {
				return err
			}
			return migrateMutationJobForTable(ctx, txn, registry, job, tableDesc)
		}); err != nil {
			return err
		}
		log.Infof(ctx, "job %d: completed schema change job migration", jobID)
	}

	// Finally, we iterate through all table descriptors and jobs, and create jobs
	// for any tables in the ADD state or that have draining names that don't
	// already have jobs. We also create a GC job for all tables in the DROP state
	// with no associated schema change or GC job, which can result from failed
	// IMPORT and RESTORE jobs whose table data wasn't fully GC'ed.
	//
	// We start by getting all descriptors and all running jobs in a single
	// transaction. Each eligible table then gets a job created for it, each in a
	// separate transaction; in each of those transactions, we write a table-
	// specific KV with a key prefixed by schemaChangeJobMigrationKey to try to
	// prevent more than one such job from being created for the table.
	//
	// This process ensures that every table that entered into one of these
	// intermediate states (being added/dropped, or having draining names) in 19.2
	// will have a job created for it in 20.1, so that the table can finish being
	// processed.
	// It's not essential for only one job to be created for each
	// table, since a redundant schema change job is a no-op, but we make an
	// effort to do that anyway.
	//
	// There are probably more efficient ways to do this part of the migration,
	// but the current approach seemed like the most straightforward.
	var allDescs []sqlbase.DescriptorProto
	schemaChangeJobsForDesc := make(map[sqlbase.ID][]int64)
	gcJobsForDesc := make(map[sqlbase.ID][]int64)
	if err := r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		descs, err := catalogkv.GetAllDescriptors(ctx, txn, r.codec)
		if err != nil {
			return err
		}
		allDescs = descs

		// Get all running schema change jobs.
		rows, err := r.sqlExecutor.QueryEx(
			ctx, "preexisting-jobs", txn,
			sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
			"SELECT id, payload FROM system.jobs WHERE status = $1", jobs.StatusRunning,
		)
		if err != nil {
			return err
		}
		for _, row := range rows {
			jobID := int64(tree.MustBeDInt(row[0]))
			payload, err := jobs.UnmarshalPayload(row[1])
			if err != nil {
				return err
			}
			if details := payload.GetSchemaChange(); details != nil {
				if details.FormatVersion < jobspb.JobResumerFormatVersion {
					continue
				}
				if details.TableID != sqlbase.InvalidID {
					schemaChangeJobsForDesc[details.TableID] = append(schemaChangeJobsForDesc[details.TableID], jobID)
				} else {
					for _, t := range details.DroppedTables {
						schemaChangeJobsForDesc[t.ID] = append(schemaChangeJobsForDesc[t.ID], jobID)
					}
				}
			} else if details := payload.GetSchemaChangeGC(); details != nil {
				for _, t := range details.Tables {
					gcJobsForDesc[t.ID] = append(gcJobsForDesc[t.ID], jobID)
				}
			}
		}
		return nil
	}); err != nil {
		return err
	}

	createSchemaChangeJobForTable := func(txn *kv.Txn, desc *sqlbase.TableDescriptor) error {
		var description string
		if desc.Adding() {
			description = fmt.Sprintf("adding table %d", desc.ID)
		} else if desc.HasDrainingNames() {
			description = fmt.Sprintf("draining names for table %d", desc.ID)
		} else {
			// This shouldn't be possible, but if it happens, it would be
			// appropriate to do nothing without returning an error.
			log.Warningf(
				ctx,
				"tried to add schema change job for table %d which is neither being added nor has draining names",
				desc.ID,
			)
			return nil
		}
		record := jobs.Record{
			Description:   description,
			Username:      security.NodeUser,
			DescriptorIDs: sqlbase.IDs{desc.ID},
			Details: jobspb.SchemaChangeDetails{
				TableID:       desc.ID,
				FormatVersion: jobspb.JobResumerFormatVersion,
			},
			Progress:      jobspb.SchemaChangeProgress{},
			NonCancelable: true,
		}
		job, err := registry.CreateJobWithTxn(ctx, record, txn)
		if err != nil {
			return err
		}
		log.Infof(ctx, "migration created new schema change job %d: %s", *job.ID(), description)
		return nil
	}

	createGCJobForTable := func(txn *kv.Txn, desc *sqlbase.TableDescriptor) error {
		record := sql.CreateGCJobRecord(
			fmt.Sprintf("table %d", desc.ID),
			security.NodeUser,
			jobspb.SchemaChangeGCDetails{
				Tables: []jobspb.SchemaChangeGCDetails_DroppedID{{ID: desc.ID, DropTime: desc.DropTime}},
			})
		job, err := registry.CreateJobWithTxn(ctx, record, txn)
		if err != nil {
			return err
		}
		log.Infof(ctx, "migration created new GC job %d for table %d", *job.ID(), desc.ID)
		return nil
	}

	log.Infof(ctx, "evaluating tables for creating jobs")
	for _, desc := range allDescs {
		switch desc := desc.(type) {
		case *sqlbase.TableDescriptor:
			if scJobs := schemaChangeJobsForDesc[desc.ID]; len(scJobs) > 0 {
				log.VEventf(ctx, 3, "table %d has running schema change jobs %v, skipping", desc.ID, scJobs)
				continue
			} else if gcJobs := gcJobsForDesc[desc.ID]; len(gcJobs) > 0 {
				log.VEventf(ctx, 3, "table %d has running GC jobs %v, skipping", desc.ID, gcJobs)
				continue
			}
			if !desc.Adding() && !desc.Dropped() && !desc.HasDrainingNames() {
				log.VEventf(ctx, 3,
					"table %d is not being added or dropped and does not have draining names, skipping",
					desc.ID,
				)
				continue
			}

			if err := r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
				key := schemaChangeJobMigrationKeyForTable(r.codec, desc.ID)
				startTime := timeutil.Now().String()
				if kv, err := txn.Get(ctx, key); err != nil {
					return err
				} else if kv.Exists() {
					log.VEventf(ctx, 3, "table %d already processed in migration", desc.ID)
					return nil
				}
				if desc.Adding() || desc.HasDrainingNames() {
					if err := createSchemaChangeJobForTable(txn, desc); err != nil {
						return err
					}
				} else if desc.Dropped() {
					// Note that a table can be both in the DROP state and have draining
					// names. In that case it was enough to just create a schema change
					// job, as in the case above, because that job will itself create a
					// GC job.
					if err := createGCJobForTable(txn, desc); err != nil {
						return err
					}
				}
				if err := txn.Put(ctx, key, startTime); err != nil {
					return err
				}
				return nil
			}); err != nil {
				return err
			}
		case *sqlbase.DatabaseDescriptor:
			// Do nothing.
		}
	}

	return nil
}

// migrateMutationJobForTable handles migrating jobs associated with mutations
// on a table, each of which is stored in MutationJobs. (This includes adding
// and dropping columns, indexes, and constraints, as well as primary key
// changes.) This function also handles jobs for indexes waiting for GC,
// stored in GCMutations.
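// It is called within the per-job transaction opened in migrateSchemaChangeJobs.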
func migrateMutationJobForTable(
	ctx context.Context,
	txn *kv.Txn,
	registry *jobs.Registry,
	job *jobs.Job,
	tableDesc *sqlbase.TableDescriptor,
) error {
	log.VEventf(ctx, 2, "job %d: undergoing migration as mutation job for table %d", *job.ID(), tableDesc.ID)

	// Check whether the job is for a mutation. There can be multiple mutations
	// with the same ID that correspond to the same job, but we just need to
	// look at one, since all the mutations with the same ID get state updates
	// in the same transaction.
	for i := range tableDesc.MutationJobs {
		mutationJob := &tableDesc.MutationJobs[i]
		if mutationJob.JobID != *job.ID() {
			continue
		}
		log.VEventf(
			ctx, 2, "job %d: found corresponding MutationJob %d on table %d",
			*job.ID(), mutationJob.MutationID, tableDesc.ID,
		)
		var mutation *sqlbase.DescriptorMutation
		for i := range tableDesc.Mutations {
			if tableDesc.Mutations[i].MutationID == mutationJob.MutationID {
				mutation = &tableDesc.Mutations[i]
				break
			}
		}
		if mutation == nil {
			// In theory, MutationJobs[i] corresponds to Mutations[i] in 19.2 and
			// earlier versions, so this should never happen. However, we've seen this
			// happen (#48786), so we have to be defensive.
			mutationNotFoundError := errors.AssertionFailedf("mutation %d not found for MutationJob %d",
				mutationJob.MutationID, mutationJob.JobID)
			log.Errorf(ctx, "%v", mutationNotFoundError)
			return registry.Failed(ctx, txn, *job.ID(), mutationNotFoundError)
		}

		// Update the job details and status based on the table descriptor
		// state.
		if err := job.WithTxn(txn).Update(ctx, func(txn *kv.Txn, md jobs.JobMetadata, ju *jobs.JobUpdater) error {
			// Update the job details with the table and mutation IDs.
			details := md.Payload.GetSchemaChange()
			details.TableID = tableDesc.ID
			details.MutationID = mutationJob.MutationID
			details.FormatVersion = jobspb.JobResumerFormatVersion
			md.Payload.Details = jobspb.WrapPayloadDetails(*details)

			log.VEventf(ctx, 2, "job %d: updating details to %+v", *job.ID(), details)

			// Also give the job a non-nil expired lease to indicate that the job
			// is adoptable.
			md.Payload.Lease = &jobspb.Lease{}
			ju.UpdatePayload(md.Payload)

			// If the mutation exists on the table descriptor, then the schema
			// change isn't actually in a terminal state, regardless of what the
			// job status is. So we force the status to Reverting if there's any
			// indication that it failed or was canceled, and otherwise force the
			// state to Running.
			shouldRevert := md.Status == jobs.StatusFailed || md.Status == jobs.StatusCanceled ||
				md.Status == jobs.StatusReverting || md.Status == jobs.StatusCancelRequested
			previousStatus := md.Status
			if mutation.Rollback || shouldRevert {
				md.Status = jobs.StatusReverting
			} else {
				md.Status = jobs.StatusRunning
			}
			log.VEventf(ctx, 2, "job %d: updating status from %s to %s", *job.ID(), previousStatus, md.Status)
			ju.UpdateStatus(md.Status)
			return nil
		}); err != nil {
			return err
		}
		log.Infof(
			ctx, "job %d: successfully migrated for table %d, mutation %d",
			*job.ID(), tableDesc.ID, mutationJob.MutationID,
		)
		return nil
	}

	// If not a mutation, check whether the job corresponds to a GCMutation.
	// This indicates that the job must be in the "waiting for GC TTL" state.
	// In that case, we mark the job as succeeded and create a new job for GC.
	for i := range tableDesc.GCMutations {
		gcMutation := &tableDesc.GCMutations[i]
		// JobID and dropTime are populated only in 19.2 and earlier versions.
		if gcMutation.JobID != *job.ID() {
			continue
		}
		log.VEventf(
			ctx, 2, "job %d: found corresponding index GC mutation for index %d on table %d",
			*job.ID(), gcMutation.IndexID, tableDesc.ID,
		)
		if err := registry.Succeeded(ctx, txn, *job.ID()); err != nil {
			return err
		}
		log.VEventf(ctx, 2, "job %d: marked as succeeded", *job.ID())

		indexGCJobRecord := sql.CreateGCJobRecord(
			job.Payload().Description,
			job.Payload().Username,
			jobspb.SchemaChangeGCDetails{
				Indexes: []jobspb.SchemaChangeGCDetails_DroppedIndex{
					{
						IndexID:  gcMutation.IndexID,
						DropTime: gcMutation.DropTime,
					},
				},
				ParentID: tableDesc.GetID(),
			},
		)
		// The new job ID won't be written to GCMutations, which is fine because
		// we don't read the job ID in 20.1 for anything except this migration.
		newJob, err := registry.CreateJobWithTxn(ctx, indexGCJobRecord, txn)
		if err != nil {
			return err
		}
		log.Infof(ctx,
			"migration marked drop table job %d as successful, created GC job %d",
			*job.ID(), *newJob.ID(),
		)
		return nil
	}

	// If the job isn't in MutationJobs or GCMutations, it's likely just a
	// failed or canceled job that was successfully cleaned up. Check for this,
	// and return an error if this is not the case.
	status, err := job.CurrentStatus(ctx)
	if err != nil {
		return err
	}
	if status == jobs.StatusCanceled || status == jobs.StatusFailed {
		return nil
	}
	return errors.Newf(
		"job %d: no corresponding mutation found on table %d during migration", *job.ID(), tableDesc.ID)
}

// migrateDropTablesOrDatabaseJob handles migrating any jobs that require
// dropping a table, including dropping tables, views, sequences, and
// databases, as well as truncating tables.
func migrateDropTablesOrDatabaseJob(
	ctx context.Context, txn *kv.Txn, codec keys.SQLCodec, registry *jobs.Registry, job *jobs.Job,
) error {
	payload := job.Payload()
	details := payload.GetSchemaChange()
	log.VEventf(ctx, 2,
		"job %d: undergoing migration as drop table/database job for tables %+v",
		*job.ID(), details.DroppedTables,
	)

	if job.Progress().RunningStatus == string(sql.RunningStatusDrainingNames) {
		// If the job is draining names, the schema change job resumer will handle
		// it. Just update the job details.
		if err := job.WithTxn(txn).Update(ctx, func(txn *kv.Txn, md jobs.JobMetadata, ju *jobs.JobUpdater) error {
			if len(details.DroppedTables) == 1 {
				details.TableID = details.DroppedTables[0].ID
			}
			details.FormatVersion = jobspb.JobResumerFormatVersion
			md.Payload.Details = jobspb.WrapPayloadDetails(*details)

			log.VEventf(ctx, 2, "job %d: updating details to %+v", *job.ID(), details)

			// Also give the job a non-nil expired lease to indicate that the job
			// is adoptable.
			md.Payload.Lease = &jobspb.Lease{}
			ju.UpdatePayload(md.Payload)
			return nil
		}); err != nil {
			return err
		}
		log.Infof(ctx, "job %d: successfully migrated in draining names state", *job.ID())
		return nil
	}

	// Otherwise, the job is in the "waiting for GC TTL" phase. In this case, we
	// mark the present job as Succeeded and create a new GC job.

	// TODO (lucy/paul): In the case of multiple tables, is it a problem if some
	// of the tables have already been GC'ed at this point? In 19.2, each table
	// advances separately through the stages of being dropped, so it should be
	// possible for some tables to still be draining names while others have
	// already undergone GC.

	if err := registry.Succeeded(ctx, txn, *job.ID()); err != nil {
		return err
	}
	log.VEventf(ctx, 2, "job %d: marked as succeeded", *job.ID())

	tablesToDrop := make([]jobspb.SchemaChangeGCDetails_DroppedID, len(details.DroppedTables))
	for i := range details.DroppedTables {
		tableID := details.DroppedTables[i].ID
		tablesToDrop[i].ID = details.DroppedTables[i].ID
		desc, err := sqlbase.GetTableDescFromID(ctx, txn, codec, tableID)
		if err != nil {
			return err
		}
		tablesToDrop[i].DropTime = desc.DropTime
	}
	gcJobRecord := sql.CreateGCJobRecord(
		job.Payload().Description,
		job.Payload().Username,
		jobspb.SchemaChangeGCDetails{
			Tables:   tablesToDrop,
			ParentID: details.DroppedDatabaseID,
		},
	)
	// The new job ID won't be written to DropJobID on the table descriptor(s),
	// which is fine because we don't read the job ID in 20.1 for anything
	// except this migration.
	// TODO (lucy): The above is true except for the cleanup loop for orphaned
	// jobs in the registry, which should be fixed.
	newJob, err := registry.CreateJobWithTxn(ctx, gcJobRecord, txn)
	if err != nil {
		return err
	}
	log.Infof(ctx,
		"migration marked drop database/table job %d as successful, created GC job %d",
		*job.ID(), *newJob.ID(),
	)
	return err
}

func getCompletedMigrations(
	ctx context.Context, db db, codec keys.SQLCodec,
) (map[string]struct{}, error) {
	if log.V(1) {
		log.Info(ctx, "trying to get the list of completed migrations")
	}
	prefix := codec.MigrationKeyPrefix()
	keyvals, err := db.Scan(ctx, prefix, prefix.PrefixEnd(), 0 /* maxRows */)
	if err != nil {
		return nil, errors.Wrapf(err, "failed to get list of completed migrations")
	}
	completedMigrations := make(map[string]struct{})
	for _, keyval := range keyvals {
		completedMigrations[string(keyval.Key)] = struct{}{}
	}
	return completedMigrations, nil
}

func migrationKey(codec keys.SQLCodec, migration migrationDescriptor) roachpb.Key {
	return append(codec.MigrationKeyPrefix(), roachpb.RKey(migration.name)...)
}

func createSystemTable(ctx context.Context, r runner, desc sqlbase.TableDescriptor) error {
	// We install the table at the KV layer so that we can choose a known ID in
	// the reserved ID space. (The SQL layer doesn't allow this.)
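	// The conditional puts below fail if the descriptor or name entry already
	// exists, which (together with the error check after the transaction) makes
	// re-running this migration a no-op.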
	err := r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		b := txn.NewBatch()
		tKey := sqlbase.MakePublicTableNameKey(ctx, r.settings, desc.GetParentID(), desc.GetName())
		b.CPut(tKey.Key(r.codec), desc.GetID(), nil)
		b.CPut(sqlbase.MakeDescMetadataKey(r.codec, desc.GetID()), sqlbase.WrapDescriptor(&desc), nil)
		if err := txn.SetSystemConfigTrigger(); err != nil {
			return err
		}
		return txn.Run(ctx, b)
	})
	// CPuts only provide idempotent inserts if we ignore the errors that arise
	// when the condition isn't met.
	if errors.HasType(err, (*roachpb.ConditionFailedError)(nil)) {
		return nil
	}
	return err
}

func createCommentTable(ctx context.Context, r runner) error {
	return createSystemTable(ctx, r, sqlbase.CommentsTable)
}

func createReplicationConstraintStatsTable(ctx context.Context, r runner) error {
	if err := createSystemTable(ctx, r, sqlbase.ReplicationConstraintStatsTable); err != nil {
		return err
	}
	err := r.execAsRoot(ctx, "add-constraints-ttl",
		fmt.Sprintf(
			"ALTER TABLE system.replication_constraint_stats CONFIGURE ZONE USING gc.ttlseconds = %d",
			int(sqlbase.ReplicationConstraintStatsTableTTL.Seconds())))
	return errors.Wrapf(err, "failed to set TTL on %s", sqlbase.ReplicationConstraintStatsTable.Name)
}

func createReplicationCriticalLocalitiesTable(ctx context.Context, r runner) error {
	return createSystemTable(ctx, r, sqlbase.ReplicationCriticalLocalitiesTable)
}

func createReplicationStatsTable(ctx context.Context, r runner) error {
	if err := createSystemTable(ctx, r, sqlbase.ReplicationStatsTable); err != nil {
		return err
	}
	err := r.execAsRoot(ctx, "add-replication-status-ttl",
		fmt.Sprintf("ALTER TABLE system.replication_stats CONFIGURE ZONE USING gc.ttlseconds = %d",
			int(sqlbase.ReplicationStatsTableTTL.Seconds())))
	return errors.Wrapf(err, "failed to set TTL on %s", sqlbase.ReplicationStatsTable.Name)
}

func createProtectedTimestampsMetaTable(ctx context.Context, r runner) error {
	return errors.Wrap(createSystemTable(ctx, r, sqlbase.ProtectedTimestampsMetaTable),
		"failed to create system.protected_ts_meta")
}

func createProtectedTimestampsRecordsTable(ctx context.Context, r runner) error {
	return errors.Wrap(createSystemTable(ctx, r, sqlbase.ProtectedTimestampsRecordsTable),
		"failed to create system.protected_ts_records")
}

func createNewSystemNamespaceDescriptor(ctx context.Context, r runner) error {
	return r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		b := txn.NewBatch()

		// Retrieve the existing namespace table's descriptor and change its name to
		// "namespace". This corrects the behavior of this migration as it existed
		// in 20.1 betas. The old namespace table cannot be edited without breaking
		// explicit selects from system.namespace in 19.2.
		deprecatedKey := sqlbase.MakeDescMetadataKey(r.codec, keys.DeprecatedNamespaceTableID)
		deprecatedDesc := &sqlbase.Descriptor{}
		ts, err := txn.GetProtoTs(ctx, deprecatedKey, deprecatedDesc)
		if err != nil {
			return err
		}
		deprecatedDesc.Table(ts).Name = sqlbase.DeprecatedNamespaceTable.Name
		b.Put(deprecatedKey, deprecatedDesc)

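		// (Annotation, not in the original source: the Puts in this batch are
		// unconditional, so re-running the migration rewrites the same
		// descriptor and namespace entries with identical values rather than
		// failing.)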
		// The 19.2 namespace table contains an entry for "namespace" which maps
		// to the deprecated namespace table's ID. Even though the cluster
		// version at this point is 19.2, we construct a metadata name key in the
		// 20.1 format. This is for two reasons:
		// 1. We do not want to change the mapping in namespace_deprecated for
		//    "namespace", as for the purpose of namespace_deprecated, "namespace"
		//    refers to the correct ID.
		// 2. By adding the ID mapping in the new system.namespace table, the
		//    idempotent semantics of the migration ensure that "namespace" maps
		//    to the correct ID in the new system.namespace table after all
		//    tables are copied over.
		nameKey := sqlbase.NewPublicTableKey(
			sqlbase.NamespaceTable.GetParentID(), sqlbase.NamespaceTableName)
		b.Put(nameKey.Key(r.codec), sqlbase.NamespaceTable.GetID())
		b.Put(sqlbase.MakeDescMetadataKey(
			r.codec, sqlbase.NamespaceTable.GetID()), sqlbase.WrapDescriptor(&sqlbase.NamespaceTable))
		return txn.Run(ctx, b)
	})
}

func createRoleOptionsTable(ctx context.Context, r runner) error {
	// Create the system.role_options table with an entry for (admin, CREATEROLE).
	err := createSystemTable(ctx, r, sqlbase.RoleOptionsTable)
	if err != nil {
		return errors.Wrap(err, "failed to create system.role_options")
	}

	return nil
}

func addCreateRoleToAdminAndRoot(ctx context.Context, r runner) error {
	// Upsert the admin and root roles with the CREATEROLE option into the table.
	// We intentionally override any existing entry.
	const upsertCreateRoleStmt = `
	UPSERT INTO system.role_options (username, option, value) VALUES ($1, 'CREATEROLE', NULL)
	`
	err := r.execAsRootWithRetry(ctx,
		"add role options table and upsert admin with CREATEROLE",
		upsertCreateRoleStmt,
		sqlbase.AdminRole)

	if err != nil {
		return err
	}

	return r.execAsRootWithRetry(ctx,
		"add role options table and upsert root with CREATEROLE",
		upsertCreateRoleStmt,
		security.RootUser)
}

func createReportsMetaTable(ctx context.Context, r runner) error {
	return createSystemTable(ctx, r, sqlbase.ReportsMetaTable)
}

func createStatementInfoSystemTables(ctx context.Context, r runner) error {
	if err := createSystemTable(ctx, r, sqlbase.StatementBundleChunksTable); err != nil {
		return errors.Wrap(err, "failed to create system.statement_bundle_chunks")
	}
	if err := createSystemTable(ctx, r, sqlbase.StatementDiagnosticsRequestsTable); err != nil {
		return errors.Wrap(err, "failed to create system.statement_diagnostics_requests")
	}
	if err := createSystemTable(ctx, r, sqlbase.StatementDiagnosticsTable); err != nil {
		return errors.Wrap(err, "failed to create system.statement_diagnostics")
	}
	return nil
}

// SettingsDefaultOverrides documents the effect of several migrations that add
// an explicit value for a setting, effectively changing the "default value"
// from what was defined in code.
var SettingsDefaultOverrides = map[string]string{
	"diagnostics.reporting.enabled": "true",
	"cluster.secret":                "<random>",
}

func optInToDiagnosticsStatReporting(ctx context.Context, r runner) error {
	// We're opting-out of the automatic opt-in. See discussion in updates.go.
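	// (Annotation, not in the original source: when the operator has opted out
	// of telemetry via the environment, the migration leaves
	// diagnostics.reporting.enabled at its code-level default instead of
	// forcing it to true here.)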
	if cluster.TelemetryOptOut() {
		return nil
	}
	return r.execAsRootWithRetry(ctx, "optInToDiagnosticsStatReporting",
		`SET CLUSTER SETTING diagnostics.reporting.enabled = true`)
}

func initializeClusterSecret(ctx context.Context, r runner) error {
	return r.execAsRootWithRetry(
		ctx, "initializeClusterSecret",
		`SET CLUSTER SETTING cluster.secret = gen_random_uuid()::STRING`,
	)
}

func populateVersionSetting(ctx context.Context, r runner) error {
	var v roachpb.Version
	if err := r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		return txn.GetProto(ctx, keys.BootstrapVersionKey, &v)
	}); err != nil {
		return err
	}
	if v == (roachpb.Version{}) {
		// The cluster was bootstrapped at v1.0 (or even earlier), so just use
		// the TestingBinaryMinSupportedVersion of the binary.
		v = clusterversion.TestingBinaryMinSupportedVersion
	}

	b, err := protoutil.Marshal(&clusterversion.ClusterVersion{Version: v})
	if err != nil {
		return errors.Wrap(err, "while marshaling version")
	}

	// Add an ON CONFLICT DO NOTHING clause to avoid changing an existing
	// version. Again, this can happen if the migration doesn't run to
	// completion (overwriting would also be reasonable, but there's no need).
	// We don't allow users to perform version changes until we have run
	// the insert below.
	if err := r.execAsRoot(
		ctx,
		"insert-setting",
		fmt.Sprintf(`INSERT INTO system.settings (name, value, "lastUpdated", "valueType") VALUES ('version', x'%x', now(), 'm') ON CONFLICT(name) DO NOTHING`, b),
	); err != nil {
		return err
	}

	if err := r.execAsRoot(
		ctx, "set-setting", "SET CLUSTER SETTING version = $1", v.String(),
	); err != nil {
		return err
	}
	return nil
}

func addRootUser(ctx context.Context, r runner) error {
	// Upsert the root user into the table. We intentionally override any existing entry.
	const upsertRootStmt = `
	UPSERT INTO system.users (username, "hashedPassword", "isRole") VALUES ($1, '', false)
	`
	return r.execAsRootWithRetry(ctx, "addRootUser", upsertRootStmt, security.RootUser)
}

func addAdminRole(ctx context.Context, r runner) error {
	// Upsert the admin role into the table. We intentionally override any existing entry.
	const upsertAdminStmt = `
	UPSERT INTO system.users (username, "hashedPassword", "isRole") VALUES ($1, '', true)
	`
	return r.execAsRootWithRetry(ctx, "addAdminRole", upsertAdminStmt, sqlbase.AdminRole)
}

func addRootToAdminRole(ctx context.Context, r runner) error {
	// Upsert the role membership into the table. We intentionally override any existing entry.
	const upsertAdminStmt = `
	UPSERT INTO system.role_members ("role", "member", "isAdmin") VALUES ($1, $2, true)
	`
	return r.execAsRootWithRetry(
		ctx, "addRootToAdminRole", upsertAdminStmt, sqlbase.AdminRole, security.RootUser)
}

func disallowPublicUserOrRole(ctx context.Context, r runner) error {
	// Check whether a user or role named "public" exists.
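	// (Annotation, not in the original source: 20.1 treats "public" as a
	// reserved pseudo-role used by GRANT ... TO public, so a pre-existing user
	// or role with that name must be dropped before the upgrade can proceed.)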
	const selectPublicStmt = `
	SELECT username, "isRole" FROM system.users WHERE username = $1
	`

	for retry := retry.Start(retry.Options{MaxRetries: 5}); retry.Next(); {
		row, err := r.sqlExecutor.QueryRowEx(
			ctx, "disallowPublicUserOrRole", nil, /* txn */
			sqlbase.InternalExecutorSessionDataOverride{
				User: security.RootUser,
			},
			selectPublicStmt, sqlbase.PublicRole,
		)
		if err != nil {
			continue
		}
		if row == nil {
			// No such user.
			return nil
		}

		isRole, ok := tree.AsDBool(row[1])
		if !ok {
			log.Fatalf(ctx, "expected 'isRole' column of system.users to be of type bool, got %v", row)
		}

		if isRole {
			return fmt.Errorf(`found a role named %s which is now a reserved name. Please drop the role `+
				`(DROP ROLE %s) using a previous version of CockroachDB and try again`,
				sqlbase.PublicRole, sqlbase.PublicRole)
		}
		return fmt.Errorf(`found a user named %s which is now a reserved name. Please drop the user `+
			`(DROP USER %s) using a previous version of CockroachDB and try again`,
			sqlbase.PublicRole, sqlbase.PublicRole)
	}
	return nil
}

func createDefaultDbs(ctx context.Context, r runner) error {
	// Create the default databases. These are plain databases with
	// default permissions. Nothing special happens if they already
	// exist.
	const createDbStmt = `CREATE DATABASE IF NOT EXISTS "%s"`

	var err error
	for retry := retry.Start(retry.Options{MaxRetries: 5}); retry.Next(); {
		for _, dbName := range []string{sqlbase.DefaultDatabaseName, sqlbase.PgDatabaseName} {
			stmt := fmt.Sprintf(createDbStmt, dbName)
			err = r.execAsRoot(ctx, "create-default-db", stmt)
			if err != nil {
				log.Warningf(ctx, "failed attempt to add database %q: %s", dbName, err)
				break
			}
		}
		if err == nil {
			break
		}
	}
	return err
}

func retireOldTsPurgeIntervalSettings(ctx context.Context, r runner) error {
	// We are deprecating `timeseries.storage.10s_resolution_ttl` in favor of
	// `timeseries.storage.resolution_10s.ttl`, seeding the latter from the
	// former if it is not yet defined.
	//
	// Ditto for the `30m` resolution.

	// Copy 'timeseries.storage.10s_resolution_ttl' into
	// 'timeseries.storage.resolution_10s.ttl' if the former is defined
	// and the latter is not defined yet.
	//
	// We rely on the SELECT returning no row if the original setting
	// was not defined, and INSERT ON CONFLICT DO NOTHING to ignore the
	// insert if the new name was already set.
	if err := r.execAsRoot(ctx, "copy-setting", `
INSERT INTO system.settings (name, value, "lastUpdated", "valueType")
SELECT 'timeseries.storage.resolution_10s.ttl', value, "lastUpdated", "valueType"
FROM system.settings WHERE name = 'timeseries.storage.10s_resolution_ttl'
ON CONFLICT (name) DO NOTHING`,
	); err != nil {
		return err
	}

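	// (Annotation, not in the original source: the copy is one-shot and
	// one-way; it seeds the new setting name from the old one if needed but
	// does not keep the two names in sync afterwards.)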
	// Ditto for the 30m resolution.
	if err := r.execAsRoot(ctx, "copy-setting", `
INSERT INTO system.settings (name, value, "lastUpdated", "valueType")
SELECT 'timeseries.storage.resolution_30m.ttl', value, "lastUpdated", "valueType"
FROM system.settings WHERE name = 'timeseries.storage.30m_resolution_ttl'
ON CONFLICT (name) DO NOTHING`,
	); err != nil {
		return err
	}

	return nil
}

func updateSystemLocationData(ctx context.Context, r runner) error {
	// See if the system.locations table already has data in it.
	// If so, we don't want to do anything.
	row, err := r.sqlExecutor.QueryRowEx(ctx, "update-system-locations",
		nil, /* txn */
		sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
		`SELECT count(*) FROM system.locations`)
	if err != nil {
		return err
	}
	count := int(tree.MustBeDInt(row[0]))
	if count != 0 {
		return nil
	}

	for _, loc := range roachpb.DefaultLocationInformation {
		stmt := `UPSERT INTO system.locations VALUES ($1, $2, $3, $4)`
		tier := loc.Locality.Tiers[0]
		if err := r.execAsRoot(ctx, "update-system-locations",
			stmt, tier.Key, tier.Value, loc.Latitude, loc.Longitude,
		); err != nil {
			return err
		}
	}
	return nil
}

func depublicizeSystemComments(ctx context.Context, r runner) error {
	// At some point in time, system.comments was mistakenly created
	// with all privileges granted to the "public" role (i.e. everyone).
	// This migration cleans this up.

	// Schema changes are normally banned in the mixed-version 19.2/20.1 state, so
	// we override the ban and force the schema change to run anyway. This is safe
	// because updating privileges on a table only requires an update to the table
	// descriptor, and the job created will only wait for leases to expire. We
	// don't expect any other schema changes to happen on the comments table that
	// the 20.1 job could interfere with. The update to the table descriptor would
	// cause a 19.2 SchemaChangeManager to attempt a schema change, but it would
	// be a no-op.
	ctx = descs.MigrationSchemaChangeRequiredContext(ctx)

	for _, priv := range []string{"GRANT", "INSERT", "DELETE", "UPDATE"} {
		stmt := fmt.Sprintf(`REVOKE %s ON TABLE system.comments FROM public`, priv)
		// REVOKE should never fail here -- it's always possible for root
		// to revoke a privilege even if it's not currently granted.
		if err := r.execAsRoot(ctx, "depublicize-system-comments", stmt); err != nil {
			return err
		}
	}
	return nil
}

func alterSystemJobsAddCreatedByColumns(ctx context.Context, r runner) error {
	// NB: we use the family name as it existed in the original system.jobs
	// schema to minimize migration work needed (avoid renames).
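	// (Annotation, not in the original source: both statements below use
	// IF NOT EXISTS, so a re-run of this migration is a no-op, and the index
	// STORING (status) lets lookups by creator read a job's status without
	// also touching the primary index.)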
	addColsStmt := `
	ALTER TABLE system.jobs
	ADD COLUMN IF NOT EXISTS created_by_type STRING FAMILY fam_0_id_status_created_payload,
	ADD COLUMN IF NOT EXISTS created_by_id INT FAMILY fam_0_id_status_created_payload
	`
	addIdxStmt := `
	CREATE INDEX IF NOT EXISTS jobs_created_by_type_created_by_id_idx
	ON system.jobs (created_by_type, created_by_id)
	STORING (status)
	`
	asNode := sqlbase.InternalExecutorSessionDataOverride{
		User: security.NodeUser,
	}

	if _, err := r.sqlExecutor.ExecEx(
		ctx, "add-jobs-cols", nil, asNode, addColsStmt); err != nil {
		return err
	}

	_, err := r.sqlExecutor.ExecEx(ctx, "add-jobs-idx", nil, asNode, addIdxStmt)
	return err
}
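
// (Annotation, not in the original source.) The work functions above lean on
// repeat-safe statements -- UPSERT, CREATE ... IF NOT EXISTS, ADD COLUMN IF
// NOT EXISTS, INSERT ... ON CONFLICT DO NOTHING -- or on conditional puts
// whose ConditionFailedError is deliberately ignored. A minimal sketch of a
// new work function in the same style, with a hypothetical setting name and
// statement, could look like this:
//
//	func addExampleSetting(ctx context.Context, r runner) error {
//		// Hypothetical: seed a boolean setting only if it has not been set yet,
//		// so repeating the migration leaves any existing value untouched.
//		return r.execAsRootWithRetry(ctx, "add-example-setting",
//			`INSERT INTO system.settings (name, value, "lastUpdated", "valueType")
//			 VALUES ('example.setting', 'true', now(), 'b')
//			 ON CONFLICT (name) DO NOTHING`)
//	}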