vitess.io/vitess@v0.16.2/go/vt/vtorc/inst/instance_dao.go (about) 1 /* 2 Copyright 2014 Outbrain Inc. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package inst 18 19 import ( 20 "bytes" 21 "errors" 22 "fmt" 23 "regexp" 24 "runtime" 25 "strconv" 26 "strings" 27 "sync" 28 "time" 29 30 "github.com/patrickmn/go-cache" 31 "github.com/rcrowley/go-metrics" 32 "github.com/sjmudd/stopwatch" 33 34 "vitess.io/vitess/go/vt/external/golib/sqlutils" 35 36 vitessmysql "vitess.io/vitess/go/mysql" 37 "vitess.io/vitess/go/tb" 38 "vitess.io/vitess/go/vt/log" 39 replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata" 40 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 41 "vitess.io/vitess/go/vt/topo/topoproto" 42 "vitess.io/vitess/go/vt/vtctl/reparentutil" 43 "vitess.io/vitess/go/vt/vtctl/reparentutil/promotionrule" 44 "vitess.io/vitess/go/vt/vtorc/collection" 45 "vitess.io/vitess/go/vt/vtorc/config" 46 "vitess.io/vitess/go/vt/vtorc/db" 47 "vitess.io/vitess/go/vt/vtorc/metrics/query" 48 "vitess.io/vitess/go/vt/vtorc/util" 49 math "vitess.io/vitess/go/vt/vtorc/util" 50 ) 51 52 const ( 53 backendDBConcurrency = 20 54 ) 55 56 var instanceReadChan = make(chan bool, backendDBConcurrency) 57 var instanceWriteChan = make(chan bool, backendDBConcurrency) 58 59 // Constant strings for Group Replication information 60 // See https://dev.mysql.com/doc/refman/8.0/en/replication-group-members-table.html for additional information. 61 const ( 62 // Group member roles 63 GroupReplicationMemberRolePrimary = "PRIMARY" 64 GroupReplicationMemberRoleSecondary = "SECONDARY" 65 // Group member states 66 GroupReplicationMemberStateOnline = "ONLINE" 67 GroupReplicationMemberStateRecovering = "RECOVERING" 68 GroupReplicationMemberStateUnreachable = "UNREACHABLE" 69 GroupReplicationMemberStateOffline = "OFFLINE" 70 GroupReplicationMemberStateError = "ERROR" 71 ) 72 73 var forgetInstanceKeys *cache.Cache 74 75 var accessDeniedCounter = metrics.NewCounter() 76 var readTopologyInstanceCounter = metrics.NewCounter() 77 var readInstanceCounter = metrics.NewCounter() 78 var writeInstanceCounter = metrics.NewCounter() 79 var backendWrites = collection.CreateOrReturnCollection("BACKEND_WRITES") 80 var writeBufferLatency = stopwatch.NewNamedStopwatch() 81 82 var emptyQuotesRegexp = regexp.MustCompile(`^""$`) 83 84 func init() { 85 _ = metrics.Register("instance.access_denied", accessDeniedCounter) 86 _ = metrics.Register("instance.read_topology", readTopologyInstanceCounter) 87 _ = metrics.Register("instance.read", readInstanceCounter) 88 _ = metrics.Register("instance.write", writeInstanceCounter) 89 _ = writeBufferLatency.AddMany([]string{"wait", "write"}) 90 writeBufferLatency.Start("wait") 91 92 go initializeInstanceDao() 93 } 94 95 func initializeInstanceDao() { 96 config.WaitForConfigurationToBeLoaded() 97 forgetInstanceKeys = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second) 98 } 99 100 // ExecDBWriteFunc chooses how to execute a write onto the database: whether synchronuously or not 101 func ExecDBWriteFunc(f func() error) error { 102 m := query.NewMetric() 103 104 instanceWriteChan <- true 105 m.WaitLatency = time.Since(m.Timestamp) 106 107 // catch the exec time and error if there is one 108 defer func() { 109 if r := recover(); r != nil { 110 if _, ok := r.(runtime.Error); ok { 111 panic(r) 112 } 113 114 if s, ok := r.(string); ok { 115 m.Err = errors.New(s) 116 } else { 117 m.Err = r.(error) 118 } 119 } 120 m.ExecuteLatency = time.Since(m.Timestamp.Add(m.WaitLatency)) 121 _ = backendWrites.Append(m) 122 <-instanceWriteChan // assume this takes no time 123 }() 124 res := f() 125 return res 126 } 127 128 func ExpireTableData(tableName string, timestampColumn string) error { 129 query := fmt.Sprintf("delete from %s where %s < NOW() - INTERVAL ? DAY", tableName, timestampColumn) 130 writeFunc := func() error { 131 _, err := db.ExecVTOrc(query, config.Config.AuditPurgeDays) 132 return err 133 } 134 return ExecDBWriteFunc(writeFunc) 135 } 136 137 // logReadTopologyInstanceError logs an error, if applicable, for a ReadTopologyInstance operation, 138 // providing context and hint as for the source of the error. If there's no hint just provide the 139 // original error. 140 func logReadTopologyInstanceError(instanceKey *InstanceKey, hint string, err error) error { 141 if err == nil { 142 return nil 143 } 144 if !util.ClearToLog("ReadTopologyInstance", instanceKey.StringCode()) { 145 return err 146 } 147 var msg string 148 if hint == "" { 149 msg = fmt.Sprintf("ReadTopologyInstance(%+v): %+v", *instanceKey, err) 150 } else { 151 msg = fmt.Sprintf("ReadTopologyInstance(%+v) %+v: %+v", 152 *instanceKey, 153 strings.Replace(hint, "%", "%%", -1), // escape % 154 err) 155 } 156 log.Errorf(msg) 157 return fmt.Errorf(msg) 158 } 159 160 // ReadTopologyInstance collects information on the state of a MySQL 161 // server and writes the result synchronously to the vtorc 162 // backend. 163 func ReadTopologyInstance(instanceKey *InstanceKey) (*Instance, error) { 164 return ReadTopologyInstanceBufferable(instanceKey, nil) 165 } 166 167 // ReadTopologyInstanceBufferable connects to a topology MySQL instance 168 // and collects information on the server and its replication state. 169 // It writes the information retrieved into vtorc's backend. 170 // - writes are optionally buffered. 171 // - timing information can be collected for the stages performed. 172 func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, latency *stopwatch.NamedStopwatch) (inst *Instance, err error) { 173 defer func() { 174 if r := recover(); r != nil { 175 err = logReadTopologyInstanceError(instanceKey, "Unexpected, aborting", tb.Errorf("%+v", r)) 176 } 177 }() 178 179 var waitGroup sync.WaitGroup 180 var tablet *topodatapb.Tablet 181 var durability reparentutil.Durabler 182 var fullStatus *replicationdatapb.FullStatus 183 readingStartTime := time.Now() 184 instance := NewInstance() 185 instanceFound := false 186 partialSuccess := false 187 resolvedHostname := "" 188 errorChan := make(chan error, 32) 189 var resolveErr error 190 191 if !instanceKey.IsValid() { 192 latency.Start("backend") 193 if err := UpdateInstanceLastAttemptedCheck(instanceKey); err != nil { 194 log.Errorf("ReadTopologyInstanceBufferable: %+v: %v", instanceKey, err) 195 } 196 latency.Stop("backend") 197 return instance, fmt.Errorf("ReadTopologyInstance will not act on invalid instance key: %+v", *instanceKey) 198 } 199 200 lastAttemptedCheckTimer := time.AfterFunc(time.Second, func() { 201 go func() { 202 _ = UpdateInstanceLastAttemptedCheck(instanceKey) 203 }() 204 }) 205 206 latency.Start("instance") 207 208 tablet, err = ReadTablet(*instanceKey) 209 if err != nil { 210 goto Cleanup 211 } 212 if tablet == nil { 213 // This can happen because Orc rediscovers instances by alt hostnames, 214 // lit localhost, ip, etc. 215 // TODO(sougou): disable this ability. 216 goto Cleanup 217 } 218 219 durability, err = GetDurabilityPolicy(tablet) 220 if err != nil { 221 goto Cleanup 222 } 223 224 fullStatus, err = FullStatus(*instanceKey) 225 if err != nil { 226 goto Cleanup 227 } 228 partialSuccess = true // We at least managed to read something from the server. 229 230 instance.Key = *instanceKey 231 { 232 // We begin with a few operations we can run concurrently, and which do not depend on anything 233 instance.ServerID = uint(fullStatus.ServerId) 234 instance.Version = fullStatus.Version 235 instance.ReadOnly = fullStatus.ReadOnly 236 instance.LogBinEnabled = fullStatus.LogBinEnabled 237 instance.BinlogFormat = fullStatus.BinlogFormat 238 instance.LogReplicationUpdatesEnabled = fullStatus.LogReplicaUpdates 239 instance.VersionComment = fullStatus.VersionComment 240 resolvedHostname = instance.Key.Hostname 241 242 if instance.LogBinEnabled && fullStatus.PrimaryStatus != nil { 243 binlogPos, err := getBinlogCoordinatesFromPositionString(fullStatus.PrimaryStatus.FilePosition) 244 instance.SelfBinlogCoordinates = binlogPos 245 errorChan <- err 246 } 247 248 instance.SemiSyncPrimaryEnabled = fullStatus.SemiSyncPrimaryEnabled 249 instance.SemiSyncReplicaEnabled = fullStatus.SemiSyncReplicaEnabled 250 instance.SemiSyncPrimaryWaitForReplicaCount = uint(fullStatus.SemiSyncWaitForReplicaCount) 251 instance.SemiSyncPrimaryTimeout = fullStatus.SemiSyncPrimaryTimeout 252 253 instance.SemiSyncPrimaryClients = uint(fullStatus.SemiSyncPrimaryClients) 254 instance.SemiSyncPrimaryStatus = fullStatus.SemiSyncPrimaryStatus 255 instance.SemiSyncReplicaStatus = fullStatus.SemiSyncReplicaStatus 256 257 if (instance.IsOracleMySQL() || instance.IsPercona()) && !instance.IsSmallerMajorVersionByString("5.6") { 258 // Stuff only supported on Oracle MySQL >= 5.6 259 // ... 260 // @@gtid_mode only available in Orcale MySQL >= 5.6 261 instance.GTIDMode = fullStatus.GtidMode 262 instance.ServerUUID = fullStatus.ServerUuid 263 if fullStatus.PrimaryStatus != nil { 264 GtidExecutedPos, err := vitessmysql.DecodePosition(fullStatus.PrimaryStatus.Position) 265 errorChan <- err 266 if err == nil && GtidExecutedPos.GTIDSet != nil { 267 instance.ExecutedGtidSet = GtidExecutedPos.GTIDSet.String() 268 } 269 } 270 GtidPurgedPos, err := vitessmysql.DecodePosition(fullStatus.GtidPurged) 271 errorChan <- err 272 if err == nil && GtidPurgedPos.GTIDSet != nil { 273 instance.GtidPurged = GtidPurgedPos.GTIDSet.String() 274 } 275 instance.BinlogRowImage = fullStatus.BinlogRowImage 276 277 if instance.GTIDMode != "" && instance.GTIDMode != "OFF" { 278 instance.SupportsOracleGTID = true 279 } 280 } 281 } 282 if resolvedHostname != instance.Key.Hostname { 283 latency.Start("backend") 284 UpdateResolvedHostname(instance.Key.Hostname, resolvedHostname) 285 latency.Stop("backend") 286 instance.Key.Hostname = resolvedHostname 287 } 288 if instance.Key.Hostname == "" { 289 err = fmt.Errorf("ReadTopologyInstance: empty hostname (%+v). Bailing out", *instanceKey) 290 goto Cleanup 291 } 292 go func() { 293 _ = ResolveHostnameIPs(instance.Key.Hostname) 294 }() 295 296 instance.ReplicationIOThreadState = ReplicationThreadStateNoThread 297 instance.ReplicationSQLThreadState = ReplicationThreadStateNoThread 298 if fullStatus.ReplicationStatus != nil { 299 instance.HasReplicationCredentials = fullStatus.ReplicationStatus.SourceUser != "" 300 301 instance.ReplicationIOThreadState = ReplicationThreadStateFromReplicationState(vitessmysql.ReplicationState(fullStatus.ReplicationStatus.IoState)) 302 instance.ReplicationSQLThreadState = ReplicationThreadStateFromReplicationState(vitessmysql.ReplicationState(fullStatus.ReplicationStatus.SqlState)) 303 instance.ReplicationIOThreadRuning = instance.ReplicationIOThreadState.IsRunning() 304 instance.ReplicationSQLThreadRuning = instance.ReplicationSQLThreadState.IsRunning() 305 306 binlogPos, err := getBinlogCoordinatesFromPositionString(fullStatus.ReplicationStatus.RelayLogSourceBinlogEquivalentPosition) 307 instance.ReadBinlogCoordinates = binlogPos 308 errorChan <- err 309 310 binlogPos, err = getBinlogCoordinatesFromPositionString(fullStatus.ReplicationStatus.FilePosition) 311 instance.ExecBinlogCoordinates = binlogPos 312 errorChan <- err 313 instance.IsDetached, _ = instance.ExecBinlogCoordinates.ExtractDetachedCoordinates() 314 315 binlogPos, err = getBinlogCoordinatesFromPositionString(fullStatus.ReplicationStatus.RelayLogFilePosition) 316 instance.RelaylogCoordinates = binlogPos 317 instance.RelaylogCoordinates.Type = RelayLog 318 errorChan <- err 319 320 instance.LastSQLError = emptyQuotesRegexp.ReplaceAllString(strconv.QuoteToASCII(fullStatus.ReplicationStatus.LastSqlError), "") 321 instance.LastIOError = emptyQuotesRegexp.ReplaceAllString(strconv.QuoteToASCII(fullStatus.ReplicationStatus.LastIoError), "") 322 323 instance.SQLDelay = uint(fullStatus.ReplicationStatus.SqlDelay) 324 instance.UsingOracleGTID = fullStatus.ReplicationStatus.AutoPosition 325 instance.UsingMariaDBGTID = fullStatus.ReplicationStatus.UsingGtid 326 instance.SourceUUID = fullStatus.ReplicationStatus.SourceUuid 327 instance.HasReplicationFilters = fullStatus.ReplicationStatus.HasReplicationFilters 328 329 primaryHostname := fullStatus.ReplicationStatus.SourceHost 330 primaryKey, err := NewResolveInstanceKey(primaryHostname, int(fullStatus.ReplicationStatus.SourcePort)) 331 if err != nil { 332 _ = logReadTopologyInstanceError(instanceKey, "NewResolveInstanceKey", err) 333 } 334 primaryKey.Hostname, resolveErr = ResolveHostname(primaryKey.Hostname) 335 if resolveErr != nil { 336 _ = logReadTopologyInstanceError(instanceKey, fmt.Sprintf("ResolveHostname(%q)", primaryKey.Hostname), resolveErr) 337 } 338 instance.SourceKey = *primaryKey 339 instance.IsDetachedPrimary = instance.SourceKey.IsDetached() 340 341 if fullStatus.ReplicationStatus.ReplicationLagUnknown { 342 instance.SecondsBehindPrimary.Valid = false 343 } else { 344 instance.SecondsBehindPrimary.Valid = true 345 instance.SecondsBehindPrimary.Int64 = int64(fullStatus.ReplicationStatus.ReplicationLagSeconds) 346 } 347 if instance.SecondsBehindPrimary.Valid && instance.SecondsBehindPrimary.Int64 < 0 { 348 log.Warningf("Host: %+v, instance.ReplicationLagSeconds < 0 [%+v], correcting to 0", instanceKey, instance.SecondsBehindPrimary.Int64) 349 instance.SecondsBehindPrimary.Int64 = 0 350 } 351 // And until told otherwise: 352 instance.ReplicationLagSeconds = instance.SecondsBehindPrimary 353 354 instance.AllowTLS = fullStatus.ReplicationStatus.SslAllowed 355 } 356 357 instanceFound = true 358 359 // ------------------------------------------------------------------------- 360 // Anything after this point does not affect the fact the instance is found. 361 // No `goto Cleanup` after this point. 362 // ------------------------------------------------------------------------- 363 364 instance.DataCenter = tablet.Alias.Cell 365 instance.InstanceAlias = topoproto.TabletAliasString(tablet.Alias) 366 367 { 368 latency.Start("backend") 369 err = ReadInstanceClusterAttributes(instance) 370 latency.Stop("backend") 371 _ = logReadTopologyInstanceError(instanceKey, "ReadInstanceClusterAttributes", err) 372 } 373 374 // We need to update candidate_database_instance. 375 // We register the rule even if it hasn't changed, 376 // to bump the last_suggested time. 377 instance.PromotionRule = PromotionRule(durability, tablet) 378 err = RegisterCandidateInstance(NewCandidateDatabaseInstance(instanceKey, instance.PromotionRule).WithCurrentTime()) 379 _ = logReadTopologyInstanceError(instanceKey, "RegisterCandidateInstance", err) 380 381 Cleanup: 382 waitGroup.Wait() 383 close(errorChan) 384 err = func() error { 385 if err != nil { 386 return err 387 } 388 389 for err := range errorChan { 390 if err != nil { 391 return err 392 } 393 } 394 return nil 395 }() 396 397 if instanceFound { 398 if instance.IsCoPrimary { 399 // Take co-primary into account, and avoid infinite loop 400 instance.AncestryUUID = fmt.Sprintf("%s,%s", instance.SourceUUID, instance.ServerUUID) 401 } else { 402 instance.AncestryUUID = fmt.Sprintf("%s,%s", instance.AncestryUUID, instance.ServerUUID) 403 } 404 // Add replication group ancestry UUID as well. Otherwise, VTOrc thinks there are errant GTIDs in group 405 // members and its replicas, even though they are not. 406 instance.AncestryUUID = fmt.Sprintf("%s,%s", instance.AncestryUUID, instance.ReplicationGroupName) 407 instance.AncestryUUID = strings.Trim(instance.AncestryUUID, ",") 408 if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" { 409 // Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID. 410 // This is because vtorc may pool primary and replica at an inconvenient timing, 411 // such that the replica may _seems_ to have more entries than the primary, when in fact 412 // it's just that the primary's probing is stale. 413 redactedExecutedGtidSet, _ := NewOracleGtidSet(instance.ExecutedGtidSet) 414 for _, uuid := range strings.Split(instance.AncestryUUID, ",") { 415 if uuid != instance.ServerUUID { 416 redactedExecutedGtidSet.RemoveUUID(uuid) 417 } 418 if instance.IsCoPrimary && uuid == instance.ServerUUID { 419 // If this is a co-primary, then this server is likely to show its own generated GTIDs as errant, 420 // because its co-primary has not applied them yet 421 redactedExecutedGtidSet.RemoveUUID(uuid) 422 } 423 } 424 // Avoid querying the database if there's no point: 425 if !redactedExecutedGtidSet.IsEmpty() { 426 redactedPrimaryExecutedGtidSet, _ := NewOracleGtidSet(instance.primaryExecutedGtidSet) 427 redactedPrimaryExecutedGtidSet.RemoveUUID(instance.SourceUUID) 428 429 instance.GtidErrant, err = vitessmysql.Subtract(redactedExecutedGtidSet.String(), redactedPrimaryExecutedGtidSet.String()) 430 } 431 } 432 } 433 434 latency.Stop("instance") 435 readTopologyInstanceCounter.Inc(1) 436 437 if instanceFound { 438 instance.LastDiscoveryLatency = time.Since(readingStartTime) 439 instance.IsLastCheckValid = true 440 instance.IsRecentlyChecked = true 441 instance.IsUpToDate = true 442 latency.Start("backend") 443 _ = WriteInstance(instance, instanceFound, err) 444 lastAttemptedCheckTimer.Stop() 445 latency.Stop("backend") 446 return instance, nil 447 } 448 449 // Something is wrong, could be network-wise. Record that we 450 // tried to check the instance. last_attempted_check is also 451 // updated on success by writeInstance. 452 latency.Start("backend") 453 _ = UpdateInstanceLastChecked(instanceKey, partialSuccess) 454 latency.Stop("backend") 455 return nil, err 456 } 457 458 // getKeyspaceShardName returns a single string having both the keyspace and shard 459 func getKeyspaceShardName(keyspace, shard string) string { 460 return fmt.Sprintf("%v:%v", keyspace, shard) 461 } 462 463 func getBinlogCoordinatesFromPositionString(position string) (BinlogCoordinates, error) { 464 pos, err := vitessmysql.DecodePosition(position) 465 if err != nil || pos.GTIDSet == nil { 466 return BinlogCoordinates{}, err 467 } 468 binLogCoordinates, err := ParseBinlogCoordinates(pos.String()) 469 if err != nil { 470 return BinlogCoordinates{}, err 471 } 472 return *binLogCoordinates, nil 473 } 474 475 func ReadReplicationGroupPrimary(instance *Instance) (err error) { 476 query := ` 477 SELECT 478 replication_group_primary_host, 479 replication_group_primary_port 480 FROM 481 database_instance 482 WHERE 483 replication_group_name = ? 484 AND replication_group_member_role = 'PRIMARY' 485 ` 486 queryArgs := sqlutils.Args(instance.ReplicationGroupName) 487 err = db.QueryVTOrc(query, queryArgs, func(row sqlutils.RowMap) error { 488 groupPrimaryHost := row.GetString("replication_group_primary_host") 489 groupPrimaryPort := row.GetInt("replication_group_primary_port") 490 resolvedGroupPrimary, err := NewResolveInstanceKey(groupPrimaryHost, groupPrimaryPort) 491 if err != nil { 492 return err 493 } 494 instance.ReplicationGroupPrimaryInstanceKey = *resolvedGroupPrimary 495 return nil 496 }) 497 return err 498 } 499 500 // ReadInstanceClusterAttributes will return the cluster name for a given instance by looking at its primary 501 // and getting it from there. 502 // It is a non-recursive function and so-called-recursion is performed upon periodic reading of 503 // instances. 504 func ReadInstanceClusterAttributes(instance *Instance) (err error) { 505 var primaryOrGroupPrimaryInstanceKey InstanceKey 506 var primaryOrGroupPrimaryReplicationDepth uint 507 var ancestryUUID string 508 var primaryOrGroupPrimaryExecutedGtidSet string 509 primaryOrGroupPrimaryDataFound := false 510 511 query := ` 512 select 513 replication_depth, 514 source_host, 515 source_port, 516 ancestry_uuid, 517 executed_gtid_set 518 from database_instance 519 where hostname=? and port=? 520 ` 521 // For instances that are part of a replication group, if the host is not the group's primary, we use the 522 // information from the group primary. If it is the group primary, we use the information of its primary 523 // (if it has any). If it is not a group member, we use the information from the host's primary. 524 if instance.IsReplicationGroupSecondary() { 525 primaryOrGroupPrimaryInstanceKey = instance.ReplicationGroupPrimaryInstanceKey 526 } else { 527 primaryOrGroupPrimaryInstanceKey = instance.SourceKey 528 } 529 args := sqlutils.Args(primaryOrGroupPrimaryInstanceKey.Hostname, primaryOrGroupPrimaryInstanceKey.Port) 530 err = db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { 531 primaryOrGroupPrimaryReplicationDepth = m.GetUint("replication_depth") 532 primaryOrGroupPrimaryInstanceKey.Hostname = m.GetString("source_host") 533 primaryOrGroupPrimaryInstanceKey.Port = m.GetInt("source_port") 534 ancestryUUID = m.GetString("ancestry_uuid") 535 primaryOrGroupPrimaryExecutedGtidSet = m.GetString("executed_gtid_set") 536 primaryOrGroupPrimaryDataFound = true 537 return nil 538 }) 539 if err != nil { 540 log.Error(err) 541 return err 542 } 543 544 var replicationDepth uint 545 if primaryOrGroupPrimaryDataFound { 546 replicationDepth = primaryOrGroupPrimaryReplicationDepth + 1 547 } 548 isCoPrimary := false 549 if primaryOrGroupPrimaryInstanceKey.Equals(&instance.Key) { 550 // co-primary calls for special case, in fear of the infinite loop 551 isCoPrimary = true 552 } 553 instance.ReplicationDepth = replicationDepth 554 instance.IsCoPrimary = isCoPrimary 555 instance.AncestryUUID = ancestryUUID 556 instance.primaryExecutedGtidSet = primaryOrGroupPrimaryExecutedGtidSet 557 return nil 558 } 559 560 // readInstanceRow reads a single instance row from the vtorc backend database. 561 func readInstanceRow(m sqlutils.RowMap) *Instance { 562 instance := NewInstance() 563 564 instance.Key.Hostname = m.GetString("hostname") 565 instance.Key.Port = m.GetInt("port") 566 instance.ServerID = m.GetUint("server_id") 567 instance.ServerUUID = m.GetString("server_uuid") 568 instance.Version = m.GetString("version") 569 instance.VersionComment = m.GetString("version_comment") 570 instance.ReadOnly = m.GetBool("read_only") 571 instance.BinlogFormat = m.GetString("binlog_format") 572 instance.BinlogRowImage = m.GetString("binlog_row_image") 573 instance.LogBinEnabled = m.GetBool("log_bin") 574 instance.LogReplicationUpdatesEnabled = m.GetBool("log_replica_updates") 575 instance.SourceKey.Hostname = m.GetString("source_host") 576 instance.SourceKey.Port = m.GetInt("source_port") 577 instance.IsDetachedPrimary = instance.SourceKey.IsDetached() 578 instance.ReplicationSQLThreadRuning = m.GetBool("replica_sql_running") 579 instance.ReplicationIOThreadRuning = m.GetBool("replica_io_running") 580 instance.ReplicationSQLThreadState = ReplicationThreadState(m.GetInt("replication_sql_thread_state")) 581 instance.ReplicationIOThreadState = ReplicationThreadState(m.GetInt("replication_io_thread_state")) 582 instance.HasReplicationFilters = m.GetBool("has_replication_filters") 583 instance.SupportsOracleGTID = m.GetBool("supports_oracle_gtid") 584 instance.UsingOracleGTID = m.GetBool("oracle_gtid") 585 instance.SourceUUID = m.GetString("source_uuid") 586 instance.AncestryUUID = m.GetString("ancestry_uuid") 587 instance.ExecutedGtidSet = m.GetString("executed_gtid_set") 588 instance.GTIDMode = m.GetString("gtid_mode") 589 instance.GtidPurged = m.GetString("gtid_purged") 590 instance.GtidErrant = m.GetString("gtid_errant") 591 instance.UsingMariaDBGTID = m.GetBool("mariadb_gtid") 592 instance.SelfBinlogCoordinates.LogFile = m.GetString("binary_log_file") 593 instance.SelfBinlogCoordinates.LogPos = m.GetInt64("binary_log_pos") 594 instance.ReadBinlogCoordinates.LogFile = m.GetString("source_log_file") 595 instance.ReadBinlogCoordinates.LogPos = m.GetInt64("read_source_log_pos") 596 instance.ExecBinlogCoordinates.LogFile = m.GetString("relay_source_log_file") 597 instance.ExecBinlogCoordinates.LogPos = m.GetInt64("exec_source_log_pos") 598 instance.IsDetached, _ = instance.ExecBinlogCoordinates.ExtractDetachedCoordinates() 599 instance.RelaylogCoordinates.LogFile = m.GetString("relay_log_file") 600 instance.RelaylogCoordinates.LogPos = m.GetInt64("relay_log_pos") 601 instance.RelaylogCoordinates.Type = RelayLog 602 instance.LastSQLError = m.GetString("last_sql_error") 603 instance.LastIOError = m.GetString("last_io_error") 604 instance.SecondsBehindPrimary = m.GetNullInt64("replication_lag_seconds") 605 instance.ReplicationLagSeconds = m.GetNullInt64("replica_lag_seconds") 606 instance.SQLDelay = m.GetUint("sql_delay") 607 instance.DataCenter = m.GetString("data_center") 608 instance.Region = m.GetString("region") 609 instance.PhysicalEnvironment = m.GetString("physical_environment") 610 instance.SemiSyncEnforced = m.GetBool("semi_sync_enforced") 611 instance.SemiSyncPrimaryEnabled = m.GetBool("semi_sync_primary_enabled") 612 instance.SemiSyncPrimaryTimeout = m.GetUint64("semi_sync_primary_timeout") 613 instance.SemiSyncPrimaryWaitForReplicaCount = m.GetUint("semi_sync_primary_wait_for_replica_count") 614 instance.SemiSyncReplicaEnabled = m.GetBool("semi_sync_replica_enabled") 615 instance.SemiSyncPrimaryStatus = m.GetBool("semi_sync_primary_status") 616 instance.SemiSyncPrimaryClients = m.GetUint("semi_sync_primary_clients") 617 instance.SemiSyncReplicaStatus = m.GetBool("semi_sync_replica_status") 618 instance.ReplicationDepth = m.GetUint("replication_depth") 619 instance.IsCoPrimary = m.GetBool("is_co_primary") 620 instance.HasReplicationCredentials = m.GetBool("has_replication_credentials") 621 instance.IsUpToDate = (m.GetUint("seconds_since_last_checked") <= config.Config.InstancePollSeconds) 622 instance.IsRecentlyChecked = (m.GetUint("seconds_since_last_checked") <= config.Config.InstancePollSeconds*5) 623 instance.LastSeenTimestamp = m.GetString("last_seen") 624 instance.IsLastCheckValid = m.GetBool("is_last_check_valid") 625 instance.SecondsSinceLastSeen = m.GetNullInt64("seconds_since_last_seen") 626 instance.IsCandidate = m.GetBool("is_candidate") 627 instance.PromotionRule = promotionrule.CandidatePromotionRule(m.GetString("promotion_rule")) 628 instance.IsDowntimed = m.GetBool("is_downtimed") 629 instance.DowntimeReason = m.GetString("downtime_reason") 630 instance.DowntimeOwner = m.GetString("downtime_owner") 631 instance.DowntimeEndTimestamp = m.GetString("downtime_end_timestamp") 632 instance.ElapsedDowntime = time.Second * time.Duration(m.GetInt("elapsed_downtime_seconds")) 633 instance.UnresolvedHostname = m.GetString("unresolved_hostname") 634 instance.AllowTLS = m.GetBool("allow_tls") 635 instance.InstanceAlias = m.GetString("instance_alias") 636 instance.LastDiscoveryLatency = time.Duration(m.GetInt64("last_discovery_latency")) * time.Nanosecond 637 638 instance.applyFlavorName() 639 640 /* Read Group Replication variables below */ 641 instance.ReplicationGroupName = m.GetString("replication_group_name") 642 instance.ReplicationGroupIsSinglePrimary = m.GetBool("replication_group_is_single_primary_mode") 643 instance.ReplicationGroupMemberState = m.GetString("replication_group_member_state") 644 instance.ReplicationGroupMemberRole = m.GetString("replication_group_member_role") 645 instance.ReplicationGroupPrimaryInstanceKey = InstanceKey{Hostname: m.GetString("replication_group_primary_host"), 646 Port: m.GetInt("replication_group_primary_port")} 647 _ = instance.ReplicationGroupMembers.ReadJSON(m.GetString("replication_group_members")) 648 //instance.ReplicationGroup = m.GetString("replication_group_") 649 650 // problems 651 if !instance.IsLastCheckValid { 652 instance.Problems = append(instance.Problems, "last_check_invalid") 653 } else if !instance.IsRecentlyChecked { 654 instance.Problems = append(instance.Problems, "not_recently_checked") 655 } else if instance.ReplicationThreadsExist() && !instance.ReplicaRunning() { 656 instance.Problems = append(instance.Problems, "not_replicating") 657 } else if instance.ReplicationLagSeconds.Valid && math.AbsInt64(instance.ReplicationLagSeconds.Int64-int64(instance.SQLDelay)) > int64(config.Config.ReasonableReplicationLagSeconds) { 658 instance.Problems = append(instance.Problems, "replication_lag") 659 } 660 if instance.GtidErrant != "" { 661 instance.Problems = append(instance.Problems, "errant_gtid") 662 } 663 // Group replication problems 664 if instance.ReplicationGroupName != "" && instance.ReplicationGroupMemberState != GroupReplicationMemberStateOnline { 665 instance.Problems = append(instance.Problems, "group_replication_member_not_online") 666 } 667 668 return instance 669 } 670 671 // readInstancesByCondition is a generic function to read instances from the backend database 672 func readInstancesByCondition(condition string, args []any, sort string) ([](*Instance), error) { 673 readFunc := func() ([](*Instance), error) { 674 instances := [](*Instance){} 675 676 if sort == "" { 677 sort = `hostname, port` 678 } 679 query := fmt.Sprintf(` 680 select 681 *, 682 unix_timestamp() - unix_timestamp(last_checked) as seconds_since_last_checked, 683 ifnull(last_checked <= last_seen, 0) as is_last_check_valid, 684 unix_timestamp() - unix_timestamp(last_seen) as seconds_since_last_seen, 685 candidate_database_instance.last_suggested is not null 686 and candidate_database_instance.promotion_rule in ('must', 'prefer') as is_candidate, 687 ifnull(nullif(candidate_database_instance.promotion_rule, ''), 'neutral') as promotion_rule, 688 ifnull(unresolved_hostname, '') as unresolved_hostname, 689 (database_instance_downtime.downtime_active is not null and ifnull(database_instance_downtime.end_timestamp, now()) > now()) as is_downtimed, 690 ifnull(database_instance_downtime.reason, '') as downtime_reason, 691 ifnull(database_instance_downtime.owner, '') as downtime_owner, 692 ifnull(unix_timestamp() - unix_timestamp(begin_timestamp), 0) as elapsed_downtime_seconds, 693 ifnull(database_instance_downtime.end_timestamp, '') as downtime_end_timestamp 694 from 695 database_instance 696 left join vitess_tablet using (hostname, port) 697 left join candidate_database_instance using (hostname, port) 698 left join hostname_unresolve using (hostname) 699 left join database_instance_downtime using (hostname, port) 700 where 701 %s 702 order by 703 %s 704 `, condition, sort) 705 706 err := db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { 707 instance := readInstanceRow(m) 708 instances = append(instances, instance) 709 return nil 710 }) 711 if err != nil { 712 log.Error(err) 713 return instances, err 714 } 715 return instances, err 716 } 717 instanceReadChan <- true 718 instances, err := readFunc() 719 <-instanceReadChan 720 return instances, err 721 } 722 723 func readInstancesByExactKey(instanceKey *InstanceKey) ([](*Instance), error) { 724 condition := ` 725 hostname = ? 726 and port = ? 727 ` 728 return readInstancesByCondition(condition, sqlutils.Args(instanceKey.Hostname, instanceKey.Port), "") 729 } 730 731 // ReadInstance reads an instance from the vtorc backend database 732 func ReadInstance(instanceKey *InstanceKey) (*Instance, bool, error) { 733 instances, err := readInstancesByExactKey(instanceKey) 734 // We know there will be at most one (hostname & port are PK) 735 // And we expect to find one 736 readInstanceCounter.Inc(1) 737 if len(instances) == 0 { 738 return nil, false, err 739 } 740 if err != nil { 741 return instances[0], false, err 742 } 743 return instances[0], true, nil 744 } 745 746 // ReadReplicaInstances reads replicas of a given primary 747 func ReadReplicaInstances(primaryKey *InstanceKey) ([](*Instance), error) { 748 condition := ` 749 source_host = ? 750 and source_port = ? 751 ` 752 return readInstancesByCondition(condition, sqlutils.Args(primaryKey.Hostname, primaryKey.Port), "") 753 } 754 755 // ReadReplicaInstancesIncludingBinlogServerSubReplicas returns a list of direct slves including any replicas 756 // of a binlog server replica 757 func ReadReplicaInstancesIncludingBinlogServerSubReplicas(primaryKey *InstanceKey) ([](*Instance), error) { 758 replicas, err := ReadReplicaInstances(primaryKey) 759 if err != nil { 760 return replicas, err 761 } 762 for _, replica := range replicas { 763 replica := replica 764 if replica.IsBinlogServer() { 765 binlogServerReplicas, err := ReadReplicaInstancesIncludingBinlogServerSubReplicas(&replica.Key) 766 if err != nil { 767 return replicas, err 768 } 769 replicas = append(replicas, binlogServerReplicas...) 770 } 771 } 772 return replicas, err 773 } 774 775 // ReadProblemInstances reads all instances with problems 776 func ReadProblemInstances(keyspace string, shard string) ([](*Instance), error) { 777 condition := ` 778 keyspace LIKE (CASE WHEN ? = '' THEN '%' ELSE ? END) 779 and shard LIKE (CASE WHEN ? = '' THEN '%' ELSE ? END) 780 and ( 781 (last_seen < last_checked) 782 or (unix_timestamp() - unix_timestamp(last_checked) > ?) 783 or (replication_sql_thread_state not in (-1 ,1)) 784 or (replication_io_thread_state not in (-1 ,1)) 785 or (abs(cast(replication_lag_seconds as signed) - cast(sql_delay as signed)) > ?) 786 or (abs(cast(replica_lag_seconds as signed) - cast(sql_delay as signed)) > ?) 787 or (gtid_errant != '') 788 or (replication_group_name != '' and replication_group_member_state != 'ONLINE') 789 ) 790 ` 791 792 args := sqlutils.Args(keyspace, keyspace, shard, shard, config.Config.InstancePollSeconds*5, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds) 793 instances, err := readInstancesByCondition(condition, args, "") 794 if err != nil { 795 return instances, err 796 } 797 var reportedInstances [](*Instance) 798 for _, instance := range instances { 799 skip := false 800 if instance.IsDowntimed { 801 skip = true 802 } 803 if !skip { 804 reportedInstances = append(reportedInstances, instance) 805 } 806 } 807 return reportedInstances, nil 808 } 809 810 // ReadLostInRecoveryInstances returns all instances (potentially filtered by cluster) 811 // which are currently indicated as downtimed due to being lost during a topology recovery. 812 func ReadLostInRecoveryInstances(keyspace string, shard string) ([](*Instance), error) { 813 condition := ` 814 ifnull( 815 database_instance_downtime.downtime_active = 1 816 and database_instance_downtime.end_timestamp > now() 817 and database_instance_downtime.reason = ?, 0) 818 and ? IN ('', keyspace) 819 and ? IN ('', shard) 820 ` 821 return readInstancesByCondition(condition, sqlutils.Args(DowntimeLostInRecoveryMessage, keyspace, shard), "keyspace asc, shard asc, replication_depth asc") 822 } 823 824 // readUnseenPrimaryKeys will read list of primaries that have never been seen, and yet whose replicas 825 // seem to be replicating. 826 func readUnseenPrimaryKeys() ([]InstanceKey, error) { 827 res := []InstanceKey{} 828 829 err := db.QueryVTOrcRowsMap(` 830 SELECT DISTINCT 831 replica_instance.source_host, replica_instance.source_port 832 FROM 833 database_instance replica_instance 834 LEFT JOIN 835 hostname_resolve ON (replica_instance.source_host = hostname_resolve.hostname) 836 LEFT JOIN 837 database_instance primary_instance ON ( 838 COALESCE(hostname_resolve.resolved_hostname, replica_instance.source_host) = primary_instance.hostname 839 and replica_instance.source_port = primary_instance.port) 840 WHERE 841 primary_instance.last_checked IS NULL 842 and replica_instance.source_host != '' 843 and replica_instance.source_host != '_' 844 and replica_instance.source_port > 0 845 and replica_instance.replica_io_running = 1 846 `, func(m sqlutils.RowMap) error { 847 instanceKey, _ := NewResolveInstanceKey(m.GetString("source_host"), m.GetInt("source_port")) 848 // we ignore the error. It can be expected that we are unable to resolve the hostname. 849 // Maybe that's how we got here in the first place! 850 res = append(res, *instanceKey) 851 852 return nil 853 }) 854 if err != nil { 855 log.Error(err) 856 return res, err 857 } 858 859 return res, nil 860 } 861 862 // ForgetUnseenInstancesDifferentlyResolved will purge instances which are invalid, and whose hostname 863 // appears on the hostname_resolved table; this means some time in the past their hostname was unresovled, and now 864 // resovled to a different value; the old hostname is never accessed anymore and the old entry should be removed. 865 func ForgetUnseenInstancesDifferentlyResolved() error { 866 query := ` 867 select 868 database_instance.hostname, database_instance.port 869 from 870 hostname_resolve 871 JOIN database_instance ON (hostname_resolve.hostname = database_instance.hostname) 872 where 873 hostname_resolve.hostname != hostname_resolve.resolved_hostname 874 AND ifnull(last_checked <= last_seen, 0) = 0 875 ` 876 keys := NewInstanceKeyMap() 877 err := db.QueryVTOrc(query, nil, func(m sqlutils.RowMap) error { 878 key := InstanceKey{ 879 Hostname: m.GetString("hostname"), 880 Port: m.GetInt("port"), 881 } 882 keys.AddKey(key) 883 return nil 884 }) 885 var rowsAffected int64 886 for _, key := range keys.GetInstanceKeys() { 887 sqlResult, err := db.ExecVTOrc(` 888 delete from 889 database_instance 890 where 891 hostname = ? and port = ? 892 `, key.Hostname, key.Port, 893 ) 894 if err != nil { 895 log.Error(err) 896 return err 897 } 898 rows, err := sqlResult.RowsAffected() 899 if err != nil { 900 log.Error(err) 901 return err 902 } 903 rowsAffected = rowsAffected + rows 904 } 905 _ = AuditOperation("forget-unseen-differently-resolved", nil, fmt.Sprintf("Forgotten instances: %d", rowsAffected)) 906 return err 907 } 908 909 // readUnknownPrimaryHostnameResolves will figure out the resolved hostnames of primary-hosts which cannot be found. 910 // It uses the hostname_resolve_history table to heuristically guess the correct hostname (based on "this was the 911 // last time we saw this hostname and it resolves into THAT") 912 func readUnknownPrimaryHostnameResolves() (map[string]string, error) { 913 res := make(map[string]string) 914 err := db.QueryVTOrcRowsMap(` 915 SELECT DISTINCT 916 replica_instance.source_host, hostname_resolve_history.resolved_hostname 917 FROM 918 database_instance replica_instance 919 LEFT JOIN hostname_resolve ON (replica_instance.source_host = hostname_resolve.hostname) 920 LEFT JOIN database_instance primary_instance ON ( 921 COALESCE(hostname_resolve.resolved_hostname, replica_instance.source_host) = primary_instance.hostname 922 and replica_instance.source_port = primary_instance.port 923 ) LEFT JOIN hostname_resolve_history ON (replica_instance.source_host = hostname_resolve_history.hostname) 924 WHERE 925 primary_instance.last_checked IS NULL 926 and replica_instance.source_host != '' 927 and replica_instance.source_host != '_' 928 and replica_instance.source_port > 0 929 `, func(m sqlutils.RowMap) error { 930 res[m.GetString("source_host")] = m.GetString("resolved_hostname") 931 return nil 932 }) 933 if err != nil { 934 log.Error(err) 935 return res, err 936 } 937 938 return res, nil 939 } 940 941 // ResolveUnknownPrimaryHostnameResolves fixes missing hostname resolves based on hostname_resolve_history 942 // The use case is replicas replicating from some unknown-hostname which cannot be otherwise found. This could 943 // happen due to an expire unresolve together with clearing up of hostname cache. 944 func ResolveUnknownPrimaryHostnameResolves() error { 945 946 hostnameResolves, err := readUnknownPrimaryHostnameResolves() 947 if err != nil { 948 return err 949 } 950 for hostname, resolvedHostname := range hostnameResolves { 951 UpdateResolvedHostname(hostname, resolvedHostname) 952 } 953 954 _ = AuditOperation("resolve-unknown-primaries", nil, fmt.Sprintf("Num resolved hostnames: %d", len(hostnameResolves))) 955 return err 956 } 957 958 // GetKeyspaceShardName gets the keyspace shard name for the given instance key 959 func GetKeyspaceShardName(instanceKey *InstanceKey) (keyspace string, shard string, err error) { 960 query := ` 961 select 962 keyspace, 963 shard 964 from 965 vitess_tablet 966 where 967 hostname = ? 968 and port = ? 969 ` 970 err = db.QueryVTOrc(query, sqlutils.Args(instanceKey.Hostname, instanceKey.Port), func(m sqlutils.RowMap) error { 971 keyspace = m.GetString("keyspace") 972 shard = m.GetString("shard") 973 return nil 974 }) 975 if err != nil { 976 log.Error(err) 977 } 978 return keyspace, shard, err 979 } 980 981 // ReadOutdatedInstanceKeys reads and returns keys for all instances that are not up to date (i.e. 982 // pre-configured time has passed since they were last checked) or the ones whose tablet information was read 983 // but not the mysql information. This could happen if the durability policy of the keyspace wasn't 984 // available at the time it was discovered. This would lead to not having the record of the tablet in the 985 // database_instance table. 986 // We also check for the case where an attempt at instance checking has been made, that hasn't 987 // resulted in an actual check! This can happen when TCP/IP connections are hung, in which case the "check" 988 // never returns. In such case we multiply interval by a factor, so as not to open too many connections on 989 // the instance. 990 func ReadOutdatedInstanceKeys() ([]InstanceKey, error) { 991 res := []InstanceKey{} 992 query := ` 993 SELECT 994 hostname, port 995 FROM 996 database_instance 997 WHERE 998 CASE 999 WHEN last_attempted_check <= last_checked 1000 THEN last_checked < now() - interval ? second 1001 ELSE last_checked < now() - interval ? second 1002 END 1003 UNION 1004 SELECT 1005 vitess_tablet.hostname, vitess_tablet.port 1006 FROM 1007 vitess_tablet LEFT JOIN database_instance ON ( 1008 vitess_tablet.hostname = database_instance.hostname 1009 AND vitess_tablet.port = database_instance.port 1010 ) 1011 WHERE 1012 database_instance.hostname IS NULL 1013 ` 1014 args := sqlutils.Args(config.Config.InstancePollSeconds, 2*config.Config.InstancePollSeconds) 1015 1016 err := db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { 1017 instanceKey, merr := NewResolveInstanceKey(m.GetString("hostname"), m.GetInt("port")) 1018 if merr != nil { 1019 log.Error(merr) 1020 } else if !InstanceIsForgotten(instanceKey) { 1021 // only if not in "forget" cache 1022 res = append(res, *instanceKey) 1023 } 1024 // We don;t return an error because we want to keep filling the outdated instances list. 1025 return nil 1026 }) 1027 1028 if err != nil { 1029 log.Error(err) 1030 } 1031 return res, err 1032 1033 } 1034 1035 func mkInsertOdku(table string, columns []string, values []string, nrRows int, insertIgnore bool) (string, error) { 1036 if len(columns) == 0 { 1037 return "", errors.New("Column list cannot be empty") 1038 } 1039 if nrRows < 1 { 1040 return "", errors.New("nrRows must be a positive number") 1041 } 1042 if len(columns) != len(values) { 1043 return "", errors.New("number of values must be equal to number of columns") 1044 } 1045 1046 var q bytes.Buffer 1047 var ignore string 1048 if insertIgnore { 1049 ignore = "ignore" 1050 } 1051 var valRow = fmt.Sprintf("(%s)", strings.Join(values, ", ")) 1052 var val bytes.Buffer 1053 val.WriteString(valRow) 1054 for i := 1; i < nrRows; i++ { 1055 val.WriteString(",\n ") // indent VALUES, see below 1056 val.WriteString(valRow) 1057 } 1058 1059 var col = strings.Join(columns, ", ") 1060 var odku bytes.Buffer 1061 odku.WriteString(fmt.Sprintf("%s=VALUES(%s)", columns[0], columns[0])) 1062 for _, c := range columns[1:] { 1063 odku.WriteString(", ") 1064 odku.WriteString(fmt.Sprintf("%s=VALUES(%s)", c, c)) 1065 } 1066 1067 q.WriteString(fmt.Sprintf(`INSERT %s INTO %s 1068 (%s) 1069 VALUES 1070 %s 1071 ON DUPLICATE KEY UPDATE 1072 %s 1073 `, 1074 ignore, table, col, val.String(), odku.String())) 1075 1076 return q.String(), nil 1077 } 1078 1079 func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bool, updateLastSeen bool) (string, []any, error) { 1080 if len(instances) == 0 { 1081 return "", nil, nil 1082 } 1083 1084 insertIgnore := false 1085 if !instanceWasActuallyFound { 1086 insertIgnore = true 1087 } 1088 var columns = []string{ 1089 "hostname", 1090 "port", 1091 "last_checked", 1092 "last_attempted_check", 1093 "last_check_partial_success", 1094 "server_id", 1095 "server_uuid", 1096 "version", 1097 "major_version", 1098 "version_comment", 1099 "binlog_server", 1100 "read_only", 1101 "binlog_format", 1102 "binlog_row_image", 1103 "log_bin", 1104 "log_replica_updates", 1105 "binary_log_file", 1106 "binary_log_pos", 1107 "source_host", 1108 "source_port", 1109 "replica_sql_running", 1110 "replica_io_running", 1111 "replication_sql_thread_state", 1112 "replication_io_thread_state", 1113 "has_replication_filters", 1114 "supports_oracle_gtid", 1115 "oracle_gtid", 1116 "source_uuid", 1117 "ancestry_uuid", 1118 "executed_gtid_set", 1119 "gtid_mode", 1120 "gtid_purged", 1121 "gtid_errant", 1122 "mariadb_gtid", 1123 "pseudo_gtid", 1124 "source_log_file", 1125 "read_source_log_pos", 1126 "relay_source_log_file", 1127 "exec_source_log_pos", 1128 "relay_log_file", 1129 "relay_log_pos", 1130 "last_sql_error", 1131 "last_io_error", 1132 "replication_lag_seconds", 1133 "replica_lag_seconds", 1134 "sql_delay", 1135 "data_center", 1136 "region", 1137 "physical_environment", 1138 "replication_depth", 1139 "is_co_primary", 1140 "has_replication_credentials", 1141 "allow_tls", 1142 "semi_sync_enforced", 1143 "semi_sync_primary_enabled", 1144 "semi_sync_primary_timeout", 1145 "semi_sync_primary_wait_for_replica_count", 1146 "semi_sync_replica_enabled", 1147 "semi_sync_primary_status", 1148 "semi_sync_primary_clients", 1149 "semi_sync_replica_status", 1150 "instance_alias", 1151 "last_discovery_latency", 1152 "replication_group_name", 1153 "replication_group_is_single_primary_mode", 1154 "replication_group_member_state", 1155 "replication_group_member_role", 1156 "replication_group_members", 1157 "replication_group_primary_host", 1158 "replication_group_primary_port", 1159 } 1160 1161 var values = make([]string, len(columns)) 1162 for i := range columns { 1163 values[i] = "?" 1164 } 1165 values[2] = "NOW()" // last_checked 1166 values[3] = "NOW()" // last_attempted_check 1167 values[4] = "1" // last_check_partial_success 1168 1169 if updateLastSeen { 1170 columns = append(columns, "last_seen") 1171 values = append(values, "NOW()") 1172 } 1173 1174 var args []any 1175 for _, instance := range instances { 1176 // number of columns minus 2 as last_checked and last_attempted_check 1177 // updated with NOW() 1178 args = append(args, instance.Key.Hostname) 1179 args = append(args, instance.Key.Port) 1180 args = append(args, instance.ServerID) 1181 args = append(args, instance.ServerUUID) 1182 args = append(args, instance.Version) 1183 args = append(args, instance.MajorVersionString()) 1184 args = append(args, instance.VersionComment) 1185 args = append(args, instance.IsBinlogServer()) 1186 args = append(args, instance.ReadOnly) 1187 args = append(args, instance.BinlogFormat) 1188 args = append(args, instance.BinlogRowImage) 1189 args = append(args, instance.LogBinEnabled) 1190 args = append(args, instance.LogReplicationUpdatesEnabled) 1191 args = append(args, instance.SelfBinlogCoordinates.LogFile) 1192 args = append(args, instance.SelfBinlogCoordinates.LogPos) 1193 args = append(args, instance.SourceKey.Hostname) 1194 args = append(args, instance.SourceKey.Port) 1195 args = append(args, instance.ReplicationSQLThreadRuning) 1196 args = append(args, instance.ReplicationIOThreadRuning) 1197 args = append(args, instance.ReplicationSQLThreadState) 1198 args = append(args, instance.ReplicationIOThreadState) 1199 args = append(args, instance.HasReplicationFilters) 1200 args = append(args, instance.SupportsOracleGTID) 1201 args = append(args, instance.UsingOracleGTID) 1202 args = append(args, instance.SourceUUID) 1203 args = append(args, instance.AncestryUUID) 1204 args = append(args, instance.ExecutedGtidSet) 1205 args = append(args, instance.GTIDMode) 1206 args = append(args, instance.GtidPurged) 1207 args = append(args, instance.GtidErrant) 1208 args = append(args, instance.UsingMariaDBGTID) 1209 args = append(args, instance.UsingPseudoGTID) 1210 args = append(args, instance.ReadBinlogCoordinates.LogFile) 1211 args = append(args, instance.ReadBinlogCoordinates.LogPos) 1212 args = append(args, instance.ExecBinlogCoordinates.LogFile) 1213 args = append(args, instance.ExecBinlogCoordinates.LogPos) 1214 args = append(args, instance.RelaylogCoordinates.LogFile) 1215 args = append(args, instance.RelaylogCoordinates.LogPos) 1216 args = append(args, instance.LastSQLError) 1217 args = append(args, instance.LastIOError) 1218 args = append(args, instance.SecondsBehindPrimary) 1219 args = append(args, instance.ReplicationLagSeconds) 1220 args = append(args, instance.SQLDelay) 1221 args = append(args, instance.DataCenter) 1222 args = append(args, instance.Region) 1223 args = append(args, instance.PhysicalEnvironment) 1224 args = append(args, instance.ReplicationDepth) 1225 args = append(args, instance.IsCoPrimary) 1226 args = append(args, instance.HasReplicationCredentials) 1227 args = append(args, instance.AllowTLS) 1228 args = append(args, instance.SemiSyncEnforced) 1229 args = append(args, instance.SemiSyncPrimaryEnabled) 1230 args = append(args, instance.SemiSyncPrimaryTimeout) 1231 args = append(args, instance.SemiSyncPrimaryWaitForReplicaCount) 1232 args = append(args, instance.SemiSyncReplicaEnabled) 1233 args = append(args, instance.SemiSyncPrimaryStatus) 1234 args = append(args, instance.SemiSyncPrimaryClients) 1235 args = append(args, instance.SemiSyncReplicaStatus) 1236 args = append(args, instance.InstanceAlias) 1237 args = append(args, instance.LastDiscoveryLatency.Nanoseconds()) 1238 args = append(args, instance.ReplicationGroupName) 1239 args = append(args, instance.ReplicationGroupIsSinglePrimary) 1240 args = append(args, instance.ReplicationGroupMemberState) 1241 args = append(args, instance.ReplicationGroupMemberRole) 1242 args = append(args, instance.ReplicationGroupMembers.ToJSONString()) 1243 args = append(args, instance.ReplicationGroupPrimaryInstanceKey.Hostname) 1244 args = append(args, instance.ReplicationGroupPrimaryInstanceKey.Port) 1245 } 1246 1247 sql, err := mkInsertOdku("database_instance", columns, values, len(instances), insertIgnore) 1248 if err != nil { 1249 errMsg := fmt.Sprintf("Failed to build query: %v", err) 1250 log.Errorf(errMsg) 1251 return sql, args, fmt.Errorf(errMsg) 1252 } 1253 1254 return sql, args, nil 1255 } 1256 1257 // writeManyInstances stores instances in the vtorc backend 1258 func writeManyInstances(instances []*Instance, instanceWasActuallyFound bool, updateLastSeen bool) error { 1259 writeInstances := [](*Instance){} 1260 for _, instance := range instances { 1261 if InstanceIsForgotten(&instance.Key) && !instance.IsSeed() { 1262 continue 1263 } 1264 writeInstances = append(writeInstances, instance) 1265 } 1266 if len(writeInstances) == 0 { 1267 return nil // nothing to write 1268 } 1269 sql, args, err := mkInsertOdkuForInstances(writeInstances, instanceWasActuallyFound, updateLastSeen) 1270 if err != nil { 1271 return err 1272 } 1273 if _, err := db.ExecVTOrc(sql, args...); err != nil { 1274 return err 1275 } 1276 return nil 1277 } 1278 1279 // WriteInstance stores an instance in the vtorc backend 1280 func WriteInstance(instance *Instance, instanceWasActuallyFound bool, lastError error) error { 1281 if lastError != nil { 1282 log.Infof("writeInstance: will not update database_instance due to error: %+v", lastError) 1283 return nil 1284 } 1285 return writeManyInstances([]*Instance{instance}, instanceWasActuallyFound, true) 1286 } 1287 1288 // UpdateInstanceLastChecked updates the last_check timestamp in the vtorc backed database 1289 // for a given instance 1290 func UpdateInstanceLastChecked(instanceKey *InstanceKey, partialSuccess bool) error { 1291 writeFunc := func() error { 1292 _, err := db.ExecVTOrc(` 1293 update 1294 database_instance 1295 set 1296 last_checked = NOW(), 1297 last_check_partial_success = ? 1298 where 1299 hostname = ? 1300 and port = ?`, 1301 partialSuccess, 1302 instanceKey.Hostname, 1303 instanceKey.Port, 1304 ) 1305 if err != nil { 1306 log.Error(err) 1307 } 1308 return err 1309 } 1310 return ExecDBWriteFunc(writeFunc) 1311 } 1312 1313 // UpdateInstanceLastAttemptedCheck updates the last_attempted_check timestamp in the vtorc backed database 1314 // for a given instance. 1315 // This is used as a failsafe mechanism in case access to the instance gets hung (it happens), in which case 1316 // the entire ReadTopology gets stuck (and no, connection timeout nor driver timeouts don't help. Don't look at me, 1317 // the world is a harsh place to live in). 1318 // And so we make sure to note down *before* we even attempt to access the instance; and this raises a red flag when we 1319 // wish to access the instance again: if last_attempted_check is *newer* than last_checked, that's bad news and means 1320 // we have a "hanging" issue. 1321 func UpdateInstanceLastAttemptedCheck(instanceKey *InstanceKey) error { 1322 writeFunc := func() error { 1323 _, err := db.ExecVTOrc(` 1324 update 1325 database_instance 1326 set 1327 last_attempted_check = NOW() 1328 where 1329 hostname = ? 1330 and port = ?`, 1331 instanceKey.Hostname, 1332 instanceKey.Port, 1333 ) 1334 if err != nil { 1335 log.Error(err) 1336 } 1337 return err 1338 } 1339 return ExecDBWriteFunc(writeFunc) 1340 } 1341 1342 func InstanceIsForgotten(instanceKey *InstanceKey) bool { 1343 _, found := forgetInstanceKeys.Get(instanceKey.StringCode()) 1344 return found 1345 } 1346 1347 // ForgetInstance removes an instance entry from the vtorc backed database. 1348 // It may be auto-rediscovered through topology or requested for discovery by multiple means. 1349 func ForgetInstance(instanceKey *InstanceKey) error { 1350 if instanceKey == nil { 1351 errMsg := "ForgetInstance(): nil instanceKey" 1352 log.Errorf(errMsg) 1353 return fmt.Errorf(errMsg) 1354 } 1355 forgetInstanceKeys.Set(instanceKey.StringCode(), true, cache.DefaultExpiration) 1356 sqlResult, err := db.ExecVTOrc(` 1357 delete 1358 from database_instance 1359 where 1360 hostname = ? and port = ?`, 1361 instanceKey.Hostname, 1362 instanceKey.Port, 1363 ) 1364 if err != nil { 1365 log.Error(err) 1366 return err 1367 } 1368 rows, err := sqlResult.RowsAffected() 1369 if err != nil { 1370 log.Error(err) 1371 return err 1372 } 1373 if rows == 0 { 1374 errMsg := fmt.Sprintf("ForgetInstance(): instance %+v not found", *instanceKey) 1375 log.Errorf(errMsg) 1376 return fmt.Errorf(errMsg) 1377 } 1378 _ = AuditOperation("forget", instanceKey, "") 1379 return nil 1380 } 1381 1382 // ForgetLongUnseenInstances will remove entries of all instacnes that have long since been last seen. 1383 func ForgetLongUnseenInstances() error { 1384 sqlResult, err := db.ExecVTOrc(` 1385 delete 1386 from database_instance 1387 where 1388 last_seen < NOW() - interval ? hour`, 1389 config.UnseenInstanceForgetHours, 1390 ) 1391 if err != nil { 1392 log.Error(err) 1393 return err 1394 } 1395 rows, err := sqlResult.RowsAffected() 1396 if err != nil { 1397 log.Error(err) 1398 return err 1399 } 1400 _ = AuditOperation("forget-unseen", nil, fmt.Sprintf("Forgotten instances: %d", rows)) 1401 return err 1402 } 1403 1404 // SnapshotTopologies records topology graph for all existing topologies 1405 func SnapshotTopologies() error { 1406 writeFunc := func() error { 1407 _, err := db.ExecVTOrc(` 1408 insert ignore into 1409 database_instance_topology_history (snapshot_unix_timestamp, 1410 hostname, port, source_host, source_port, version) 1411 select 1412 UNIX_TIMESTAMP(NOW()), 1413 hostname, port, source_host, source_port, version 1414 from 1415 database_instance 1416 `, 1417 ) 1418 if err != nil { 1419 log.Error(err) 1420 return err 1421 } 1422 1423 return nil 1424 } 1425 return ExecDBWriteFunc(writeFunc) 1426 } 1427 1428 // RecordStaleInstanceBinlogCoordinates snapshots the binlog coordinates of instances 1429 func RecordStaleInstanceBinlogCoordinates(instanceKey *InstanceKey, binlogCoordinates *BinlogCoordinates) error { 1430 args := sqlutils.Args( 1431 instanceKey.Hostname, instanceKey.Port, 1432 binlogCoordinates.LogFile, binlogCoordinates.LogPos, 1433 ) 1434 _, err := db.ExecVTOrc(` 1435 delete from 1436 database_instance_stale_binlog_coordinates 1437 where 1438 hostname=? and port=? 1439 and ( 1440 binary_log_file != ? 1441 or binary_log_pos != ? 1442 ) 1443 `, 1444 args..., 1445 ) 1446 if err != nil { 1447 log.Error(err) 1448 return err 1449 } 1450 _, err = db.ExecVTOrc(` 1451 insert ignore into 1452 database_instance_stale_binlog_coordinates ( 1453 hostname, port, binary_log_file, binary_log_pos, first_seen 1454 ) 1455 values ( 1456 ?, ?, ?, ?, NOW() 1457 )`, 1458 args...) 1459 if err != nil { 1460 log.Error(err) 1461 } 1462 return err 1463 } 1464 1465 func ExpireStaleInstanceBinlogCoordinates() error { 1466 expireSeconds := config.Config.ReasonableReplicationLagSeconds * 2 1467 if expireSeconds < config.StaleInstanceCoordinatesExpireSeconds { 1468 expireSeconds = config.StaleInstanceCoordinatesExpireSeconds 1469 } 1470 writeFunc := func() error { 1471 _, err := db.ExecVTOrc(` 1472 delete from database_instance_stale_binlog_coordinates 1473 where first_seen < NOW() - INTERVAL ? SECOND 1474 `, expireSeconds, 1475 ) 1476 if err != nil { 1477 log.Error(err) 1478 } 1479 return err 1480 } 1481 return ExecDBWriteFunc(writeFunc) 1482 }