vitess.io/vitess@v0.16.2/go/vt/vtorc/inst/analysis_dao.go (about) 1 /* 2 Copyright 2015 Shlomi Noach, courtesy Booking.com 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package inst 18 19 import ( 20 "fmt" 21 "time" 22 23 "vitess.io/vitess/go/vt/external/golib/sqlutils" 24 "vitess.io/vitess/go/vt/log" 25 26 "google.golang.org/protobuf/encoding/prototext" 27 28 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 29 "vitess.io/vitess/go/vt/topo" 30 "vitess.io/vitess/go/vt/vtctl/reparentutil" 31 "vitess.io/vitess/go/vt/vtorc/config" 32 "vitess.io/vitess/go/vt/vtorc/db" 33 "vitess.io/vitess/go/vt/vtorc/process" 34 "vitess.io/vitess/go/vt/vtorc/util" 35 36 "github.com/patrickmn/go-cache" 37 "github.com/rcrowley/go-metrics" 38 ) 39 40 var analysisChangeWriteAttemptCounter = metrics.NewCounter() 41 var analysisChangeWriteCounter = metrics.NewCounter() 42 43 var recentInstantAnalysis *cache.Cache 44 45 func init() { 46 _ = metrics.Register("analysis.change.write.attempt", analysisChangeWriteAttemptCounter) 47 _ = metrics.Register("analysis.change.write", analysisChangeWriteCounter) 48 49 go initializeAnalysisDaoPostConfiguration() 50 } 51 52 func initializeAnalysisDaoPostConfiguration() { 53 config.WaitForConfigurationToBeLoaded() 54 55 recentInstantAnalysis = cache.New(time.Duration(config.Config.RecoveryPollSeconds*2)*time.Second, time.Second) 56 } 57 58 type clusterAnalysis struct { 59 hasClusterwideAction bool 60 primaryKey *InstanceKey 61 durability reparentutil.Durabler 62 } 63 64 // GetReplicationAnalysis will check for replication problems (dead primary; unreachable primary; etc) 65 func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAnalysisHints) ([]ReplicationAnalysis, error) { 66 result := []ReplicationAnalysis{} 67 68 // TODO(sougou); deprecate ReduceReplicationAnalysisCount 69 args := sqlutils.Args(config.Config.ReasonableReplicationLagSeconds, ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, keyspace, shard) 70 query := ` 71 SELECT 72 vitess_tablet.info AS tablet_info, 73 vitess_tablet.hostname, 74 vitess_tablet.port, 75 vitess_tablet.tablet_type, 76 vitess_tablet.primary_timestamp, 77 vitess_tablet.shard AS shard, 78 vitess_keyspace.keyspace AS keyspace, 79 vitess_keyspace.keyspace_type AS keyspace_type, 80 vitess_keyspace.durability_policy AS durability_policy, 81 primary_instance.read_only AS read_only, 82 MIN(primary_instance.hostname) IS NULL AS is_invalid, 83 MIN(primary_instance.data_center) AS data_center, 84 MIN(primary_instance.region) AS region, 85 MIN(primary_instance.physical_environment) AS physical_environment, 86 MIN(primary_instance.source_host) AS source_host, 87 MIN(primary_instance.source_port) AS source_port, 88 MIN(primary_instance.binary_log_file) AS binary_log_file, 89 MIN(primary_instance.binary_log_pos) AS binary_log_pos, 90 MIN(primary_tablet.info) AS primary_tablet_info, 91 MIN( 92 IFNULL( 93 primary_instance.binary_log_file = database_instance_stale_binlog_coordinates.binary_log_file 94 AND primary_instance.binary_log_pos = database_instance_stale_binlog_coordinates.binary_log_pos 95 AND database_instance_stale_binlog_coordinates.first_seen < NOW() - interval ? second, 96 0 97 ) 98 ) AS is_stale_binlog_coordinates, 99 MIN( 100 primary_instance.last_checked <= primary_instance.last_seen 101 and primary_instance.last_attempted_check <= primary_instance.last_seen + interval ? second 102 ) = 1 AS is_last_check_valid, 103 /* To be considered a primary, traditional async replication must not be present/valid AND the host should either */ 104 /* not be a replication group member OR be the primary of the replication group */ 105 MIN(primary_instance.last_check_partial_success) as last_check_partial_success, 106 MIN( 107 ( 108 primary_instance.source_host IN ('', '_') 109 OR primary_instance.source_port = 0 110 OR substr(primary_instance.source_host, 1, 2) = '//' 111 ) 112 AND ( 113 primary_instance.replication_group_name = '' 114 OR primary_instance.replication_group_member_role = 'PRIMARY' 115 ) 116 ) AS is_primary, 117 MIN(primary_instance.is_co_primary) AS is_co_primary, 118 MIN(primary_instance.gtid_mode) AS gtid_mode, 119 COUNT(replica_instance.server_id) AS count_replicas, 120 IFNULL( 121 SUM( 122 replica_instance.last_checked <= replica_instance.last_seen 123 ), 124 0 125 ) AS count_valid_replicas, 126 IFNULL( 127 SUM( 128 replica_instance.last_checked <= replica_instance.last_seen 129 AND replica_instance.replica_io_running != 0 130 AND replica_instance.replica_sql_running != 0 131 ), 132 0 133 ) AS count_valid_replicating_replicas, 134 IFNULL( 135 SUM( 136 replica_instance.last_checked <= replica_instance.last_seen 137 AND replica_instance.replica_io_running = 0 138 AND replica_instance.last_io_error like '%%error %%connecting to master%%' 139 AND replica_instance.replica_sql_running = 1 140 ), 141 0 142 ) AS count_replicas_failing_to_connect_to_primary, 143 MIN(primary_instance.replication_depth) AS replication_depth, 144 MIN( 145 primary_instance.replica_sql_running = 1 146 AND primary_instance.replica_io_running = 0 147 AND primary_instance.last_io_error like '%%error %%connecting to master%%' 148 ) AS is_failing_to_connect_to_primary, 149 MIN( 150 primary_instance.replica_sql_running = 0 151 OR primary_instance.replica_io_running = 0 152 ) AS replication_stopped, 153 MIN( 154 primary_downtime.downtime_active is not null 155 and ifnull(primary_downtime.end_timestamp, now()) > now() 156 ) AS is_downtimed, 157 MIN( 158 IFNULL(primary_downtime.end_timestamp, '') 159 ) AS downtime_end_timestamp, 160 MIN( 161 IFNULL( 162 unix_timestamp() - unix_timestamp(primary_downtime.end_timestamp), 163 0 164 ) 165 ) AS downtime_remaining_seconds, 166 MIN( 167 primary_instance.binlog_server 168 ) AS is_binlog_server, 169 MIN( 170 primary_instance.supports_oracle_gtid 171 ) AS supports_oracle_gtid, 172 MIN( 173 primary_instance.semi_sync_primary_enabled 174 ) AS semi_sync_primary_enabled, 175 MIN( 176 primary_instance.semi_sync_primary_wait_for_replica_count 177 ) AS semi_sync_primary_wait_for_replica_count, 178 MIN( 179 primary_instance.semi_sync_primary_clients 180 ) AS semi_sync_primary_clients, 181 MIN( 182 primary_instance.semi_sync_primary_status 183 ) AS semi_sync_primary_status, 184 MIN( 185 primary_instance.semi_sync_replica_enabled 186 ) AS semi_sync_replica_enabled, 187 SUM(replica_instance.is_co_primary) AS count_co_primary_replicas, 188 SUM(replica_instance.oracle_gtid) AS count_oracle_gtid_replicas, 189 IFNULL( 190 SUM( 191 replica_instance.last_checked <= replica_instance.last_seen 192 AND replica_instance.oracle_gtid != 0 193 ), 194 0 195 ) AS count_valid_oracle_gtid_replicas, 196 SUM( 197 replica_instance.binlog_server 198 ) AS count_binlog_server_replicas, 199 IFNULL( 200 SUM( 201 replica_instance.last_checked <= replica_instance.last_seen 202 AND replica_instance.binlog_server != 0 203 ), 204 0 205 ) AS count_valid_binlog_server_replicas, 206 SUM( 207 replica_instance.semi_sync_replica_enabled 208 ) AS count_semi_sync_replicas, 209 IFNULL( 210 SUM( 211 replica_instance.last_checked <= replica_instance.last_seen 212 AND replica_instance.semi_sync_replica_enabled != 0 213 ), 214 0 215 ) AS count_valid_semi_sync_replicas, 216 MIN( 217 primary_instance.mariadb_gtid 218 ) AS is_mariadb_gtid, 219 SUM(replica_instance.mariadb_gtid) AS count_mariadb_gtid_replicas, 220 IFNULL( 221 SUM( 222 replica_instance.last_checked <= replica_instance.last_seen 223 AND replica_instance.mariadb_gtid != 0 224 ), 225 0 226 ) AS count_valid_mariadb_gtid_replicas, 227 IFNULL( 228 SUM( 229 replica_instance.log_bin 230 AND replica_instance.log_replica_updates 231 ), 232 0 233 ) AS count_logging_replicas, 234 IFNULL( 235 SUM( 236 replica_instance.log_bin 237 AND replica_instance.log_replica_updates 238 AND replica_instance.binlog_format = 'STATEMENT' 239 ), 240 0 241 ) AS count_statement_based_logging_replicas, 242 IFNULL( 243 SUM( 244 replica_instance.log_bin 245 AND replica_instance.log_replica_updates 246 AND replica_instance.binlog_format = 'MIXED' 247 ), 248 0 249 ) AS count_mixed_based_logging_replicas, 250 IFNULL( 251 SUM( 252 replica_instance.log_bin 253 AND replica_instance.log_replica_updates 254 AND replica_instance.binlog_format = 'ROW' 255 ), 256 0 257 ) AS count_row_based_logging_replicas, 258 IFNULL( 259 SUM(replica_instance.sql_delay > 0), 260 0 261 ) AS count_delayed_replicas, 262 IFNULL( 263 SUM(replica_instance.replica_lag_seconds > ?), 264 0 265 ) AS count_lagging_replicas, 266 IFNULL(MIN(replica_instance.gtid_mode), '') AS min_replica_gtid_mode, 267 IFNULL(MAX(replica_instance.gtid_mode), '') AS max_replica_gtid_mode, 268 IFNULL( 269 MAX( 270 case when replica_downtime.downtime_active is not null 271 and ifnull(replica_downtime.end_timestamp, now()) > now() then '' else replica_instance.gtid_errant end 272 ), 273 '' 274 ) AS max_replica_gtid_errant, 275 IFNULL( 276 SUM( 277 replica_downtime.downtime_active is not null 278 and ifnull(replica_downtime.end_timestamp, now()) > now() 279 ), 280 0 281 ) AS count_downtimed_replicas, 282 COUNT( 283 DISTINCT case when replica_instance.log_bin 284 AND replica_instance.log_replica_updates then replica_instance.major_version else NULL end 285 ) AS count_distinct_logging_major_versions 286 FROM 287 vitess_tablet 288 JOIN vitess_keyspace ON ( 289 vitess_tablet.keyspace = vitess_keyspace.keyspace 290 ) 291 LEFT JOIN database_instance primary_instance ON ( 292 vitess_tablet.hostname = primary_instance.hostname 293 AND vitess_tablet.port = primary_instance.port 294 ) 295 LEFT JOIN vitess_tablet primary_tablet ON ( 296 primary_tablet.hostname = primary_instance.source_host 297 AND primary_tablet.port = primary_instance.source_port 298 ) 299 LEFT JOIN hostname_resolve ON ( 300 primary_instance.hostname = hostname_resolve.hostname 301 ) 302 LEFT JOIN database_instance replica_instance ON ( 303 COALESCE( 304 hostname_resolve.resolved_hostname, 305 primary_instance.hostname 306 ) = replica_instance.source_host 307 AND primary_instance.port = replica_instance.source_port 308 ) 309 LEFT JOIN database_instance_maintenance ON ( 310 primary_instance.hostname = database_instance_maintenance.hostname 311 AND primary_instance.port = database_instance_maintenance.port 312 AND database_instance_maintenance.maintenance_active = 1 313 ) 314 LEFT JOIN database_instance_stale_binlog_coordinates ON ( 315 primary_instance.hostname = database_instance_stale_binlog_coordinates.hostname 316 AND primary_instance.port = database_instance_stale_binlog_coordinates.port 317 ) 318 LEFT JOIN database_instance_downtime as primary_downtime ON ( 319 primary_instance.hostname = primary_downtime.hostname 320 AND primary_instance.port = primary_downtime.port 321 AND primary_downtime.downtime_active = 1 322 ) 323 LEFT JOIN database_instance_downtime as replica_downtime ON ( 324 replica_instance.hostname = replica_downtime.hostname 325 AND replica_instance.port = replica_downtime.port 326 AND replica_downtime.downtime_active = 1 327 ) 328 WHERE 329 database_instance_maintenance.database_instance_maintenance_id IS NULL 330 AND ? IN ('', vitess_keyspace.keyspace) 331 AND ? IN ('', vitess_tablet.shard) 332 GROUP BY 333 vitess_tablet.hostname, 334 vitess_tablet.port 335 ORDER BY 336 vitess_tablet.tablet_type ASC, 337 vitess_tablet.primary_timestamp DESC 338 ` 339 340 clusters := make(map[string]*clusterAnalysis) 341 err := db.Db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { 342 a := ReplicationAnalysis{ 343 Analysis: NoProblem, 344 ProcessingNodeHostname: process.ThisHostname, 345 ProcessingNodeToken: util.ProcessToken.Hash, 346 } 347 348 tablet := &topodatapb.Tablet{} 349 if err := prototext.Unmarshal([]byte(m.GetString("tablet_info")), tablet); err != nil { 350 log.Errorf("could not read tablet %v: %v", m.GetString("tablet_info"), err) 351 return nil 352 } 353 354 primaryTablet := &topodatapb.Tablet{} 355 if str := m.GetString("primary_tablet_info"); str != "" { 356 if err := prototext.Unmarshal([]byte(str), primaryTablet); err != nil { 357 log.Errorf("could not read tablet %v: %v", str, err) 358 return nil 359 } 360 } 361 362 a.TabletType = tablet.Type 363 a.AnalyzedKeyspace = m.GetString("keyspace") 364 a.AnalyzedShard = m.GetString("shard") 365 a.PrimaryTimeStamp = m.GetTime("primary_timestamp") 366 367 if keyspaceType := topodatapb.KeyspaceType(m.GetInt("keyspace_type")); keyspaceType == topodatapb.KeyspaceType_SNAPSHOT { 368 log.Errorf("keyspace %v is a snapshot keyspace. Skipping.", a.AnalyzedKeyspace) 369 return nil 370 } 371 372 a.IsPrimary = m.GetBool("is_primary") 373 countCoPrimaryReplicas := m.GetUint("count_co_primary_replicas") 374 a.IsCoPrimary = m.GetBool("is_co_primary") || (countCoPrimaryReplicas > 0) 375 a.AnalyzedInstanceKey = InstanceKey{Hostname: m.GetString("hostname"), Port: m.GetInt("port")} 376 a.AnalyzedInstancePrimaryKey = InstanceKey{Hostname: m.GetString("source_host"), Port: m.GetInt("source_port")} 377 a.AnalyzedInstanceDataCenter = m.GetString("data_center") 378 a.AnalyzedInstanceRegion = m.GetString("region") 379 a.AnalyzedInstancePhysicalEnvironment = m.GetString("physical_environment") 380 a.AnalyzedInstanceBinlogCoordinates = BinlogCoordinates{ 381 LogFile: m.GetString("binary_log_file"), 382 LogPos: m.GetInt64("binary_log_pos"), 383 Type: BinaryLog, 384 } 385 isStaleBinlogCoordinates := m.GetBool("is_stale_binlog_coordinates") 386 a.ClusterDetails.Keyspace = m.GetString("keyspace") 387 a.ClusterDetails.Shard = m.GetString("shard") 388 a.GTIDMode = m.GetString("gtid_mode") 389 a.LastCheckValid = m.GetBool("is_last_check_valid") 390 a.LastCheckPartialSuccess = m.GetBool("last_check_partial_success") 391 a.CountReplicas = m.GetUint("count_replicas") 392 a.CountValidReplicas = m.GetUint("count_valid_replicas") 393 a.CountValidReplicatingReplicas = m.GetUint("count_valid_replicating_replicas") 394 a.CountReplicasFailingToConnectToPrimary = m.GetUint("count_replicas_failing_to_connect_to_primary") 395 a.CountDowntimedReplicas = m.GetUint("count_downtimed_replicas") 396 a.ReplicationDepth = m.GetUint("replication_depth") 397 a.IsFailingToConnectToPrimary = m.GetBool("is_failing_to_connect_to_primary") 398 a.ReplicationStopped = m.GetBool("replication_stopped") 399 a.IsDowntimed = m.GetBool("is_downtimed") 400 a.DowntimeEndTimestamp = m.GetString("downtime_end_timestamp") 401 a.DowntimeRemainingSeconds = m.GetInt("downtime_remaining_seconds") 402 a.IsBinlogServer = m.GetBool("is_binlog_server") 403 a.ClusterDetails.ReadRecoveryInfo() 404 405 countValidOracleGTIDReplicas := m.GetUint("count_valid_oracle_gtid_replicas") 406 a.OracleGTIDImmediateTopology = countValidOracleGTIDReplicas == a.CountValidReplicas && a.CountValidReplicas > 0 407 countValidMariaDBGTIDReplicas := m.GetUint("count_valid_mariadb_gtid_replicas") 408 a.MariaDBGTIDImmediateTopology = countValidMariaDBGTIDReplicas == a.CountValidReplicas && a.CountValidReplicas > 0 409 countValidBinlogServerReplicas := m.GetUint("count_valid_binlog_server_replicas") 410 a.BinlogServerImmediateTopology = countValidBinlogServerReplicas == a.CountValidReplicas && a.CountValidReplicas > 0 411 a.SemiSyncPrimaryEnabled = m.GetBool("semi_sync_primary_enabled") 412 a.SemiSyncPrimaryStatus = m.GetBool("semi_sync_primary_status") 413 a.SemiSyncReplicaEnabled = m.GetBool("semi_sync_replica_enabled") 414 a.CountSemiSyncReplicasEnabled = m.GetUint("count_semi_sync_replicas") 415 // countValidSemiSyncReplicasEnabled := m.GetUint("count_valid_semi_sync_replicas") 416 a.SemiSyncPrimaryWaitForReplicaCount = m.GetUint("semi_sync_primary_wait_for_replica_count") 417 a.SemiSyncPrimaryClients = m.GetUint("semi_sync_primary_clients") 418 419 a.MinReplicaGTIDMode = m.GetString("min_replica_gtid_mode") 420 a.MaxReplicaGTIDMode = m.GetString("max_replica_gtid_mode") 421 a.MaxReplicaGTIDErrant = m.GetString("max_replica_gtid_errant") 422 423 a.CountLoggingReplicas = m.GetUint("count_logging_replicas") 424 a.CountStatementBasedLoggingReplicas = m.GetUint("count_statement_based_logging_replicas") 425 a.CountMixedBasedLoggingReplicas = m.GetUint("count_mixed_based_logging_replicas") 426 a.CountRowBasedLoggingReplicas = m.GetUint("count_row_based_logging_replicas") 427 a.CountDistinctMajorVersionsLoggingReplicas = m.GetUint("count_distinct_logging_major_versions") 428 429 a.CountDelayedReplicas = m.GetUint("count_delayed_replicas") 430 a.CountLaggingReplicas = m.GetUint("count_lagging_replicas") 431 432 a.IsReadOnly = m.GetUint("read_only") == 1 433 434 if !a.LastCheckValid { 435 analysisMessage := fmt.Sprintf("analysis: Keyspace: %+v, Shard: %+v, IsPrimary: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, CountReplicasFailingToConnectToPrimary: %+v", 436 a.ClusterDetails.Keyspace, a.ClusterDetails.Shard, a.IsPrimary, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, a.CountReplicasFailingToConnectToPrimary, 437 ) 438 if util.ClearToLog("analysis_dao", analysisMessage) { 439 log.Infof(analysisMessage) 440 } 441 } 442 keyspaceShard := getKeyspaceShardName(a.ClusterDetails.Keyspace, a.ClusterDetails.Shard) 443 if clusters[keyspaceShard] == nil { 444 clusters[keyspaceShard] = &clusterAnalysis{} 445 if a.TabletType == topodatapb.TabletType_PRIMARY { 446 a.IsClusterPrimary = true 447 clusters[keyspaceShard].primaryKey = &a.AnalyzedInstanceKey 448 } 449 durabilityPolicy := m.GetString("durability_policy") 450 if durabilityPolicy == "" { 451 log.Errorf("ignoring keyspace %v because no durability_policy is set. Please set it using SetKeyspaceDurabilityPolicy", a.AnalyzedKeyspace) 452 return nil 453 } 454 durability, err := reparentutil.GetDurabilityPolicy(durabilityPolicy) 455 if err != nil { 456 log.Errorf("can't get the durability policy %v - %v. Skipping keyspace - %v.", durabilityPolicy, err, a.AnalyzedKeyspace) 457 return nil 458 } 459 clusters[keyspaceShard].durability = durability 460 } 461 // ca has clusterwide info 462 ca := clusters[keyspaceShard] 463 if ca.hasClusterwideAction { 464 // We can only take one cluster level action at a time. 465 return nil 466 } 467 if ca.durability == nil { 468 // We failed to load the durability policy, so we shouldn't run any analysis 469 return nil 470 } 471 isInvalid := m.GetBool("is_invalid") 472 if isInvalid { 473 return nil 474 } 475 if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0 { 476 a.Analysis = DeadPrimaryWithoutReplicas 477 a.Description = "Primary cannot be reached by vtorc and has no replica" 478 ca.hasClusterwideAction = true 479 // 480 } else if a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { 481 a.Analysis = DeadPrimary 482 a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating" 483 ca.hasClusterwideAction = true 484 // 485 } else if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && a.CountValidReplicatingReplicas == 0 { 486 a.Analysis = DeadPrimaryAndReplicas 487 a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating" 488 ca.hasClusterwideAction = true 489 // 490 } else if a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { 491 a.Analysis = DeadPrimaryAndSomeReplicas 492 a.Description = "Primary cannot be reached by vtorc; some of its replicas are unreachable and none of its reachable replicas is replicating" 493 ca.hasClusterwideAction = true 494 // 495 } else if a.IsClusterPrimary && !a.IsPrimary { 496 a.Analysis = PrimaryHasPrimary 497 a.Description = "Primary is replicating from somewhere else" 498 ca.hasClusterwideAction = true 499 // 500 } else if a.IsClusterPrimary && a.IsReadOnly { 501 a.Analysis = PrimaryIsReadOnly 502 a.Description = "Primary is read-only" 503 // 504 } else if a.IsClusterPrimary && SemiSyncAckers(ca.durability, tablet) != 0 && !a.SemiSyncPrimaryEnabled { 505 a.Analysis = PrimarySemiSyncMustBeSet 506 a.Description = "Primary semi-sync must be set" 507 // 508 } else if a.IsClusterPrimary && SemiSyncAckers(ca.durability, tablet) == 0 && a.SemiSyncPrimaryEnabled { 509 a.Analysis = PrimarySemiSyncMustNotBeSet 510 a.Description = "Primary semi-sync must not be set" 511 // 512 } else if topo.IsReplicaType(a.TabletType) && ca.primaryKey == nil { 513 a.Analysis = ClusterHasNoPrimary 514 a.Description = "Cluster has no primary" 515 ca.hasClusterwideAction = true 516 } else if topo.IsReplicaType(a.TabletType) && !a.IsReadOnly { 517 a.Analysis = ReplicaIsWritable 518 a.Description = "Replica is writable" 519 // 520 } else if topo.IsReplicaType(a.TabletType) && a.IsPrimary { 521 a.Analysis = NotConnectedToPrimary 522 a.Description = "Not connected to the primary" 523 // 524 } else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && ca.primaryKey != nil && a.AnalyzedInstancePrimaryKey != *ca.primaryKey { 525 a.Analysis = ConnectedToWrongPrimary 526 a.Description = "Connected to wrong primary" 527 // 528 } else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && a.ReplicationStopped { 529 a.Analysis = ReplicationStopped 530 a.Description = "Replication is stopped" 531 // 532 } else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && !a.SemiSyncReplicaEnabled { 533 a.Analysis = ReplicaSemiSyncMustBeSet 534 a.Description = "Replica semi-sync must be set" 535 // 536 } else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && !IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && a.SemiSyncReplicaEnabled { 537 a.Analysis = ReplicaSemiSyncMustNotBeSet 538 a.Description = "Replica semi-sync must not be set" 539 // 540 // TODO(sougou): Events below here are either ignored or not possible. 541 } else if a.IsPrimary && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0 { 542 a.Analysis = UnreachablePrimaryWithLaggingReplicas 543 a.Description = "Primary cannot be reached by vtorc and all of its replicas are lagging" 544 // 545 } else if a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { 546 // partial success is here to redice noise 547 a.Analysis = UnreachablePrimary 548 a.Description = "Primary cannot be reached by vtorc but it has replicating replicas; possibly a network/host issue" 549 // 550 } else if a.IsPrimary && !a.LastCheckValid && a.LastCheckPartialSuccess && a.CountReplicasFailingToConnectToPrimary > 0 && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { 551 // there's partial success, but also at least one replica is failing to connect to primary 552 a.Analysis = UnreachablePrimary 553 a.Description = "Primary cannot be reached by vtorc but it has replicating replicas; possibly a network/host issue" 554 // 555 } else if a.IsPrimary && a.SemiSyncPrimaryEnabled && a.SemiSyncPrimaryStatus && a.SemiSyncPrimaryWaitForReplicaCount > 0 && a.SemiSyncPrimaryClients < a.SemiSyncPrimaryWaitForReplicaCount { 556 if isStaleBinlogCoordinates { 557 a.Analysis = LockedSemiSyncPrimary 558 a.Description = "Semi sync primary is locked since it doesn't get enough replica acknowledgements" 559 } else { 560 a.Analysis = LockedSemiSyncPrimaryHypothesis 561 a.Description = "Semi sync primary seems to be locked, more samplings needed to validate" 562 } 563 // 564 } else if a.IsPrimary && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { 565 a.Analysis = PrimarySingleReplicaNotReplicating 566 a.Description = "Primary is reachable but its single replica is not replicating" 567 } else if a.IsPrimary && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == 0 { 568 a.Analysis = PrimarySingleReplicaDead 569 a.Description = "Primary is reachable but its single replica is dead" 570 // 571 } else if a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { 572 a.Analysis = AllPrimaryReplicasNotReplicating 573 a.Description = "Primary is reachable but none of its replicas is replicating" 574 // 575 } else if a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { 576 a.Analysis = AllPrimaryReplicasNotReplicatingOrDead 577 a.Description = "Primary is reachable but none of its replicas is replicating" 578 // 579 } else if a.IsBinlogServer && a.IsFailingToConnectToPrimary { 580 a.Analysis = BinlogServerFailingToConnectToPrimary 581 a.Description = "Binlog server is unable to connect to its primary" 582 // 583 } 584 // else if a.IsPrimary && a.CountReplicas == 0 { 585 // a.Analysis = PrimaryWithoutReplicas 586 // a.Description = "Primary has no replicas" 587 // } 588 589 appendAnalysis := func(analysis *ReplicationAnalysis) { 590 if a.Analysis == NoProblem && len(a.StructureAnalysis) == 0 && !hints.IncludeNoProblem { 591 return 592 } 593 if a.IsDowntimed { 594 a.SkippableDueToDowntime = true 595 } 596 if a.CountReplicas == a.CountDowntimedReplicas { 597 switch a.Analysis { 598 case AllPrimaryReplicasNotReplicating, 599 AllPrimaryReplicasNotReplicatingOrDead, 600 PrimarySingleReplicaDead: 601 a.IsReplicasDowntimed = true 602 a.SkippableDueToDowntime = true 603 } 604 } 605 if a.SkippableDueToDowntime && !hints.IncludeDowntimed { 606 return 607 } 608 result = append(result, a) 609 } 610 611 { 612 // Moving on to structure analysis 613 // We also do structural checks. See if there's potential danger in promotions 614 if a.IsPrimary && a.CountLoggingReplicas == 0 && a.CountReplicas > 1 { 615 a.StructureAnalysis = append(a.StructureAnalysis, NoLoggingReplicasStructureWarning) 616 } 617 if a.IsPrimary && a.CountReplicas > 1 && 618 !a.OracleGTIDImmediateTopology && 619 !a.MariaDBGTIDImmediateTopology && 620 !a.BinlogServerImmediateTopology { 621 a.StructureAnalysis = append(a.StructureAnalysis, NoFailoverSupportStructureWarning) 622 } 623 if a.IsPrimary && a.CountStatementBasedLoggingReplicas > 0 && a.CountMixedBasedLoggingReplicas > 0 { 624 a.StructureAnalysis = append(a.StructureAnalysis, StatementAndMixedLoggingReplicasStructureWarning) 625 } 626 if a.IsPrimary && a.CountStatementBasedLoggingReplicas > 0 && a.CountRowBasedLoggingReplicas > 0 { 627 a.StructureAnalysis = append(a.StructureAnalysis, StatementAndRowLoggingReplicasStructureWarning) 628 } 629 if a.IsPrimary && a.CountMixedBasedLoggingReplicas > 0 && a.CountRowBasedLoggingReplicas > 0 { 630 a.StructureAnalysis = append(a.StructureAnalysis, MixedAndRowLoggingReplicasStructureWarning) 631 } 632 if a.IsPrimary && a.CountDistinctMajorVersionsLoggingReplicas > 1 { 633 a.StructureAnalysis = append(a.StructureAnalysis, MultipleMajorVersionsLoggingReplicasStructureWarning) 634 } 635 636 if a.CountReplicas > 0 && (a.GTIDMode != a.MinReplicaGTIDMode || a.GTIDMode != a.MaxReplicaGTIDMode) { 637 a.StructureAnalysis = append(a.StructureAnalysis, DifferentGTIDModesStructureWarning) 638 } 639 if a.MaxReplicaGTIDErrant != "" { 640 a.StructureAnalysis = append(a.StructureAnalysis, ErrantGTIDStructureWarning) 641 } 642 643 if a.IsPrimary && a.IsReadOnly { 644 a.StructureAnalysis = append(a.StructureAnalysis, NoWriteablePrimaryStructureWarning) 645 } 646 647 if a.IsPrimary && a.SemiSyncPrimaryEnabled && !a.SemiSyncPrimaryStatus && a.SemiSyncPrimaryWaitForReplicaCount > 0 && a.SemiSyncPrimaryClients < a.SemiSyncPrimaryWaitForReplicaCount { 648 a.StructureAnalysis = append(a.StructureAnalysis, NotEnoughValidSemiSyncReplicasStructureWarning) 649 } 650 } 651 appendAnalysis(&a) 652 653 if a.CountReplicas > 0 && hints.AuditAnalysis { 654 // Interesting enough for analysis 655 go func() { 656 _ = auditInstanceAnalysisInChangelog(&a.AnalyzedInstanceKey, a.Analysis) 657 }() 658 } 659 return nil 660 }) 661 662 if err != nil { 663 log.Error(err) 664 } 665 // TODO: result, err = getConcensusReplicationAnalysis(result) 666 return result, err 667 } 668 669 // auditInstanceAnalysisInChangelog will write down an instance's analysis in the database_instance_analysis_changelog table. 670 // To not repeat recurring analysis code, the database_instance_last_analysis table is used, so that only changes to 671 // analysis codes are written. 672 func auditInstanceAnalysisInChangelog(instanceKey *InstanceKey, analysisCode AnalysisCode) error { 673 if lastWrittenAnalysis, found := recentInstantAnalysis.Get(instanceKey.DisplayString()); found { 674 if lastWrittenAnalysis == analysisCode { 675 // Surely nothing new. 676 // And let's expand the timeout 677 recentInstantAnalysis.Set(instanceKey.DisplayString(), analysisCode, cache.DefaultExpiration) 678 return nil 679 } 680 } 681 // Passed the cache; but does database agree that there's a change? Here's a persistent cache; this comes here 682 // to verify no two vtorc services are doing this without coordinating (namely, one dies, the other taking its place 683 // and has no familiarity of the former's cache) 684 analysisChangeWriteAttemptCounter.Inc(1) 685 686 lastAnalysisChanged := false 687 { 688 sqlResult, err := db.ExecVTOrc(` 689 update database_instance_last_analysis set 690 analysis = ?, 691 analysis_timestamp = now() 692 where 693 hostname = ? 694 and port = ? 695 and analysis != ? 696 `, 697 string(analysisCode), instanceKey.Hostname, instanceKey.Port, string(analysisCode), 698 ) 699 if err != nil { 700 log.Error(err) 701 return err 702 } 703 rows, err := sqlResult.RowsAffected() 704 if err != nil { 705 log.Error(err) 706 return err 707 } 708 lastAnalysisChanged = (rows > 0) 709 } 710 if !lastAnalysisChanged { 711 _, err := db.ExecVTOrc(` 712 insert ignore into database_instance_last_analysis ( 713 hostname, port, analysis_timestamp, analysis 714 ) values ( 715 ?, ?, now(), ? 716 ) 717 `, 718 instanceKey.Hostname, instanceKey.Port, string(analysisCode), 719 ) 720 if err != nil { 721 log.Error(err) 722 return err 723 } 724 } 725 recentInstantAnalysis.Set(instanceKey.DisplayString(), analysisCode, cache.DefaultExpiration) 726 if !lastAnalysisChanged { 727 return nil 728 } 729 730 _, err := db.ExecVTOrc(` 731 insert into database_instance_analysis_changelog ( 732 hostname, port, analysis_timestamp, analysis 733 ) values ( 734 ?, ?, now(), ? 735 ) 736 `, 737 instanceKey.Hostname, instanceKey.Port, string(analysisCode), 738 ) 739 if err == nil { 740 analysisChangeWriteCounter.Inc(1) 741 } else { 742 log.Error(err) 743 } 744 return err 745 } 746 747 // ExpireInstanceAnalysisChangelog removes old-enough analysis entries from the changelog 748 func ExpireInstanceAnalysisChangelog() error { 749 _, err := db.ExecVTOrc(` 750 delete 751 from database_instance_analysis_changelog 752 where 753 analysis_timestamp < now() - interval ? hour 754 `, 755 config.UnseenInstanceForgetHours, 756 ) 757 if err != nil { 758 log.Error(err) 759 } 760 return err 761 }