vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletserver/health_streamer.go (about) 1 /* 2 Copyright 2020 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package tabletserver 18 19 import ( 20 "context" 21 "fmt" 22 "io" 23 "strings" 24 "sync" 25 "time" 26 27 "github.com/spf13/pflag" 28 29 "vitess.io/vitess/go/vt/servenv" 30 31 "vitess.io/vitess/go/sqltypes" 32 33 "vitess.io/vitess/go/vt/sqlparser" 34 35 "vitess.io/vitess/go/vt/dbconfigs" 36 37 "vitess.io/vitess/go/mysql" 38 "vitess.io/vitess/go/timer" 39 "vitess.io/vitess/go/vt/vttablet/tabletserver/connpool" 40 41 "google.golang.org/protobuf/proto" 42 43 "vitess.io/vitess/go/history" 44 "vitess.io/vitess/go/sync2" 45 "vitess.io/vitess/go/vt/log" 46 querypb "vitess.io/vitess/go/vt/proto/query" 47 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 48 vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" 49 "vitess.io/vitess/go/vt/vterrors" 50 "vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication" 51 "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" 52 ) 53 54 var ( 55 // blpFunc is a legaacy feature. 56 // TODO(sougou): remove after legacy resharding worflows are removed. 57 blpFunc = vreplication.StatusSummary 58 59 errUnintialized = "tabletserver uninitialized" 60 61 streamHealthBufferSize = uint(20) 62 ) 63 64 func init() { 65 servenv.OnParseFor("vtcombo", registerHealthStreamerFlags) 66 servenv.OnParseFor("vttablet", registerHealthStreamerFlags) 67 } 68 69 func registerHealthStreamerFlags(fs *pflag.FlagSet) { 70 fs.UintVar(&streamHealthBufferSize, "stream_health_buffer_size", streamHealthBufferSize, "max streaming health entries to buffer per streaming health client") 71 } 72 73 // healthStreamer streams health information to callers. 74 type healthStreamer struct { 75 stats *tabletenv.Stats 76 degradedThreshold time.Duration 77 unhealthyThreshold sync2.AtomicDuration 78 79 mu sync.Mutex 80 ctx context.Context 81 cancel context.CancelFunc 82 clients map[chan *querypb.StreamHealthResponse]struct{} 83 state *querypb.StreamHealthResponse 84 85 history *history.History 86 87 ticks *timer.Timer 88 dbConfig dbconfigs.Connector 89 conns *connpool.Pool 90 signalWhenSchemaChange bool 91 92 viewsEnabled bool 93 views map[string]string 94 } 95 96 func newHealthStreamer(env tabletenv.Env, alias *topodatapb.TabletAlias) *healthStreamer { 97 var newTimer *timer.Timer 98 var pool *connpool.Pool 99 if env.Config().SignalWhenSchemaChange { 100 reloadTime := env.Config().SignalSchemaChangeReloadIntervalSeconds.Get() 101 newTimer = timer.NewTimer(reloadTime) 102 // We need one connection for the reloader. 103 pool = connpool.NewPool(env, "", tabletenv.ConnPoolConfig{ 104 Size: 1, 105 IdleTimeoutSeconds: env.Config().OltpReadPool.IdleTimeoutSeconds, 106 }) 107 } 108 return &healthStreamer{ 109 stats: env.Stats(), 110 degradedThreshold: env.Config().Healthcheck.DegradedThresholdSeconds.Get(), 111 unhealthyThreshold: sync2.NewAtomicDuration(env.Config().Healthcheck.UnhealthyThresholdSeconds.Get()), 112 clients: make(map[chan *querypb.StreamHealthResponse]struct{}), 113 114 state: &querypb.StreamHealthResponse{ 115 Target: &querypb.Target{}, 116 TabletAlias: alias, 117 RealtimeStats: &querypb.RealtimeStats{ 118 HealthError: errUnintialized, 119 }, 120 }, 121 122 history: history.New(5), 123 ticks: newTimer, 124 conns: pool, 125 signalWhenSchemaChange: env.Config().SignalWhenSchemaChange, 126 viewsEnabled: env.Config().EnableViews, 127 views: map[string]string{}, 128 } 129 } 130 131 func (hs *healthStreamer) InitDBConfig(target *querypb.Target, cp dbconfigs.Connector) { 132 hs.state.Target = proto.Clone(target).(*querypb.Target) 133 hs.dbConfig = cp 134 } 135 136 func (hs *healthStreamer) Open() { 137 hs.mu.Lock() 138 defer hs.mu.Unlock() 139 140 if hs.cancel != nil { 141 return 142 } 143 hs.ctx, hs.cancel = context.WithCancel(context.Background()) 144 if hs.conns != nil { 145 // if we don't have a live conns object, it means we are not configured to signal when the schema changes 146 hs.conns.Open(hs.dbConfig, hs.dbConfig, hs.dbConfig) 147 hs.ticks.Start(func() { 148 if err := hs.reload(); err != nil { 149 log.Errorf("periodic schema reload failed in health stream: %v", err) 150 } 151 }) 152 153 } 154 155 } 156 157 func (hs *healthStreamer) Close() { 158 hs.mu.Lock() 159 defer hs.mu.Unlock() 160 161 if hs.cancel != nil { 162 if hs.ticks != nil { 163 hs.ticks.Stop() 164 hs.conns.Close() 165 } 166 hs.cancel() 167 hs.cancel = nil 168 } 169 } 170 171 func (hs *healthStreamer) Stream(ctx context.Context, callback func(*querypb.StreamHealthResponse) error) error { 172 ch, hsCtx := hs.register() 173 if hsCtx == nil { 174 return vterrors.Errorf(vtrpcpb.Code_UNAVAILABLE, "tabletserver is shutdown") 175 } 176 defer hs.unregister(ch) 177 178 // trigger the initial schema reload 179 if hs.signalWhenSchemaChange { 180 hs.ticks.Trigger() 181 } 182 183 for { 184 select { 185 case <-ctx.Done(): 186 return nil 187 case <-hsCtx.Done(): 188 return vterrors.Errorf(vtrpcpb.Code_UNAVAILABLE, "tabletserver is shutdown") 189 case shr, ok := <-ch: 190 if !ok { 191 return vterrors.Errorf(vtrpcpb.Code_RESOURCE_EXHAUSTED, "stream health buffer overflowed. client should reconnect for up-to-date status") 192 } 193 if err := callback(shr); err != nil { 194 if err == io.EOF { 195 return nil 196 } 197 return err 198 } 199 } 200 } 201 } 202 203 func (hs *healthStreamer) register() (chan *querypb.StreamHealthResponse, context.Context) { 204 hs.mu.Lock() 205 defer hs.mu.Unlock() 206 207 if hs.cancel == nil { 208 return nil, nil 209 } 210 211 ch := make(chan *querypb.StreamHealthResponse, streamHealthBufferSize) 212 hs.clients[ch] = struct{}{} 213 214 // Send the current state immediately. 215 ch <- proto.Clone(hs.state).(*querypb.StreamHealthResponse) 216 return ch, hs.ctx 217 } 218 219 func (hs *healthStreamer) unregister(ch chan *querypb.StreamHealthResponse) { 220 hs.mu.Lock() 221 defer hs.mu.Unlock() 222 223 delete(hs.clients, ch) 224 } 225 226 func (hs *healthStreamer) ChangeState(tabletType topodatapb.TabletType, terTimestamp time.Time, lag time.Duration, err error, serving bool) { 227 hs.mu.Lock() 228 defer hs.mu.Unlock() 229 230 hs.state.Target.TabletType = tabletType 231 if tabletType == topodatapb.TabletType_PRIMARY { 232 hs.state.TabletExternallyReparentedTimestamp = terTimestamp.Unix() 233 } else { 234 hs.state.TabletExternallyReparentedTimestamp = 0 235 } 236 if err != nil { 237 hs.state.RealtimeStats.HealthError = err.Error() 238 } else { 239 hs.state.RealtimeStats.HealthError = "" 240 } 241 hs.state.RealtimeStats.ReplicationLagSeconds = uint32(lag.Seconds()) 242 hs.state.Serving = serving 243 244 hs.state.RealtimeStats.FilteredReplicationLagSeconds, hs.state.RealtimeStats.BinlogPlayersCount = blpFunc() 245 hs.state.RealtimeStats.Qps = hs.stats.QPSRates.TotalRate() 246 247 shr := proto.Clone(hs.state).(*querypb.StreamHealthResponse) 248 249 hs.broadCastToClients(shr) 250 hs.history.Add(&historyRecord{ 251 Time: time.Now(), 252 serving: shr.Serving, 253 tabletType: shr.Target.TabletType, 254 lag: lag, 255 err: err, 256 }) 257 } 258 259 func (hs *healthStreamer) broadCastToClients(shr *querypb.StreamHealthResponse) { 260 for ch := range hs.clients { 261 select { 262 case ch <- shr: 263 default: 264 // We can't block this state change on broadcasting to a streaming health client, but we 265 // also don't want to silently fail to inform a streaming health client of a state change 266 // because it can allow a vtgate to get wedged in a state where it's wrong about whether 267 // a tablet is healthy and can't automatically recover (see 268 // https://github.com/vitessio/vitess/issues/5445). If we can't send a health update 269 // to this client we'll close() the channel which will ultimate fail the streaming health 270 // RPC and cause vtgates to reconnect. 271 // 272 // An alternative approach for streaming health would be to force a periodic broadcast even 273 // when there hasn't been an update and/or move away from using channels toward a model where 274 // old updates can be purged from the buffer in favor of more recent updates (since only the 275 // most recent health state really matters to gates). 276 log.Warning("A streaming health buffer is full. Closing the channel") 277 close(ch) 278 delete(hs.clients, ch) 279 } 280 } 281 } 282 283 func (hs *healthStreamer) AppendDetails(details []*kv) []*kv { 284 hs.mu.Lock() 285 defer hs.mu.Unlock() 286 if hs.state.Target.TabletType == topodatapb.TabletType_PRIMARY { 287 return details 288 } 289 sbm := time.Duration(hs.state.RealtimeStats.ReplicationLagSeconds) * time.Second 290 class := healthyClass 291 switch { 292 case sbm > hs.unhealthyThreshold.Get(): 293 class = unhealthyClass 294 case sbm > hs.degradedThreshold: 295 class = unhappyClass 296 } 297 details = append(details, &kv{ 298 Key: "Replication Lag", 299 Class: class, 300 Value: fmt.Sprintf("%ds", hs.state.RealtimeStats.ReplicationLagSeconds), 301 }) 302 if hs.state.RealtimeStats.HealthError != "" { 303 details = append(details, &kv{ 304 Key: "Replication Error", 305 Class: unhappyClass, 306 Value: hs.state.RealtimeStats.HealthError, 307 }) 308 } 309 310 return details 311 } 312 313 func (hs *healthStreamer) SetUnhealthyThreshold(v time.Duration) { 314 hs.unhealthyThreshold.Set(v) 315 shr := proto.Clone(hs.state).(*querypb.StreamHealthResponse) 316 for ch := range hs.clients { 317 select { 318 case ch <- shr: 319 default: 320 log.Info("Resetting health streamer clients due to unhealthy threshold change") 321 close(ch) 322 delete(hs.clients, ch) 323 } 324 } 325 } 326 327 // reload reloads the schema from the underlying mysql 328 func (hs *healthStreamer) reload() error { 329 hs.mu.Lock() 330 defer hs.mu.Unlock() 331 // Schema Reload to happen only on primary. 332 if hs.state.Target.TabletType != topodatapb.TabletType_PRIMARY { 333 return nil 334 } 335 336 ctx := hs.ctx 337 conn, err := hs.conns.Get(ctx, nil) 338 if err != nil { 339 return err 340 } 341 defer conn.Recycle() 342 343 tables, err := hs.getChangedTableNames(ctx, conn) 344 if err != nil { 345 return err 346 } 347 348 views, err := hs.getChangedViewNames(ctx, conn) 349 if err != nil { 350 return err 351 } 352 353 // no change detected 354 if len(tables) == 0 && len(views) == 0 { 355 return nil 356 } 357 358 hs.state.RealtimeStats.TableSchemaChanged = tables 359 hs.state.RealtimeStats.ViewSchemaChanged = views 360 shr := proto.Clone(hs.state).(*querypb.StreamHealthResponse) 361 hs.broadCastToClients(shr) 362 hs.state.RealtimeStats.TableSchemaChanged = nil 363 hs.state.RealtimeStats.ViewSchemaChanged = nil 364 365 return nil 366 } 367 368 func (hs *healthStreamer) getChangedTableNames(ctx context.Context, conn *connpool.DBConn) ([]string, error) { 369 var tables []string 370 var tableNames []string 371 372 callback := func(qr *sqltypes.Result) error { 373 for _, row := range qr.Rows { 374 table := row[0].ToString() 375 tables = append(tables, table) 376 377 escapedTblName := sqlparser.String(sqlparser.NewStrLiteral(table)) 378 tableNames = append(tableNames, escapedTblName) 379 } 380 381 return nil 382 } 383 alloc := func() *sqltypes.Result { return &sqltypes.Result{} } 384 bufferSize := 1000 385 386 schemaChangeQuery := mysql.DetectSchemaChange 387 // If views are enabled, then views are tracked/handled separately and schema change does not need to track them. 388 if hs.viewsEnabled { 389 schemaChangeQuery = mysql.DetectSchemaChangeOnlyBaseTable 390 } 391 err := conn.Stream(ctx, schemaChangeQuery, callback, alloc, bufferSize, 0) 392 if err != nil { 393 return nil, err 394 } 395 396 // If no change detected, then return 397 if len(tables) == 0 { 398 return nil, nil 399 } 400 401 tableNamePredicate := fmt.Sprintf("table_name IN (%s)", strings.Join(tableNames, ", ")) 402 del := fmt.Sprintf("%s AND %s", mysql.ClearSchemaCopy, tableNamePredicate) 403 upd := fmt.Sprintf("%s AND %s", mysql.InsertIntoSchemaCopy, tableNamePredicate) 404 405 // Reload the schema in a transaction. 406 _, err = conn.Exec(ctx, "begin", 1, false) 407 if err != nil { 408 return nil, err 409 } 410 defer conn.Exec(ctx, "rollback", 1, false) 411 412 _, err = conn.Exec(ctx, del, 1, false) 413 if err != nil { 414 return nil, err 415 } 416 417 _, err = conn.Exec(ctx, upd, 1, false) 418 if err != nil { 419 return nil, err 420 } 421 422 _, err = conn.Exec(ctx, "commit", 1, false) 423 if err != nil { 424 return nil, err 425 } 426 return tables, nil 427 } 428 429 func (hs *healthStreamer) getChangedViewNames(ctx context.Context, conn *connpool.DBConn) ([]string, error) { 430 if !hs.viewsEnabled { 431 return nil, nil 432 } 433 var changedViews []string 434 views := map[string]string{} 435 436 callback := func(qr *sqltypes.Result) error { 437 for _, row := range qr.Rows { 438 viewName := row[0].ToString() 439 lastUpdTime := row[1].ToString() 440 views[viewName] = lastUpdTime 441 } 442 443 return nil 444 } 445 alloc := func() *sqltypes.Result { return &sqltypes.Result{} } 446 bufferSize := 1000 447 err := conn.Stream(ctx, mysql.SelectAllViews, callback, alloc, bufferSize, 0) 448 if err != nil { 449 return nil, err 450 } 451 452 // If no change detected, then return 453 if len(views) == 0 && len(hs.views) == 0 { 454 return nil, nil 455 } 456 457 for viewName, lastUpdTime := range views { 458 t, exists := hs.views[viewName] 459 if !exists { // new view added 460 changedViews = append(changedViews, viewName) 461 continue 462 } 463 if t != lastUpdTime { // view updated 464 changedViews = append(changedViews, viewName) 465 } 466 delete(hs.views, viewName) 467 } 468 469 // views deleted 470 for viewName := range hs.views { 471 changedViews = append(changedViews, viewName) 472 } 473 474 // update hs.views with latest view info 475 hs.views = views 476 477 return changedViews, nil 478 }