vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletserver/vstreamer/engine.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package vstreamer 18 19 import ( 20 "bytes" 21 "context" 22 "encoding/json" 23 "errors" 24 "fmt" 25 "net/http" 26 "strings" 27 "sync" 28 "sync/atomic" 29 "time" 30 31 "vitess.io/vitess/go/vt/dbconfigs" 32 "vitess.io/vitess/go/vt/mysqlctl" 33 "vitess.io/vitess/go/vt/servenv" 34 "vitess.io/vitess/go/vt/vterrors" 35 36 "vitess.io/vitess/go/acl" 37 "vitess.io/vitess/go/sqltypes" 38 "vitess.io/vitess/go/stats" 39 "vitess.io/vitess/go/vt/log" 40 "vitess.io/vitess/go/vt/srvtopo" 41 "vitess.io/vitess/go/vt/topo" 42 "vitess.io/vitess/go/vt/vtgate/vindexes" 43 "vitess.io/vitess/go/vt/vttablet/tabletserver/schema" 44 "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" 45 "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle" 46 47 binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" 48 vschemapb "vitess.io/vitess/go/vt/proto/vschema" 49 vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" 50 ) 51 52 const ( 53 throttlerAppName = "vstreamer" 54 ) 55 56 // Engine is the engine for handling vreplication streaming requests. 57 type Engine struct { 58 env tabletenv.Env 59 ts srvtopo.Server 60 se *schema.Engine 61 cell string 62 63 // keyspace is initialized by InitDBConfig 64 keyspace string 65 shard string 66 67 // wg is incremented for every Stream, and decremented on end. 68 // Close waits for all current streams to end by waiting on wg. 69 wg sync.WaitGroup 70 71 mu sync.Mutex 72 isOpen int32 // 0 or 1 in place of atomic.Bool added in go 1.19 73 streamIdx int 74 streamers map[int]*uvstreamer 75 rowStreamers map[int]*rowStreamer 76 resultStreamers map[int]*resultStreamer 77 78 // watcherOnce is used for initializing vschema 79 // and setting up the vschema watch. It's guaranteed that 80 // no stream will start until vschema is initialized by 81 // the first call through watcherOnce. 82 watcherOnce sync.Once 83 lvschema *localVSchema 84 85 // stats variables 86 vschemaErrors *stats.Counter 87 vschemaUpdates *stats.Counter 88 89 // vstreamer metrics 90 vstreamerPhaseTimings *servenv.TimingsWrapper 91 vstreamerCount *stats.Gauge 92 vstreamerEventsStreamed *stats.Counter 93 vstreamerPacketSize *stats.GaugeFunc 94 vstreamerNumPackets *stats.Counter 95 resultStreamerNumRows *stats.Counter 96 resultStreamerNumPackets *stats.Counter 97 rowStreamerNumRows *stats.Counter 98 rowStreamerNumPackets *stats.Counter 99 rowStreamerWaits *servenv.TimingsWrapper 100 errorCounts *stats.CountersWithSingleLabel 101 vstreamersCreated *stats.Counter 102 vstreamersEndedWithErrors *stats.Counter 103 vstreamerFlushedBinlogs *stats.Counter 104 105 throttlerClient *throttle.Client 106 } 107 108 // NewEngine creates a new Engine. 109 // Initialization sequence is: NewEngine->InitDBConfig->Open. 110 // Open and Close can be called multiple times and are idempotent. 111 func NewEngine(env tabletenv.Env, ts srvtopo.Server, se *schema.Engine, lagThrottler *throttle.Throttler, cell string) *Engine { 112 vse := &Engine{ 113 env: env, 114 ts: ts, 115 se: se, 116 cell: cell, 117 throttlerClient: throttle.NewBackgroundClient(lagThrottler, throttlerAppName, throttle.ThrottleCheckSelf), 118 119 streamers: make(map[int]*uvstreamer), 120 rowStreamers: make(map[int]*rowStreamer), 121 resultStreamers: make(map[int]*resultStreamer), 122 123 lvschema: &localVSchema{vschema: &vindexes.VSchema{}}, 124 125 vschemaErrors: env.Exporter().NewCounter("VSchemaErrors", "Count of VSchema errors"), 126 vschemaUpdates: env.Exporter().NewCounter("VSchemaUpdates", "Count of VSchema updates. Does not include errors"), 127 128 vstreamerPhaseTimings: env.Exporter().NewTimings("VStreamerPhaseTiming", "Time taken for different phases during vstream copy", "phase-timing"), 129 vstreamerCount: env.Exporter().NewGauge("VStreamerCount", "Current number of vstreamers"), 130 vstreamerEventsStreamed: env.Exporter().NewCounter("VStreamerEventsStreamed", "Count of events streamed in VStream API"), 131 vstreamerPacketSize: env.Exporter().NewGaugeFunc("VStreamPacketSize", "Max packet size for sending vstreamer events", getPacketSize), 132 vstreamerNumPackets: env.Exporter().NewCounter("VStreamerNumPackets", "Number of packets in vstreamer"), 133 resultStreamerNumPackets: env.Exporter().NewCounter("ResultStreamerNumPackets", "Number of packets in result streamer"), 134 resultStreamerNumRows: env.Exporter().NewCounter("ResultStreamerNumRows", "Number of rows sent in result streamer"), 135 rowStreamerNumPackets: env.Exporter().NewCounter("RowStreamerNumPackets", "Number of packets in row streamer"), 136 rowStreamerNumRows: env.Exporter().NewCounter("RowStreamerNumRows", "Number of rows sent in row streamer"), 137 rowStreamerWaits: env.Exporter().NewTimings("RowStreamerWaits", "Total counts and time we've waited when streaming rows in the vstream copy phase", "copy-phase-waits"), 138 vstreamersCreated: env.Exporter().NewCounter("VStreamersCreated", "Count of vstreamers created"), 139 vstreamersEndedWithErrors: env.Exporter().NewCounter("VStreamersEndedWithErrors", "Count of vstreamers that ended with errors"), 140 errorCounts: env.Exporter().NewCountersWithSingleLabel("VStreamerErrors", "Tracks errors in vstreamer", "type", "Catchup", "Copy", "Send", "TablePlan"), 141 vstreamerFlushedBinlogs: env.Exporter().NewCounter("VStreamerFlushedBinlogs", "Number of times we've successfully executed a FLUSH BINARY LOGS statement when starting a vstream"), 142 } 143 env.Exporter().NewGaugeFunc("RowStreamerMaxInnoDBTrxHistLen", "", func() int64 { return env.Config().RowStreamer.MaxInnoDBTrxHistLen }) 144 env.Exporter().NewGaugeFunc("RowStreamerMaxMySQLReplLagSecs", "", func() int64 { return env.Config().RowStreamer.MaxMySQLReplLagSecs }) 145 env.Exporter().HandleFunc("/debug/tablet_vschema", vse.ServeHTTP) 146 return vse 147 } 148 149 // InitDBConfig initializes the target parameters for the Engine. 150 func (vse *Engine) InitDBConfig(keyspace, shard string) { 151 vse.keyspace = keyspace 152 vse.shard = shard 153 } 154 155 // Open starts the Engine service. 156 func (vse *Engine) Open() { 157 log.Info("VStreamer: opening") 158 // If it's not already open, then open it now. 159 atomic.CompareAndSwapInt32(&vse.isOpen, 0, 1) 160 } 161 162 // IsOpen checks if the engine is opened 163 func (vse *Engine) IsOpen() bool { 164 return atomic.LoadInt32(&vse.isOpen) == 1 165 } 166 167 // Close closes the Engine service. 168 func (vse *Engine) Close() { 169 func() { 170 if atomic.LoadInt32(&vse.isOpen) == 0 { 171 return 172 } 173 vse.mu.Lock() 174 defer vse.mu.Unlock() 175 // cancels are non-blocking. 176 for _, s := range vse.streamers { 177 s.Cancel() 178 } 179 for _, s := range vse.rowStreamers { 180 s.Cancel() 181 } 182 for _, s := range vse.resultStreamers { 183 s.Cancel() 184 } 185 atomic.StoreInt32(&vse.isOpen, 0) 186 }() 187 188 // Wait only after releasing the lock because the end of every 189 // stream will use the lock to remove the entry from streamers. 190 vse.wg.Wait() 191 log.Info("VStreamer: closed") 192 } 193 194 func (vse *Engine) vschema() *vindexes.VSchema { 195 vse.mu.Lock() 196 defer vse.mu.Unlock() 197 return vse.lvschema.vschema 198 } 199 200 // Stream starts a new stream. 201 // This streams events from the binary logs 202 func (vse *Engine) Stream(ctx context.Context, startPos string, tablePKs []*binlogdatapb.TableLastPK, filter *binlogdatapb.Filter, send func([]*binlogdatapb.VEvent) error) error { 203 // Ensure vschema is initialized and the watcher is started. 204 // Starting of the watcher has to be delayed till the first call to Stream 205 // because this overhead should be incurred only if someone uses this feature. 206 vse.watcherOnce.Do(vse.setWatch) 207 208 // Create stream and add it to the map. 209 streamer, idx, err := func() (*uvstreamer, int, error) { 210 if atomic.LoadInt32(&vse.isOpen) == 0 { 211 return nil, 0, errors.New("VStreamer is not open") 212 } 213 vse.mu.Lock() 214 defer vse.mu.Unlock() 215 streamer := newUVStreamer(ctx, vse, vse.env.Config().DB.FilteredWithDB(), vse.se, startPos, tablePKs, filter, vse.lvschema, send) 216 idx := vse.streamIdx 217 vse.streamers[idx] = streamer 218 vse.streamIdx++ 219 // Now that we've added the stream, increment wg. 220 // This must be done before releasing the lock. 221 vse.wg.Add(1) 222 return streamer, idx, nil 223 }() 224 if err != nil { 225 return err 226 } 227 228 // Remove stream from map and decrement wg when it ends. 229 defer func() { 230 vse.mu.Lock() 231 defer vse.mu.Unlock() 232 delete(vse.streamers, idx) 233 vse.wg.Done() 234 }() 235 236 // No lock is held while streaming, but wg is incremented. 237 return streamer.Stream() 238 } 239 240 // StreamRows streams rows. 241 // This streams the table data rows (so we can copy the table data snapshot) 242 func (vse *Engine) StreamRows(ctx context.Context, query string, lastpk []sqltypes.Value, send func(*binlogdatapb.VStreamRowsResponse) error) error { 243 // Ensure vschema is initialized and the watcher is started. 244 // Starting of the watcher has to be delayed till the first call to Stream 245 // because this overhead should be incurred only if someone uses this feature. 246 vse.watcherOnce.Do(vse.setWatch) 247 log.Infof("Streaming rows for query %s, lastpk: %s", query, lastpk) 248 249 // Create stream and add it to the map. 250 rowStreamer, idx, err := func() (*rowStreamer, int, error) { 251 if atomic.LoadInt32(&vse.isOpen) == 0 { 252 return nil, 0, errors.New("VStreamer is not open") 253 } 254 vse.mu.Lock() 255 defer vse.mu.Unlock() 256 257 rowStreamer := newRowStreamer(ctx, vse.env.Config().DB.FilteredWithDB(), vse.se, query, lastpk, vse.lvschema, send, vse) 258 idx := vse.streamIdx 259 vse.rowStreamers[idx] = rowStreamer 260 vse.streamIdx++ 261 // Now that we've added the stream, increment wg. 262 // This must be done before releasing the lock. 263 vse.wg.Add(1) 264 return rowStreamer, idx, nil 265 }() 266 if err != nil { 267 return err 268 } 269 270 // Remove stream from map and decrement wg when it ends. 271 defer func() { 272 vse.mu.Lock() 273 defer vse.mu.Unlock() 274 delete(vse.rowStreamers, idx) 275 vse.wg.Done() 276 }() 277 278 // No lock is held while streaming, but wg is incremented. 279 return rowStreamer.Stream() 280 } 281 282 // StreamResults streams results of the query with the gtid. 283 func (vse *Engine) StreamResults(ctx context.Context, query string, send func(*binlogdatapb.VStreamResultsResponse) error) error { 284 // Create stream and add it to the map. 285 resultStreamer, idx, err := func() (*resultStreamer, int, error) { 286 if atomic.LoadInt32(&vse.isOpen) == 0 { 287 return nil, 0, errors.New("VStreamer is not open") 288 } 289 vse.mu.Lock() 290 defer vse.mu.Unlock() 291 resultStreamer := newResultStreamer(ctx, vse.env.Config().DB.FilteredWithDB(), query, send, vse) 292 idx := vse.streamIdx 293 vse.resultStreamers[idx] = resultStreamer 294 vse.streamIdx++ 295 // Now that we've added the stream, increment wg. 296 // This must be done before releasing the lock. 297 vse.wg.Add(1) 298 return resultStreamer, idx, nil 299 }() 300 if err != nil { 301 return err 302 } 303 304 // Remove stream from map and decrement wg when it ends. 305 defer func() { 306 vse.mu.Lock() 307 defer vse.mu.Unlock() 308 delete(vse.resultStreamers, idx) 309 vse.wg.Done() 310 }() 311 312 // No lock is held while streaming, but wg is incremented. 313 return resultStreamer.Stream() 314 } 315 316 // ServeHTTP shows the current VSchema. 317 func (vse *Engine) ServeHTTP(response http.ResponseWriter, request *http.Request) { 318 if err := acl.CheckAccessHTTP(request, acl.DEBUGGING); err != nil { 319 acl.SendError(response, err) 320 return 321 } 322 response.Header().Set("Content-Type", "application/json; charset=utf-8") 323 vs := vse.vschema() 324 if vs == nil { 325 response.Write([]byte("{}")) 326 } 327 b, err := json.MarshalIndent(vs, "", " ") 328 if err != nil { 329 response.Write([]byte(err.Error())) 330 return 331 } 332 buf := bytes.NewBuffer(nil) 333 json.HTMLEscape(buf, b) 334 response.Write(buf.Bytes()) 335 } 336 337 func (vse *Engine) setWatch() { 338 // If there's no toposerver, create an empty vschema. 339 if vse.ts == nil { 340 vse.lvschema = &localVSchema{ 341 keyspace: vse.keyspace, 342 vschema: &vindexes.VSchema{}, 343 } 344 return 345 } 346 347 // WatchSrvVSchema does not return until the inner func has been called at least once. 348 vse.ts.WatchSrvVSchema(context.TODO(), vse.cell, func(v *vschemapb.SrvVSchema, err error) bool { 349 switch { 350 case err == nil: 351 // Build vschema down below. 352 case topo.IsErrType(err, topo.NoNode): 353 v = nil 354 default: 355 log.Errorf("Error fetching vschema: %v", err) 356 vse.vschemaErrors.Add(1) 357 return true 358 } 359 var vschema *vindexes.VSchema 360 if v != nil { 361 vschema = vindexes.BuildVSchema(v) 362 if err != nil { 363 log.Errorf("Error building vschema: %v", err) 364 vse.vschemaErrors.Add(1) 365 return true 366 } 367 } else { 368 vschema = &vindexes.VSchema{} 369 } 370 371 // Broadcast the change to all streamers. 372 vse.mu.Lock() 373 defer vse.mu.Unlock() 374 vse.lvschema = &localVSchema{ 375 keyspace: vse.keyspace, 376 vschema: vschema, 377 } 378 b, _ := json.MarshalIndent(vschema, "", " ") 379 log.V(2).Infof("Updated vschema: %s", b) 380 for _, s := range vse.streamers { 381 s.SetVSchema(vse.lvschema) 382 } 383 vse.vschemaUpdates.Add(1) 384 return true 385 }) 386 } 387 388 func getPacketSize() int64 { 389 return int64(defaultPacketSize) 390 } 391 392 // waitForMySQL ensures that the source is able to stay within defined bounds for 393 // its MVCC history list (trx rollback segment linked list for old versions of rows 394 // that should be purged ASAP) and its replica lag (which will be -1 for non-replicas) 395 // to help ensure that the vstream does not have an outsized harmful impact on the 396 // source's ability to function normally. 397 func (vse *Engine) waitForMySQL(ctx context.Context, db dbconfigs.Connector, tableName string) error { 398 sourceEndpoint, _ := vse.getMySQLEndpoint(ctx, db) 399 backoff := 1 * time.Second 400 backoffLimit := backoff * 30 401 ready := false 402 recording := false 403 404 loopFunc := func() error { 405 // Exit if the context has been cancelled 406 if ctx.Err() != nil { 407 return ctx.Err() 408 } 409 // Check the config values each time as they can be updated in the running process via the /debug/env endpoint. 410 // This allows the user to break out of a wait w/o incurring any downtime or restarting the workflow if they 411 // need to. 412 mhll := vse.env.Config().RowStreamer.MaxInnoDBTrxHistLen 413 mrls := vse.env.Config().RowStreamer.MaxMySQLReplLagSecs 414 hll := vse.getInnoDBTrxHistoryLen(ctx, db) 415 rpl := vse.getMySQLReplicationLag(ctx, db) 416 if hll <= mhll && rpl <= mrls { 417 ready = true 418 } else { 419 log.Infof("VStream source (%s) is not ready to stream more rows. Max InnoDB history length is %d and it was %d, max replication lag is %d (seconds) and it was %d. Will pause and retry.", 420 sourceEndpoint, mhll, hll, mrls, rpl) 421 } 422 return nil 423 } 424 425 for { 426 if err := loopFunc(); err != nil { 427 return err 428 } 429 if ready { 430 break 431 } else { 432 if !recording { 433 defer func() { 434 ct := time.Now() 435 // Global row streamer waits on the source tablet 436 vse.rowStreamerWaits.Record("waitForMySQL", ct) 437 // Waits by the table we're copying from the source tablet 438 vse.vstreamerPhaseTimings.Record(fmt.Sprintf("%s:waitForMySQL", tableName), ct) 439 }() 440 recording = true 441 } 442 select { 443 case <-ctx.Done(): 444 return vterrors.Errorf(vtrpcpb.Code_CANCELED, "context has expired") 445 case <-time.After(backoff): 446 // Exponential backoff with 1.5 as a factor 447 if backoff != backoffLimit { 448 nb := time.Duration(float64(backoff) * 1.5) 449 if nb > backoffLimit { 450 backoff = backoffLimit 451 } else { 452 backoff = nb 453 } 454 } 455 } 456 } 457 } 458 459 return nil 460 } 461 462 // getInnoDBTrxHistoryLen attempts to query InnoDB's current transaction rollback segment's history 463 // list length. If the value cannot be determined for any reason then -1 is returned, which means 464 // "unknown". 465 func (vse *Engine) getInnoDBTrxHistoryLen(ctx context.Context, db dbconfigs.Connector) int64 { 466 histLen := int64(-1) 467 conn, err := db.Connect(ctx) 468 if err != nil { 469 return histLen 470 } 471 defer conn.Close() 472 473 res, err := conn.ExecuteFetch(trxHistoryLenQuery, 1, false) 474 if err != nil || len(res.Rows) != 1 || res.Rows[0] == nil { 475 return histLen 476 } 477 histLen, _ = res.Rows[0][0].ToInt64() 478 return histLen 479 } 480 481 // getMySQLReplicationLag attempts to get the seconds_behind_master value. 482 // If the value cannot be determined for any reason then -1 is returned, which 483 // means "unknown" or "irrelevant" (meaning it's not actively replicating). 484 func (vse *Engine) getMySQLReplicationLag(ctx context.Context, db dbconfigs.Connector) int64 { 485 lagSecs := int64(-1) 486 conn, err := db.Connect(ctx) 487 if err != nil { 488 return lagSecs 489 } 490 defer conn.Close() 491 492 res, err := conn.ExecuteFetch(replicaLagQuery, 1, true) 493 if err != nil || len(res.Rows) != 1 || res.Rows[0] == nil { 494 return lagSecs 495 } 496 row := res.Named().Row() 497 return row.AsInt64("Seconds_Behind_Master", -1) 498 } 499 500 // getMySQLEndpoint returns the host:port value for the vstreamer (MySQL) instance 501 func (vse *Engine) getMySQLEndpoint(ctx context.Context, db dbconfigs.Connector) (string, error) { 502 conn, err := db.Connect(ctx) 503 if err != nil { 504 return "", err 505 } 506 defer conn.Close() 507 508 res, err := conn.ExecuteFetch(hostQuery, 1, false) 509 if err != nil || len(res.Rows) != 1 || res.Rows[0] == nil { 510 return "", vterrors.Wrap(err, "could not get vstreamer MySQL endpoint") 511 } 512 host := res.Rows[0][0].ToString() 513 port, _ := res.Rows[0][1].ToInt64() 514 return fmt.Sprintf("%s:%d", host, port), nil 515 } 516 517 // mapPKEquivalentCols gets a PK equivalent from mysqld for the table 518 // and maps the column names to field indexes in the MinimalTable struct. 519 func (vse *Engine) mapPKEquivalentCols(ctx context.Context, table *binlogdatapb.MinimalTable) ([]int, error) { 520 mysqld := mysqlctl.NewMysqld(vse.env.Config().DB) 521 pkeColNames, err := mysqld.GetPrimaryKeyEquivalentColumns(ctx, vse.env.Config().DB.DBName, table.Name) 522 if err != nil { 523 return nil, err 524 } 525 pkeCols := make([]int, len(pkeColNames)) 526 matches := 0 527 for n, field := range table.Fields { 528 for i, pkeColName := range pkeColNames { 529 if strings.EqualFold(field.Name, pkeColName) { 530 pkeCols[i] = n 531 matches++ 532 break 533 } 534 } 535 if matches == len(pkeColNames) { 536 break 537 } 538 } 539 return pkeCols, nil 540 }