vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletmanager/vdiff/engine.go (about) 1 /* 2 Copyright 2022 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package vdiff 18 19 import ( 20 "context" 21 "encoding/json" 22 "errors" 23 "fmt" 24 "sync" 25 "time" 26 27 "vitess.io/vitess/go/mysql" 28 "vitess.io/vitess/go/vt/proto/tabletmanagerdata" 29 "vitess.io/vitess/go/vt/proto/topodata" 30 "vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication" 31 "vitess.io/vitess/go/vt/vttablet/tmclient" 32 33 "vitess.io/vitess/go/sqltypes" 34 "vitess.io/vitess/go/sync2" 35 "vitess.io/vitess/go/vt/binlog/binlogplayer" 36 "vitess.io/vitess/go/vt/dbconfigs" 37 "vitess.io/vitess/go/vt/log" 38 "vitess.io/vitess/go/vt/topo" 39 "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" 40 ) 41 42 type Engine struct { 43 isOpen bool 44 45 mu sync.Mutex // guards controllers 46 controllers map[int64]*controller 47 48 // ctx is the root context for all controllers 49 ctx context.Context 50 cancel context.CancelFunc 51 cancelRetry context.CancelFunc 52 53 ts *topo.Server 54 tmClientFactory func() tmclient.TabletManagerClient 55 dbClientFactoryFiltered func() binlogplayer.DBClient 56 dbClientFactoryDba func() binlogplayer.DBClient 57 dbName string 58 59 vre *vreplication.Engine 60 61 wg sync.WaitGroup 62 thisTablet *topodata.Tablet 63 64 // snapshotMu is used to ensure that only one vdiff snapshot cycle is active at a time, 65 // because we stop/start vreplication workflows during this process 66 snapshotMu sync.Mutex 67 68 vdiffSchemaCreateOnce sync.Once 69 70 // This should only be set when the engine is being used in tests. It then provides 71 // modified behavior for that env, e.g. not starting the retry goroutine. This should 72 // NOT be set in production. 73 fortests bool 74 } 75 76 func NewEngine(config *tabletenv.TabletConfig, ts *topo.Server, tablet *topodata.Tablet) *Engine { 77 vde := &Engine{ 78 controllers: make(map[int64]*controller), 79 ts: ts, 80 thisTablet: tablet, 81 tmClientFactory: func() tmclient.TabletManagerClient { return tmclient.NewTabletManagerClient() }, 82 } 83 return vde 84 } 85 86 // NewTestEngine creates an Engine for use in tests. It uses the custom db client factory and 87 // tablet manager client factory, while setting the fortests field to true to modify any engine 88 // behavior when used in tests (e.g. not starting the retry goroutine). 89 func NewTestEngine(ts *topo.Server, tablet *topodata.Tablet, dbn string, dbcf func() binlogplayer.DBClient, tmcf func() tmclient.TabletManagerClient) *Engine { 90 vde := &Engine{ 91 controllers: make(map[int64]*controller), 92 ts: ts, 93 thisTablet: tablet, 94 dbName: dbn, 95 dbClientFactoryFiltered: dbcf, 96 dbClientFactoryDba: dbcf, 97 tmClientFactory: tmcf, 98 fortests: true, 99 } 100 return vde 101 } 102 103 func (vde *Engine) InitDBConfig(dbcfgs *dbconfigs.DBConfigs) { 104 // If it's a test engine and we're already initilized then do nothing. 105 if vde.fortests && vde.dbClientFactoryFiltered != nil && vde.dbClientFactoryDba != nil { 106 return 107 } 108 vde.dbClientFactoryFiltered = func() binlogplayer.DBClient { 109 return binlogplayer.NewDBClient(dbcfgs.FilteredWithDB()) 110 } 111 vde.dbClientFactoryDba = func() binlogplayer.DBClient { 112 return binlogplayer.NewDBClient(dbcfgs.DbaWithDB()) 113 } 114 vde.dbName = dbcfgs.DBName 115 } 116 117 func (vde *Engine) Open(ctx context.Context, vre *vreplication.Engine) { 118 vde.mu.Lock() 119 defer vde.mu.Unlock() 120 if vde.ts == nil || vde.isOpen { 121 return 122 } 123 log.Infof("VDiff Engine: opening...") 124 125 if vde.cancelRetry != nil { 126 vde.cancelRetry() 127 vde.cancelRetry = nil 128 } 129 vde.vre = vre 130 if err := vde.openLocked(ctx); err != nil { 131 log.Infof("openLocked error: %s", err) 132 ctx, cancel := context.WithCancel(ctx) 133 vde.cancelRetry = cancel 134 go vde.retry(ctx, err) 135 } 136 } 137 138 func (vde *Engine) openLocked(ctx context.Context) error { 139 // This should never happen 140 if len(vde.controllers) > 0 { 141 log.Warningf("VDiff Engine invalid state detected: %d controllers existed when opening; resetting state", len(vde.controllers)) 142 vde.resetControllers() 143 } 144 145 // At this point the tablet has no controllers running. So 146 // we want to start any VDiffs that have not been explicitly 147 // stopped or otherwise finished. 148 rows, err := vde.getVDiffsToRun(ctx) 149 if err != nil { 150 return err 151 } 152 vde.ctx, vde.cancel = context.WithCancel(ctx) 153 vde.isOpen = true // now we are open and have things to close 154 if err := vde.initControllers(rows); err != nil { 155 return err 156 } 157 158 // At this point we've fully and succesfully opened so begin 159 // retrying error'd VDiffs until the engine is closed. 160 vde.wg.Add(1) 161 go func() { 162 defer vde.wg.Done() 163 if vde.fortests { 164 return 165 } 166 vde.retryErroredVDiffs() 167 }() 168 169 return nil 170 } 171 172 var openRetryInterval = sync2.NewAtomicDuration(1 * time.Second) 173 174 func (vde *Engine) retry(ctx context.Context, err error) { 175 log.Errorf("Error starting vdiff engine: %v, will keep retrying.", err) 176 for { 177 timer := time.NewTimer(openRetryInterval.Get()) 178 select { 179 case <-ctx.Done(): 180 timer.Stop() 181 return 182 case <-timer.C: 183 } 184 vde.mu.Lock() 185 // Recheck the context within the lock. 186 // This guarantees that we will not retry 187 // after the context was canceled. This 188 // can almost never happen. 189 select { 190 case <-ctx.Done(): 191 vde.mu.Unlock() 192 return 193 default: 194 } 195 if err := vde.openLocked(ctx); err == nil { 196 log.Infof("VDiff engine: opened successfully") 197 // Don't invoke cancelRetry because openLocked 198 // will hold on to this context for later cancelation. 199 vde.cancelRetry = nil 200 vde.mu.Unlock() 201 return 202 } 203 vde.mu.Unlock() 204 } 205 } 206 207 // addController creates a new controller using the given vdiff record and adds it to the engine. 208 // You must already have the main engine mutex (mu) locked before calling this. 209 func (vde *Engine) addController(row sqltypes.RowNamedValues, options *tabletmanagerdata.VDiffOptions) error { 210 ct, err := newController(vde.ctx, row, vde.dbClientFactoryDba, vde.ts, vde, options) 211 if err != nil { 212 return fmt.Errorf("controller could not be initialized for stream %+v on tablet %v", 213 row, vde.thisTablet.Alias) 214 } 215 vde.controllers[ct.id] = ct 216 return nil 217 } 218 219 func (vde *Engine) initControllers(qr *sqltypes.Result) error { 220 if qr == nil || len(qr.Rows) == 0 { 221 return nil 222 } 223 for _, row := range qr.Named().Rows { 224 options := &tabletmanagerdata.VDiffOptions{} 225 if err := json.Unmarshal(row.AsBytes("options", []byte("{}")), options); err != nil { 226 return err 227 } 228 if err := vde.addController(row, options); err != nil { 229 return err 230 } 231 } 232 return nil 233 } 234 235 // IsOpen returns true if Engine is open. 236 func (vde *Engine) IsOpen() bool { 237 vde.mu.Lock() 238 defer vde.mu.Unlock() 239 return vde.isOpen 240 } 241 242 // Close closes the Engine service. 243 func (vde *Engine) Close() { 244 vde.mu.Lock() 245 defer vde.mu.Unlock() 246 247 // If we're retrying, we're not open. 248 // Just cancel the retry loop. 249 if vde.cancelRetry != nil { 250 vde.cancelRetry() 251 vde.cancelRetry = nil 252 return 253 } 254 255 if !vde.isOpen { 256 return 257 } 258 259 vde.cancel() 260 261 // We still have to wait for all controllers to stop. 262 vde.resetControllers() 263 264 // Wait for long-running functions to exit. 265 vde.wg.Wait() 266 267 vde.isOpen = false 268 269 log.Infof("VDiff Engine: closed") 270 } 271 272 func (vde *Engine) getVDiffsToRun(ctx context.Context) (*sqltypes.Result, error) { 273 dbClient := vde.dbClientFactoryFiltered() 274 if err := dbClient.Connect(); err != nil { 275 return nil, err 276 } 277 defer dbClient.Close() 278 279 // We have to use ExecIgnore here so as not to block quick tablet state 280 // transitions from primary to non-primary when starting the engine 281 qr, err := dbClient.ExecuteFetch(sqlGetVDiffsToRun, -1) 282 if err != nil { 283 return nil, err 284 } 285 if len(qr.Rows) == 0 { 286 return nil, nil 287 } 288 return qr, nil 289 } 290 291 func (vde *Engine) getVDiffsToRetry(ctx context.Context, dbClient binlogplayer.DBClient) (*sqltypes.Result, error) { 292 qr, err := dbClient.ExecuteFetch(sqlGetVDiffsToRetry, -1) 293 if err != nil { 294 return nil, err 295 } 296 if len(qr.Rows) == 0 { 297 return nil, nil 298 } 299 return qr, nil 300 } 301 302 func (vde *Engine) getVDiffByID(ctx context.Context, dbClient binlogplayer.DBClient, id int64) (*sqltypes.Result, error) { 303 qr, err := dbClient.ExecuteFetch(fmt.Sprintf(sqlGetVDiffByID, id), -1) 304 if err != nil { 305 return nil, err 306 } 307 if len(qr.Rows) != 1 { 308 return nil, fmt.Errorf("no vdiff found for id %d on tablet %v", 309 id, vde.thisTablet.Alias) 310 } 311 return qr, nil 312 } 313 314 func (vde *Engine) retryVDiffs(ctx context.Context) error { 315 vde.mu.Lock() 316 defer vde.mu.Unlock() 317 dbClient := vde.dbClientFactoryFiltered() 318 if err := dbClient.Connect(); err != nil { 319 return err 320 } 321 defer dbClient.Close() 322 323 qr, err := vde.getVDiffsToRetry(ctx, dbClient) 324 if err != nil { 325 return err 326 } 327 if qr == nil || len(qr.Rows) == 0 { 328 return nil 329 } 330 for _, row := range qr.Named().Rows { 331 select { 332 case <-ctx.Done(): 333 return ctx.Err() 334 default: 335 } 336 lastError := mysql.NewSQLErrorFromError(errors.New(row.AsString("last_error", ""))) 337 if !mysql.IsEphemeralError(lastError) { 338 continue 339 } 340 uuid := row.AsString("vdiff_uuid", "") 341 id, err := row.ToInt64("id") 342 if err != nil { 343 return err 344 } 345 log.Infof("Retrying vdiff %s that had an ephemeral error of '%v'", uuid, lastError) 346 if _, err = dbClient.ExecuteFetch(fmt.Sprintf(sqlRetryVDiff, id), 1); err != nil { 347 return err 348 } 349 options := &tabletmanagerdata.VDiffOptions{} 350 if err := json.Unmarshal(row.AsBytes("options", []byte("{}")), options); err != nil { 351 return err 352 } 353 if err := vde.addController(row, options); err != nil { 354 return err 355 } 356 } 357 return nil 358 } 359 360 func (vde *Engine) retryErroredVDiffs() { 361 tkr := time.NewTicker(time.Second * 30) 362 defer tkr.Stop() 363 for { 364 select { 365 case <-vde.ctx.Done(): 366 log.Info("VDiff engine: closing...") 367 return 368 case <-tkr.C: 369 } 370 371 if err := vde.retryVDiffs(vde.ctx); err != nil { 372 log.Errorf("Error retrying vdiffs: %v", err) 373 } 374 } 375 } 376 377 func (vde *Engine) resetControllers() { 378 for _, ct := range vde.controllers { 379 ct.Stop() 380 } 381 vde.controllers = make(map[int64]*controller) 382 }