vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletmanager/vdiff/controller.go (about) 1 /* 2 Copyright 2022 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package vdiff 18 19 import ( 20 "context" 21 "errors" 22 "fmt" 23 "strings" 24 "time" 25 26 "vitess.io/vitess/go/vt/proto/tabletmanagerdata" 27 "vitess.io/vitess/go/vt/vterrors" 28 29 "google.golang.org/protobuf/encoding/prototext" 30 31 "vitess.io/vitess/go/mysql" 32 "vitess.io/vitess/go/sqltypes" 33 "vitess.io/vitess/go/vt/binlog/binlogplayer" 34 "vitess.io/vitess/go/vt/log" 35 binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" 36 vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" 37 "vitess.io/vitess/go/vt/topo" 38 "vitess.io/vitess/go/vt/vttablet/tmclient" 39 ) 40 41 /* 42 vdiff operation states: pending/started/stopped/completed/error/unknown 43 vdiff table states: pending/started/stopped/completed/error/unknown 44 */ 45 type VDiffState string //nolint 46 const ( 47 PendingState VDiffState = "pending" 48 StartedState VDiffState = "started" 49 StoppedState VDiffState = "stopped" 50 CompletedState VDiffState = "completed" 51 ErrorState VDiffState = "error" 52 UnknownState VDiffState = "" 53 TimestampFormat = "2006-01-02 15:04:05" 54 ) 55 56 type controller struct { 57 id int64 // id from row in _vt.vdiff 58 uuid string 59 workflow string 60 cancel context.CancelFunc 61 dbClientFactory func() binlogplayer.DBClient 62 ts *topo.Server 63 vde *Engine // the singleton vdiff engine 64 done chan struct{} 65 66 sources map[string]*migrationSource // currently picked source tablets for this shard's data 67 workflowFilter string 68 sourceKeyspace string 69 tmc tmclient.TabletManagerClient 70 71 targetShardStreamer *shardStreamer 72 filter *binlogdatapb.Filter // vreplication row filter 73 options *tabletmanagerdata.VDiffOptions // options initially from vtctld command and later from _vt.vdiff 74 75 sourceTimeZone, targetTimeZone string // named time zones if conversions are necessary for datetime values 76 77 externalCluster string // for Mount+Migrate 78 } 79 80 func newController(ctx context.Context, row sqltypes.RowNamedValues, dbClientFactory func() binlogplayer.DBClient, 81 ts *topo.Server, vde *Engine, options *tabletmanagerdata.VDiffOptions) (*controller, error) { 82 83 log.Infof("VDiff controller initializing for %+v", row) 84 id, _ := row["id"].ToInt64() 85 86 ct := &controller{ 87 id: id, 88 uuid: row["vdiff_uuid"].ToString(), 89 workflow: row["workflow"].ToString(), 90 dbClientFactory: dbClientFactory, 91 ts: ts, 92 vde: vde, 93 done: make(chan struct{}), 94 tmc: vde.tmClientFactory(), 95 sources: make(map[string]*migrationSource), 96 options: options, 97 } 98 ctx, ct.cancel = context.WithCancel(ctx) 99 go ct.run(ctx) 100 101 return ct, nil 102 } 103 104 func (ct *controller) Stop() { 105 ct.cancel() 106 <-ct.done 107 } 108 109 func (ct *controller) run(ctx context.Context) { 110 defer func() { 111 log.Infof("Run finished for vdiff %s", ct.uuid) 112 close(ct.done) 113 }() 114 115 dbClient := ct.vde.dbClientFactoryFiltered() 116 if err := dbClient.Connect(); err != nil { 117 log.Errorf("Encountered an error connecting to database for vdiff %s: %v", ct.uuid, err) 118 return 119 } 120 defer dbClient.Close() 121 122 qr, err := ct.vde.getVDiffByID(ctx, dbClient, ct.id) 123 if err != nil { 124 log.Errorf("Encountered an error getting vdiff record for %s: %v", ct.uuid, err) 125 return 126 } 127 128 row := qr.Named().Row() 129 state := VDiffState(strings.ToLower(row["state"].ToString())) 130 switch state { 131 case PendingState, StartedState: 132 action := "Starting" 133 if state == StartedState { 134 action = "Restarting" 135 } 136 log.Infof("%s vdiff %s", action, ct.uuid) 137 if err := ct.start(ctx, dbClient); err != nil { 138 log.Errorf("Encountered an error for vdiff %s: %s", ct.uuid, err) 139 if err := ct.saveErrorState(ctx, err); err != nil { 140 log.Errorf("Unable to save error state for vdiff %s; giving up because %s", ct.uuid, err.Error()) 141 } 142 } 143 default: 144 log.Infof("VDiff %s was not marked as runnable (state: %s), doing nothing", ct.uuid, state) 145 } 146 } 147 148 type migrationSource struct { 149 *shardStreamer 150 151 vrID int64 152 position mysql.Position 153 } 154 155 func (ct *controller) updateState(dbClient binlogplayer.DBClient, state VDiffState, err error) error { 156 extraCols := "" 157 switch state { 158 case StartedState: 159 extraCols = ", started_at = utc_timestamp()" 160 case CompletedState: 161 extraCols = ", completed_at = utc_timestamp()" 162 default: 163 } 164 if err == nil { 165 // Clear out any previous error for the vdiff on this shard 166 err = errors.New("") 167 } 168 query := fmt.Sprintf(sqlUpdateVDiffState, encodeString(string(state)), encodeString(err.Error()), extraCols, ct.id) 169 if _, err := dbClient.ExecuteFetch(query, 1); err != nil { 170 return err 171 } 172 insertVDiffLog(ct.vde.ctx, dbClient, ct.id, fmt.Sprintf("State changed to: %s", state)) 173 return nil 174 } 175 176 func (ct *controller) start(ctx context.Context, dbClient binlogplayer.DBClient) error { 177 select { 178 case <-ctx.Done(): 179 return vterrors.Errorf(vtrpcpb.Code_CANCELED, "context has expired") 180 default: 181 } 182 ct.workflowFilter = fmt.Sprintf("where workflow = %s and db_name = %s", encodeString(ct.workflow), encodeString(ct.vde.dbName)) 183 query := fmt.Sprintf(sqlGetVReplicationEntry, ct.workflowFilter) 184 qr, err := dbClient.ExecuteFetch(query, -1) 185 if err != nil { 186 return err 187 } 188 log.Infof("Found %d vreplication streams for %s", len(qr.Rows), ct.workflow) 189 for i, row := range qr.Named().Rows { 190 select { 191 case <-ctx.Done(): 192 return vterrors.Errorf(vtrpcpb.Code_CANCELED, "context has expired") 193 default: 194 } 195 source := newMigrationSource() 196 sourceBytes, err := row["source"].ToBytes() 197 if err != nil { 198 return err 199 } 200 var bls binlogdatapb.BinlogSource 201 if err := prototext.Unmarshal(sourceBytes, &bls); err != nil { 202 log.Errorf("Encountered an error unmarshalling vdiff binlog source for %s: %v", ct.uuid, err) 203 return err 204 } 205 source.shard = bls.Shard 206 source.vrID, _ = row["id"].ToInt64() 207 ct.sourceTimeZone = bls.SourceTimeZone 208 ct.targetTimeZone = bls.TargetTimeZone 209 210 if bls.ExternalCluster != "" { 211 ct.externalCluster = bls.ExternalCluster 212 } 213 214 ct.sources[source.shard] = source 215 if i == 0 { 216 ct.sourceKeyspace = bls.Keyspace 217 ct.filter = bls.Filter 218 } 219 } 220 221 if err := ct.validate(); err != nil { 222 return err 223 } 224 225 wd, err := newWorkflowDiffer(ct, ct.options) 226 if err != nil { 227 return err 228 } 229 if err := ct.updateState(dbClient, StartedState, nil); err != nil { 230 return err 231 } 232 if err := wd.diff(ctx); err != nil { 233 log.Errorf("Encountered an error performing workflow diff for vdiff %s: %v", ct.uuid, err) 234 return err 235 } 236 237 return nil 238 } 239 240 // markStoppedByRequest records the fact that this VDiff was stopped via user 241 // request and resets the error generated by cancelling the context to stop it: 242 // 243 // "vttablet: rpc error: code = Canceled desc = context canceled" 244 // 245 // This differentiates non-user requested stops that would occur e.g. during 246 // PlannedReparentShard or tablet restart, in those cases the error will be saved 247 // and will cause the VDiff to be retried ASAP -- which is NOT what we want here. 248 func (ct *controller) markStoppedByRequest() error { 249 dbClient := ct.vde.dbClientFactoryFiltered() 250 if err := dbClient.Connect(); err != nil { 251 return fmt.Errorf("encountered an error marking vdiff %s as stopped: %v", ct.uuid, err) 252 } 253 defer dbClient.Close() 254 255 query := fmt.Sprintf(sqlUpdateVDiffStopped, ct.id) 256 var res *sqltypes.Result 257 var err error 258 if res, err = dbClient.ExecuteFetch(query, 1); err != nil { 259 return fmt.Errorf("encountered an error marking vdiff %s as stopped: %v", ct.uuid, err) 260 } 261 // We don't mark it as stopped if it's already completed 262 if res.RowsAffected > 0 { 263 insertVDiffLog(ct.vde.ctx, dbClient, ct.id, fmt.Sprintf("State changed to: %s (by user request)", StoppedState)) 264 } 265 266 return nil 267 } 268 269 func newMigrationSource() *migrationSource { 270 return &migrationSource{shardStreamer: &shardStreamer{}} 271 } 272 273 func (ct *controller) validate() error { 274 // TODO: check if vreplication workflow has errors, what else? 275 return nil 276 } 277 278 // saveErrorState saves the error state for the vdiff in the database. 279 // It never gives up trying to save the error state, unless the context 280 // has been cancelled or the done channel has been closed -- indicating 281 // that the engine is closing or the vdiff has been explicitly stopped. 282 // Note that when the engine is later opened the started vdiff will be 283 // restarted even though we were unable to save the error state. 284 // It uses exponential backoff with a factor of 1.5 to avoid creating 285 // too many database connections. 286 func (ct *controller) saveErrorState(ctx context.Context, saveErr error) error { 287 retryDelay := 100 * time.Millisecond 288 maxRetryDelay := 60 * time.Second 289 save := func() error { 290 dbClient := ct.vde.dbClientFactoryFiltered() 291 if err := dbClient.Connect(); err != nil { 292 return err 293 } 294 defer dbClient.Close() 295 296 if err := ct.updateState(dbClient, ErrorState, saveErr); err != nil { 297 return err 298 } 299 insertVDiffLog(ctx, dbClient, ct.id, fmt.Sprintf("Error: %s", saveErr)) 300 301 return nil 302 } 303 304 for { 305 if err := save(); err != nil { 306 log.Warningf("Failed to persist vdiff error state: %v. Will retry in %s", err, retryDelay.String()) 307 select { 308 case <-ctx.Done(): 309 return fmt.Errorf("engine is shutting down") 310 case <-ct.done: 311 return fmt.Errorf("vdiff was stopped") 312 case <-time.After(retryDelay): 313 if retryDelay < maxRetryDelay { 314 retryDelay = time.Duration(float64(retryDelay) * 1.5) 315 if retryDelay > maxRetryDelay { 316 retryDelay = maxRetryDelay 317 } 318 } 319 continue 320 } 321 } 322 323 // Success 324 return nil 325 } 326 }