vitess.io/vitess@v0.16.2/go/vt/vttablet/tabletmanager/vreplication/controller.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package vreplication 18 19 import ( 20 "fmt" 21 "strconv" 22 "strings" 23 "time" 24 25 "google.golang.org/protobuf/encoding/prototext" 26 27 "vitess.io/vitess/go/vt/discovery" 28 "vitess.io/vitess/go/vt/vterrors" 29 30 "context" 31 32 "vitess.io/vitess/go/sync2" 33 "vitess.io/vitess/go/tb" 34 "vitess.io/vitess/go/vt/binlog/binlogplayer" 35 "vitess.io/vitess/go/vt/log" 36 "vitess.io/vitess/go/vt/mysqlctl" 37 "vitess.io/vitess/go/vt/topo" 38 39 binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" 40 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 41 ) 42 43 const ( 44 // How many times to retry tablet selection before we 45 // give up and return an error message that the user 46 // can see and act upon if needed. 47 tabletPickerRetries = 5 48 ) 49 50 // controller is created by Engine. Members are initialized upfront. 51 // There is no mutex within a controller becaust its members are 52 // either read-only or self-synchronized. 53 type controller struct { 54 vre *Engine 55 dbClientFactory func() binlogplayer.DBClient 56 mysqld mysqlctl.MysqlDaemon 57 blpStats *binlogplayer.Stats 58 59 id uint32 60 workflow string 61 source *binlogdatapb.BinlogSource 62 stopPos string 63 tabletPicker *discovery.TabletPicker 64 65 cancel context.CancelFunc 66 done chan struct{} 67 68 // The following fields are updated after start. So, they need synchronization. 69 sourceTablet sync2.AtomicString 70 71 lastWorkflowError *vterrors.LastError 72 } 73 74 // newController creates a new controller. Unless a stream is explicitly 'Stopped', 75 // this function launches a goroutine to perform continuous vreplication. 76 func newController(ctx context.Context, params map[string]string, dbClientFactory func() binlogplayer.DBClient, mysqld mysqlctl.MysqlDaemon, ts *topo.Server, cell, tabletTypesStr string, blpStats *binlogplayer.Stats, vre *Engine) (*controller, error) { 77 if blpStats == nil { 78 blpStats = binlogplayer.NewStats() 79 } 80 81 ct := &controller{ 82 vre: vre, 83 dbClientFactory: dbClientFactory, 84 mysqld: mysqld, 85 blpStats: blpStats, 86 done: make(chan struct{}), 87 source: &binlogdatapb.BinlogSource{}, 88 } 89 log.Infof("creating controller with cell: %v, tabletTypes: %v, and params: %v", cell, tabletTypesStr, params) 90 91 // id 92 id, err := strconv.Atoi(params["id"]) 93 if err != nil { 94 return nil, err 95 } 96 ct.id = uint32(id) 97 ct.workflow = params["workflow"] 98 ct.lastWorkflowError = vterrors.NewLastError(fmt.Sprintf("VReplication controller %d for workflow %q", ct.id, ct.workflow), maxTimeToRetryError) 99 100 state := params["state"] 101 blpStats.State.Set(state) 102 // Nothing to do if replication is stopped or is known to have an unrecoverable error. 103 if state == binlogplayer.BlpStopped || state == binlogplayer.BlpError { 104 ct.cancel = func() {} 105 close(ct.done) 106 return ct, nil 107 } 108 109 // source, stopPos 110 if err := prototext.Unmarshal([]byte(params["source"]), ct.source); err != nil { 111 return nil, err 112 } 113 ct.stopPos = params["stop_pos"] 114 115 if ct.source.GetExternalMysql() == "" { 116 // tabletPicker 117 if v := params["cell"]; v != "" { 118 cell = v 119 } 120 if v := params["tablet_types"]; v != "" { 121 tabletTypesStr = v 122 } 123 log.Infof("creating tablet picker for source keyspace/shard %v/%v with cell: %v and tabletTypes: %v", ct.source.Keyspace, ct.source.Shard, cell, tabletTypesStr) 124 cells := strings.Split(cell, ",") 125 126 sourceTopo := ts 127 if ct.source.ExternalCluster != "" { 128 sourceTopo, err = sourceTopo.OpenExternalVitessClusterServer(ctx, ct.source.ExternalCluster) 129 if err != nil { 130 return nil, err 131 } 132 } 133 tp, err := discovery.NewTabletPicker(sourceTopo, cells, ct.source.Keyspace, ct.source.Shard, tabletTypesStr) 134 if err != nil { 135 return nil, err 136 } 137 ct.tabletPicker = tp 138 } 139 140 // cancel 141 ctx, ct.cancel = context.WithCancel(ctx) 142 143 go ct.run(ctx) 144 145 return ct, nil 146 } 147 148 func (ct *controller) run(ctx context.Context) { 149 defer func() { 150 log.Infof("stream %v: stopped", ct.id) 151 close(ct.done) 152 }() 153 154 for { 155 err := ct.runBlp(ctx) 156 if err == nil { 157 return 158 } 159 160 // Sometimes, canceled contexts get wrapped as errors. 161 select { 162 case <-ctx.Done(): 163 log.Warningf("context canceled: %s", err.Error()) 164 return 165 default: 166 } 167 168 ct.blpStats.ErrorCounts.Add([]string{"Stream Error"}, 1) 169 binlogplayer.LogError(fmt.Sprintf("error in stream %v, retrying after %v", ct.id, retryDelay), err) 170 timer := time.NewTimer(retryDelay) 171 select { 172 case <-ctx.Done(): 173 log.Warningf("context canceled: %s", err.Error()) 174 timer.Stop() 175 return 176 case <-timer.C: 177 } 178 } 179 } 180 181 func (ct *controller) runBlp(ctx context.Context) (err error) { 182 defer func() { 183 ct.sourceTablet.Set("") 184 if x := recover(); x != nil { 185 log.Errorf("stream %v: caught panic: %v\n%s", ct.id, x, tb.Stack(4)) 186 err = fmt.Errorf("panic: %v", x) 187 } 188 }() 189 190 select { 191 case <-ctx.Done(): 192 return nil 193 default: 194 } 195 196 // Call this for youtube-specific customization. 197 // This should be done every time, in case mysql was restarted. 198 if err := ct.mysqld.EnableBinlogPlayback(); err != nil { 199 return err 200 } 201 202 dbClient := ct.dbClientFactory() 203 if err := dbClient.Connect(); err != nil { 204 return vterrors.Wrap(err, "can't connect to database") 205 } 206 defer dbClient.Close() 207 208 var tablet *topodatapb.Tablet 209 if ct.source.GetExternalMysql() == "" { 210 log.Infof("trying to find a tablet eligible for vreplication. stream id: %v", ct.id) 211 tpCtx, tpCancel := context.WithTimeout(ctx, discovery.GetTabletPickerRetryDelay()*tabletPickerRetries) 212 defer tpCancel() 213 tablet, err = ct.tabletPicker.PickForStreaming(tpCtx) 214 if err != nil { 215 select { 216 case <-ctx.Done(): 217 default: 218 ct.blpStats.ErrorCounts.Add([]string{"No Source Tablet Found"}, 1) 219 ct.setMessage(dbClient, fmt.Sprintf("Error picking tablet: %s", err.Error())) 220 } 221 return err 222 } 223 ct.setMessage(dbClient, fmt.Sprintf("Picked source tablet: %s", tablet.Alias.String())) 224 log.Infof("found a tablet eligible for vreplication. stream id: %v tablet: %s", ct.id, tablet.Alias.String()) 225 ct.sourceTablet.Set(tablet.Alias.String()) 226 } 227 switch { 228 case len(ct.source.Tables) > 0: 229 // Table names can have search patterns. Resolve them against the schema. 230 tables, err := mysqlctl.ResolveTables(ctx, ct.mysqld, dbClient.DBName(), ct.source.Tables) 231 if err != nil { 232 ct.blpStats.ErrorCounts.Add([]string{"Invalid Source"}, 1) 233 return vterrors.Wrap(err, "failed to resolve table names") 234 } 235 236 player := binlogplayer.NewBinlogPlayerTables(dbClient, tablet, tables, ct.id, ct.blpStats) 237 return player.ApplyBinlogEvents(ctx) 238 case ct.source.KeyRange != nil: 239 player := binlogplayer.NewBinlogPlayerKeyRange(dbClient, tablet, ct.source.KeyRange, ct.id, ct.blpStats) 240 return player.ApplyBinlogEvents(ctx) 241 case ct.source.Filter != nil: 242 // Timestamp fields from binlogs are always sent as UTC. 243 // So, we should set the timezone to be UTC for those values to be correctly inserted. 244 if _, err := dbClient.ExecuteFetch("set @@session.time_zone = '+00:00'", 10000); err != nil { 245 return err 246 } 247 // Tables may have varying character sets. To ship the bits without interpreting them 248 // we set the character set to be binary. 249 if _, err := dbClient.ExecuteFetch("set names binary", 10000); err != nil { 250 return err 251 } 252 // We must apply AUTO_INCREMENT values precisely as we got them. This include the 0 value, which is not recommended in AUTO_INCREMENT, and yet is valid. 253 if _, err := dbClient.ExecuteFetch("set @@session.sql_mode = CONCAT(@@session.sql_mode, ',NO_AUTO_VALUE_ON_ZERO')", 10000); err != nil { 254 return err 255 } 256 257 var vsClient VStreamerClient 258 var err error 259 if name := ct.source.GetExternalMysql(); name != "" { 260 vsClient, err = ct.vre.ec.Get(name) 261 if err != nil { 262 return err 263 } 264 } else { 265 vsClient = newTabletConnector(tablet) 266 } 267 if err := vsClient.Open(ctx); err != nil { 268 return err 269 } 270 defer vsClient.Close(ctx) 271 272 vr := newVReplicator(ct.id, ct.source, vsClient, ct.blpStats, dbClient, ct.mysqld, ct.vre) 273 err = vr.Replicate(ctx) 274 ct.lastWorkflowError.Record(err) 275 // If this is a mysql error that we know needs manual intervention OR 276 // we cannot identify this as non-recoverable, but it has persisted beyond the retry limit (maxTimeToRetryError) 277 if isUnrecoverableError(err) || !ct.lastWorkflowError.ShouldRetry() { 278 log.Errorf("vreplication stream %d going into error state due to %+v", ct.id, err) 279 if errSetState := vr.setState(binlogplayer.BlpError, err.Error()); errSetState != nil { 280 return err // yes, err and not errSetState. 281 } 282 return nil // this will cause vreplicate to quit the workflow 283 } 284 return err 285 } 286 ct.blpStats.ErrorCounts.Add([]string{"Invalid Source"}, 1) 287 return fmt.Errorf("missing source") 288 } 289 290 func (ct *controller) setMessage(dbClient binlogplayer.DBClient, message string) error { 291 ct.blpStats.History.Add(&binlogplayer.StatsHistoryRecord{ 292 Time: time.Now(), 293 Message: message, 294 }) 295 query := fmt.Sprintf("update _vt.vreplication set message=%v where id=%v", encodeString(binlogplayer.MessageTruncate(message)), ct.id) 296 if _, err := dbClient.ExecuteFetch(query, 1); err != nil { 297 return fmt.Errorf("could not set message: %v: %v", query, err) 298 } 299 return nil 300 } 301 func (ct *controller) Stop() { 302 ct.cancel() 303 <-ct.done 304 }