github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/sqle/cluster/mysqldb_persister.go (about) 1 // Copyright 2023 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package cluster 16 17 import ( 18 "context" 19 "errors" 20 "fmt" 21 "sync" 22 "time" 23 24 "github.com/cenkalti/backoff/v4" 25 "github.com/dolthub/go-mysql-server/sql" 26 "github.com/dolthub/go-mysql-server/sql/mysql_db" 27 "github.com/sirupsen/logrus" 28 29 replicationapi "github.com/dolthub/dolt/go/gen/proto/dolt/services/replicationapi/v1alpha1" 30 "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" 31 "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" 32 ) 33 34 type MySQLDbPersister interface { 35 mysql_db.MySQLDbPersistence 36 LoadData(context.Context) ([]byte, error) 37 } 38 39 type replicatingMySQLDbPersister struct { 40 base MySQLDbPersister 41 42 current []byte 43 version uint32 44 replicas []*mysqlDbReplica 45 46 mu sync.Mutex 47 } 48 49 type mysqlDbReplica struct { 50 shutdown bool 51 role Role 52 53 contents []byte 54 version uint32 55 56 replicatedVersion uint32 57 backoff backoff.BackOff 58 nextAttempt time.Time 59 60 client *replicationServiceClient 61 lgr *logrus.Entry 62 63 waitNotify func() 64 65 progressNotifier ProgressNotifier 66 fastFailReplicationWait bool 67 68 mu sync.Mutex 69 cond *sync.Cond 70 } 71 72 func (r *mysqlDbReplica) UpdateMySQLDb(ctx context.Context, contents []byte, version uint32) func(context.Context) error { 73 r.mu.Lock() 74 defer r.mu.Unlock() 75 r.lgr.Infof("mysqlDbReplica got new contents at version %d", version) 76 r.contents = contents 77 r.version = version 78 r.nextAttempt = time.Time{} 79 r.backoff.Reset() 80 r.cond.Broadcast() 81 82 if r.fastFailReplicationWait { 83 remote := r.client.remote 84 return func(ctx context.Context) error { 85 return fmt.Errorf("circuit breaker for replication to %s/mysql is open. this update to users and grants did not necessarily replicate successfully.", remote) 86 } 87 } else { 88 w := r.progressNotifier.Wait() 89 return func(ctx context.Context) error { 90 err := w(ctx) 91 if err != nil && errors.Is(err, doltdb.ErrReplicationWaitFailed) { 92 r.setFastFailReplicationWait(true) 93 } 94 return err 95 } 96 } 97 } 98 99 func (r *mysqlDbReplica) setFastFailReplicationWait(v bool) { 100 r.mu.Lock() 101 defer r.mu.Unlock() 102 r.fastFailReplicationWait = v 103 } 104 105 func (r *mysqlDbReplica) Run() { 106 r.mu.Lock() 107 defer r.mu.Unlock() 108 r.lgr.Tracef("mysqlDbReplica[%s]: running", r.client.remote) 109 defer r.client.closer() 110 for !r.shutdown { 111 if r.role != RolePrimary { 112 r.wait() 113 continue 114 } 115 if r.version == 0 { 116 r.wait() 117 continue 118 } 119 if r.replicatedVersion == r.version { 120 r.wait() 121 continue 122 } 123 if r.nextAttempt.After(time.Now()) { 124 r.wait() 125 continue 126 } 127 if len(r.contents) > 0 { 128 // We do not call into the client with the lock held 129 // here. Client interceptors could call 130 // `controller.setRoleAndEpoch()`, which will call back 131 // into this replica with the new role. We need to 132 // release this lock in order to avoid deadlock. 133 contents := r.contents 134 client := r.client.client 135 version := r.version 136 attempt := r.progressNotifier.BeginAttempt() 137 r.mu.Unlock() 138 ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) 139 _, err := client.UpdateUsersAndGrants(ctx, &replicationapi.UpdateUsersAndGrantsRequest{ 140 SerializedContents: contents, 141 }) 142 cancel() 143 r.mu.Lock() 144 if err != nil { 145 r.progressNotifier.RecordFailure(attempt) 146 r.lgr.Warnf("mysqlDbReplica[%s]: error replicating users and grants. backing off. %v", r.client.remote, err) 147 r.nextAttempt = time.Now().Add(r.backoff.NextBackOff()) 148 next := r.nextAttempt 149 go func() { 150 <-time.After(time.Until(next)) 151 r.mu.Lock() 152 defer r.mu.Unlock() 153 if r.nextAttempt == next { 154 r.nextAttempt = time.Time{} 155 } 156 r.cond.Broadcast() 157 }() 158 continue 159 } 160 r.progressNotifier.RecordSuccess(attempt) 161 r.fastFailReplicationWait = false 162 r.backoff.Reset() 163 r.lgr.Debugf("mysqlDbReplica[%s]: sucessfully replicated users and grants at version %d.", r.client.remote, version) 164 r.replicatedVersion = version 165 } else { 166 r.lgr.Debugf("mysqlDbReplica[%s]: not replicating empty users and grants at version %d.", r.client.remote, r.version) 167 r.replicatedVersion = r.version 168 } 169 } 170 } 171 172 func (r *mysqlDbReplica) isCaughtUp() bool { 173 return r.version == r.replicatedVersion || r.role != RolePrimary 174 } 175 176 func (r *mysqlDbReplica) setWaitNotify(notify func()) bool { 177 r.mu.Lock() 178 defer r.mu.Unlock() 179 if notify != nil { 180 if r.waitNotify != nil { 181 return false 182 } 183 notify() 184 } 185 r.waitNotify = notify 186 return true 187 } 188 189 func (r *mysqlDbReplica) wait() { 190 if r.waitNotify != nil { 191 r.waitNotify() 192 } 193 r.lgr.Infof("mysqlDbReplica waiting...") 194 if r.isCaughtUp() { 195 attempt := r.progressNotifier.BeginAttempt() 196 r.progressNotifier.RecordSuccess(attempt) 197 } 198 r.cond.Wait() 199 } 200 201 func (r *mysqlDbReplica) GracefulStop() { 202 r.mu.Lock() 203 defer r.mu.Unlock() 204 r.shutdown = true 205 r.cond.Broadcast() 206 } 207 208 func (r *mysqlDbReplica) setRole(role Role) { 209 r.mu.Lock() 210 defer r.mu.Unlock() 211 r.role = role 212 r.nextAttempt = time.Time{} 213 r.backoff.Reset() 214 r.cond.Broadcast() 215 } 216 217 func (p *replicatingMySQLDbPersister) setRole(role Role) { 218 for _, r := range p.replicas { 219 r.setRole(role) 220 } 221 p.mu.Lock() 222 // If we are transitioning to primary and we are already initialized, 223 // then we reload data so that we have the most recent persisted users 224 // and grants to replicate. 225 needsLoad := p.version != 0 && role == RolePrimary 226 p.mu.Unlock() 227 if needsLoad { 228 p.LoadData(context.Background()) 229 } 230 } 231 232 func (p *replicatingMySQLDbPersister) Run() { 233 var wg sync.WaitGroup 234 for _, r := range p.replicas { 235 r := r 236 wg.Add(1) 237 go func() { 238 defer wg.Done() 239 r.Run() 240 }() 241 } 242 wg.Wait() 243 } 244 245 func (p *replicatingMySQLDbPersister) GracefulStop() { 246 for _, r := range p.replicas { 247 r.GracefulStop() 248 } 249 } 250 251 func (p *replicatingMySQLDbPersister) Persist(ctx *sql.Context, data []byte) error { 252 p.mu.Lock() 253 err := p.base.Persist(ctx, data) 254 if err == nil { 255 p.current = data 256 p.version += 1 257 var rsc doltdb.ReplicationStatusController 258 rsc.Wait = make([]func(context.Context) error, len(p.replicas)) 259 rsc.NotifyWaitFailed = make([]func(), len(p.replicas)) 260 for i, r := range p.replicas { 261 rsc.Wait[i] = r.UpdateMySQLDb(ctx, p.current, p.version) 262 rsc.NotifyWaitFailed[i] = func() {} 263 } 264 p.mu.Unlock() 265 dsess.WaitForReplicationController(ctx, rsc) 266 } else { 267 p.mu.Unlock() 268 } 269 return err 270 } 271 272 func (p *replicatingMySQLDbPersister) LoadData(ctx context.Context) ([]byte, error) { 273 p.mu.Lock() 274 defer p.mu.Unlock() 275 ret, err := p.base.LoadData(ctx) 276 if err == nil { 277 p.current = ret 278 p.version += 1 279 for _, r := range p.replicas { 280 r.UpdateMySQLDb(ctx, p.current, p.version) 281 } 282 } 283 return ret, err 284 } 285 286 func (p *replicatingMySQLDbPersister) waitForReplication(timeout time.Duration) ([]graceTransitionResult, error) { 287 p.mu.Lock() 288 replicas := make([]*mysqlDbReplica, len(p.replicas)) 289 copy(replicas, p.replicas) 290 res := make([]graceTransitionResult, len(replicas)) 291 for i := range replicas { 292 res[i].database = "mysql" 293 res[i].remote = replicas[i].client.remote 294 res[i].remoteUrl = replicas[i].client.httpUrl() 295 } 296 var wg sync.WaitGroup 297 wg.Add(len(replicas)) 298 for li, lr := range replicas { 299 i := li 300 r := lr 301 ok := r.setWaitNotify(func() { 302 // called with r.mu locked. 303 if !res[i].caughtUp { 304 if r.isCaughtUp() { 305 res[i].caughtUp = true 306 wg.Done() 307 } else { 308 } 309 } 310 }) 311 if !ok { 312 for j := li - 1; j >= 0; j-- { 313 replicas[j].setWaitNotify(nil) 314 } 315 return nil, errors.New("cluster: mysqldb replication: could not wait for replication. Concurrent waiters conflicted with each other.") 316 } 317 } 318 p.mu.Unlock() 319 320 done := make(chan struct{}) 321 go func() { 322 wg.Wait() 323 close(done) 324 }() 325 select { 326 case <-done: 327 case <-time.After(timeout): 328 } 329 330 p.mu.Lock() 331 defer p.mu.Unlock() 332 for _, r := range replicas { 333 r.setWaitNotify(nil) 334 } 335 336 // Make certain we don't leak the wg.Wait goroutine in the failure case. 337 // At this point, none of the callbacks will ever be called again and 338 // ch.setWaitNotify grabs a lock and so establishes the happens before. 339 for _, b := range res { 340 if !b.caughtUp { 341 wg.Done() 342 } 343 } 344 <-done 345 346 return res, nil 347 }