github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/sqle/cluster/branch_control_replica.go (about) 1 // Copyright 2023 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package cluster 16 17 import ( 18 "context" 19 "errors" 20 "fmt" 21 "sync" 22 "time" 23 24 "github.com/cenkalti/backoff/v4" 25 "github.com/sirupsen/logrus" 26 27 replicationapi "github.com/dolthub/dolt/go/gen/proto/dolt/services/replicationapi/v1alpha1" 28 "github.com/dolthub/dolt/go/libraries/doltcore/branch_control" 29 "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" 30 ) 31 32 type branchControlReplication struct { 33 current []byte 34 version uint32 35 replicas []*branchControlReplica 36 37 bcController *branch_control.Controller 38 39 mu sync.Mutex 40 } 41 42 type branchControlReplica struct { 43 shutdown bool 44 role Role 45 46 contents []byte 47 version uint32 48 replicatedVersion uint32 49 50 backoff backoff.BackOff 51 nextAttempt time.Time 52 53 client *replicationServiceClient 54 lgr *logrus.Entry 55 56 waitNotify func() 57 58 progressNotifier ProgressNotifier 59 fastFailReplicationWait bool 60 61 mu sync.Mutex 62 cond *sync.Cond 63 } 64 65 func (r *branchControlReplica) UpdateContents(contents []byte, version uint32) func(context.Context) error { 66 r.mu.Lock() 67 defer r.mu.Unlock() 68 r.contents = contents 69 r.version = version 70 r.nextAttempt = time.Time{} 71 r.backoff.Reset() 72 r.cond.Broadcast() 73 if r.fastFailReplicationWait { 74 remote := r.client.remote 75 return func(ctx context.Context) error { 76 return fmt.Errorf("circuit breaker for replication to %s/dolt_branch_control is open. this branch control update did not necessarily replicate successfully.", remote) 77 } 78 } 79 w := r.progressNotifier.Wait() 80 return func(ctx context.Context) error { 81 err := w(ctx) 82 if err != nil && errors.Is(err, doltdb.ErrReplicationWaitFailed) { 83 r.setFastFailReplicationWait(true) 84 } 85 return err 86 } 87 } 88 89 func (r *branchControlReplica) Run() { 90 r.mu.Lock() 91 defer r.mu.Unlock() 92 r.lgr.Tracef("branchControlReplica[%s]: running", r.client.remote) 93 for !r.shutdown { 94 if r.role != RolePrimary { 95 r.wait() 96 continue 97 } 98 if r.version == 0 { 99 r.wait() 100 continue 101 } 102 if r.replicatedVersion == r.version { 103 r.wait() 104 continue 105 } 106 if r.nextAttempt.After(time.Now()) { 107 r.wait() 108 continue 109 } 110 // We do not call into the client with the lock held here. 111 // Client interceptors could call 112 // `controller.setRoleAndEpoch()`, which will call back into 113 // this replica with the new role. We need to release this lock 114 // in order to avoid deadlock. 115 contents := r.contents 116 client := r.client.client 117 version := r.version 118 attempt := r.progressNotifier.BeginAttempt() 119 r.mu.Unlock() 120 ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) 121 _, err := client.UpdateBranchControl(ctx, &replicationapi.UpdateBranchControlRequest{ 122 SerializedContents: contents, 123 }) 124 cancel() 125 r.mu.Lock() 126 if err != nil { 127 r.progressNotifier.RecordFailure(attempt) 128 r.lgr.Warnf("branchControlReplica[%s]: error replicating branch control permissions. backing off. %v", r.client.remote, err) 129 r.nextAttempt = time.Now().Add(r.backoff.NextBackOff()) 130 next := r.nextAttempt 131 go func() { 132 <-time.After(time.Until(next)) 133 r.mu.Lock() 134 defer r.mu.Unlock() 135 if r.nextAttempt == next { 136 r.nextAttempt = time.Time{} 137 } 138 r.cond.Broadcast() 139 }() 140 continue 141 } 142 r.progressNotifier.RecordSuccess(attempt) 143 r.fastFailReplicationWait = false 144 r.backoff.Reset() 145 r.lgr.Debugf("branchControlReplica[%s]: sucessfully replicated branch control permissions.", r.client.remote) 146 r.replicatedVersion = version 147 } 148 } 149 150 func (r *branchControlReplica) wait() { 151 if r.waitNotify != nil { 152 r.waitNotify() 153 } 154 if r.isCaughtUp() { 155 attempt := r.progressNotifier.BeginAttempt() 156 r.progressNotifier.RecordSuccess(attempt) 157 } 158 r.cond.Wait() 159 } 160 161 func (r *branchControlReplica) isCaughtUp() bool { 162 return r.version == r.replicatedVersion || r.role != RolePrimary 163 } 164 165 func (r *branchControlReplica) setFastFailReplicationWait(v bool) { 166 r.mu.Lock() 167 defer r.mu.Unlock() 168 r.fastFailReplicationWait = v 169 } 170 171 func (r *branchControlReplica) setWaitNotify(notify func()) bool { 172 r.mu.Lock() 173 defer r.mu.Unlock() 174 if notify != nil { 175 if r.waitNotify != nil { 176 return false 177 } 178 notify() 179 } 180 r.waitNotify = notify 181 return true 182 } 183 184 func (r *branchControlReplica) GracefulStop() { 185 r.mu.Lock() 186 defer r.mu.Unlock() 187 r.shutdown = true 188 r.cond.Broadcast() 189 } 190 191 func (r *branchControlReplica) setRole(role Role) { 192 r.mu.Lock() 193 defer r.mu.Unlock() 194 r.role = role 195 r.nextAttempt = time.Time{} 196 r.fastFailReplicationWait = false 197 r.cond.Broadcast() 198 } 199 200 func (p *branchControlReplication) setRole(role Role) { 201 if role == RolePrimary { 202 cur := p.bcController.Serialized.Load() 203 if cur == nil { 204 p.UpdateBranchControlContents(context.Background(), []byte{}, nil) 205 } else { 206 p.UpdateBranchControlContents(context.Background(), *cur, nil) 207 } 208 } 209 for _, r := range p.replicas { 210 r.setRole(role) 211 } 212 } 213 214 func (p *branchControlReplication) Run() { 215 var wg sync.WaitGroup 216 for _, r := range p.replicas { 217 r := r 218 wg.Add(1) 219 go func() { 220 defer wg.Done() 221 r.Run() 222 }() 223 } 224 wg.Wait() 225 } 226 227 func (p *branchControlReplication) GracefulStop() { 228 for _, r := range p.replicas { 229 r.GracefulStop() 230 } 231 } 232 233 func (p *branchControlReplication) UpdateBranchControlContents(ctx context.Context, contents []byte, rsc *doltdb.ReplicationStatusController) { 234 p.mu.Lock() 235 defer p.mu.Unlock() 236 p.current = contents 237 p.version += 1 238 239 var j int 240 if rsc != nil { 241 j = len(rsc.Wait) 242 rsc.Wait = append(rsc.Wait, make([]func(ctx context.Context) error, len(p.replicas))...) 243 rsc.NotifyWaitFailed = append(rsc.NotifyWaitFailed, make([]func(), len(p.replicas))...) 244 } 245 for i, r := range p.replicas { 246 w := r.UpdateContents(p.current, p.version) 247 if rsc != nil { 248 rsc.Wait[i+j] = w 249 rsc.NotifyWaitFailed[i+j] = func() {} 250 } 251 } 252 } 253 254 func (p *branchControlReplication) waitForReplication(timeout time.Duration) ([]graceTransitionResult, error) { 255 p.mu.Lock() 256 replicas := make([]*branchControlReplica, len(p.replicas)) 257 copy(replicas, p.replicas) 258 res := make([]graceTransitionResult, len(replicas)) 259 for i := range res { 260 res[i].database = "dolt_branch_control" 261 res[i].remote = replicas[i].client.remote 262 res[i].remoteUrl = replicas[i].client.httpUrl() 263 } 264 var wg sync.WaitGroup 265 wg.Add(len(replicas)) 266 for li, lr := range replicas { 267 i := li 268 r := lr 269 ok := r.setWaitNotify(func() { 270 // called with r.mu locked. 271 if !res[i].caughtUp { 272 if r.isCaughtUp() { 273 res[i].caughtUp = true 274 wg.Done() 275 } else { 276 } 277 } 278 }) 279 if !ok { 280 for j := li - 1; j >= 0; j-- { 281 replicas[j].setWaitNotify(nil) 282 } 283 return nil, errors.New("cluster: dolt_branch_control replication: could not wait for replication. Concurrent waiters conflicted with each other.") 284 } 285 } 286 p.mu.Unlock() 287 288 done := make(chan struct{}) 289 go func() { 290 wg.Wait() 291 close(done) 292 }() 293 select { 294 case <-done: 295 case <-time.After(timeout): 296 } 297 298 p.mu.Lock() 299 defer p.mu.Unlock() 300 for _, r := range replicas { 301 r.setWaitNotify(nil) 302 } 303 304 // Make certain we don't leak the wg.Wait goroutine in the failure case. 305 // At this point, none of the callbacks will ever be called again and 306 // ch.setWaitNotify grabs a lock and so establishes the happens before. 307 for _, b := range res { 308 if !b.caughtUp { 309 wg.Done() 310 } 311 } 312 <-done 313 314 return res, nil 315 }