github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/sqle/cluster/branch_control_replica.go (about)

     1  // Copyright 2023 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package cluster
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"fmt"
    21  	"sync"
    22  	"time"
    23  
    24  	"github.com/cenkalti/backoff/v4"
    25  	"github.com/sirupsen/logrus"
    26  
    27  	replicationapi "github.com/dolthub/dolt/go/gen/proto/dolt/services/replicationapi/v1alpha1"
    28  	"github.com/dolthub/dolt/go/libraries/doltcore/branch_control"
    29  	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb"
    30  )
    31  
// branchControlReplication fans branch control updates out to a set of
// per-remote replicas and tracks the most recently published contents.
type branchControlReplication struct {
	// current holds the contents most recently passed to
	// UpdateBranchControlContents.
	current  []byte
	// version is incremented on every call to UpdateBranchControlContents
	// and is handed to each replica alongside the contents.
	version  uint32
	// replicas are the individual replication targets driven by Run.
	replicas []*branchControlReplica

	// bcController supplies the currently serialized branch control state,
	// which is loaded when this instance is told it is the primary.
	bcController *branch_control.Controller

	// mu is held by UpdateBranchControlContents and waitForReplication.
	mu sync.Mutex
}
    41  
// branchControlReplica holds the state needed to replicate branch control
// contents to a single remote. The mutable fields are read and written with
// mu held by the methods in this file.
type branchControlReplica struct {
	// shutdown is set by GracefulStop and causes the Run loop to exit.
	shutdown bool
	// role is set by setRole; Run only attempts replication while it is
	// RolePrimary.
	role     Role

	// contents and version are the latest payload handed to UpdateContents.
	contents          []byte
	version           uint32
	// replicatedVersion is the version of the last payload that Run sent
	// without error.
	replicatedVersion uint32

	// backoff produces the delay applied after a failed attempt, and
	// nextAttempt is the earliest time Run will try again. A zero
	// nextAttempt means no delay is pending.
	backoff     backoff.BackOff
	nextAttempt time.Time

	// client provides the remote's name and the RPC client used by Run.
	client *replicationServiceClient
	lgr    *logrus.Entry

	// waitNotify, when non-nil, is invoked with mu held each time the Run
	// loop is about to block in wait. It is installed through setWaitNotify.
	waitNotify func()

	// progressNotifier is told about the start and outcome of each
	// replication attempt and supplies the wait function returned from
	// UpdateContents.
	progressNotifier        ProgressNotifier
	// fastFailReplicationWait makes UpdateContents return a function that
	// fails immediately instead of waiting. It is set after a wait fails
	// with doltdb.ErrReplicationWaitFailed and cleared on a successful
	// replication or a role change.
	fastFailReplicationWait bool

	mu   sync.Mutex
	// NOTE(review): cond is constructed outside this view; wait() calls
	// cond.Wait with mu held, so it is presumably built on &mu — confirm.
	cond *sync.Cond
}
    64  
    65  func (r *branchControlReplica) UpdateContents(contents []byte, version uint32) func(context.Context) error {
    66  	r.mu.Lock()
    67  	defer r.mu.Unlock()
    68  	r.contents = contents
    69  	r.version = version
    70  	r.nextAttempt = time.Time{}
    71  	r.backoff.Reset()
    72  	r.cond.Broadcast()
    73  	if r.fastFailReplicationWait {
    74  		remote := r.client.remote
    75  		return func(ctx context.Context) error {
    76  			return fmt.Errorf("circuit breaker for replication to %s/dolt_branch_control is open. this branch control update did not necessarily replicate successfully.", remote)
    77  		}
    78  	}
    79  	w := r.progressNotifier.Wait()
    80  	return func(ctx context.Context) error {
    81  		err := w(ctx)
    82  		if err != nil && errors.Is(err, doltdb.ErrReplicationWaitFailed) {
    83  			r.setFastFailReplicationWait(true)
    84  		}
    85  		return err
    86  	}
    87  }
    88  
    89  func (r *branchControlReplica) Run() {
    90  	r.mu.Lock()
    91  	defer r.mu.Unlock()
    92  	r.lgr.Tracef("branchControlReplica[%s]: running", r.client.remote)
    93  	for !r.shutdown {
    94  		if r.role != RolePrimary {
    95  			r.wait()
    96  			continue
    97  		}
    98  		if r.version == 0 {
    99  			r.wait()
   100  			continue
   101  		}
   102  		if r.replicatedVersion == r.version {
   103  			r.wait()
   104  			continue
   105  		}
   106  		if r.nextAttempt.After(time.Now()) {
   107  			r.wait()
   108  			continue
   109  		}
   110  		// We do not call into the client with the lock held here.
   111  		// Client interceptors could call
   112  		// `controller.setRoleAndEpoch()`, which will call back into
   113  		// this replica with the new role. We need to release this lock
   114  		// in order to avoid deadlock.
   115  		contents := r.contents
   116  		client := r.client.client
   117  		version := r.version
   118  		attempt := r.progressNotifier.BeginAttempt()
   119  		r.mu.Unlock()
   120  		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
   121  		_, err := client.UpdateBranchControl(ctx, &replicationapi.UpdateBranchControlRequest{
   122  			SerializedContents: contents,
   123  		})
   124  		cancel()
   125  		r.mu.Lock()
   126  		if err != nil {
   127  			r.progressNotifier.RecordFailure(attempt)
   128  			r.lgr.Warnf("branchControlReplica[%s]: error replicating branch control permissions. backing off. %v", r.client.remote, err)
   129  			r.nextAttempt = time.Now().Add(r.backoff.NextBackOff())
   130  			next := r.nextAttempt
   131  			go func() {
   132  				<-time.After(time.Until(next))
   133  				r.mu.Lock()
   134  				defer r.mu.Unlock()
   135  				if r.nextAttempt == next {
   136  					r.nextAttempt = time.Time{}
   137  				}
   138  				r.cond.Broadcast()
   139  			}()
   140  			continue
   141  		}
   142  		r.progressNotifier.RecordSuccess(attempt)
   143  		r.fastFailReplicationWait = false
   144  		r.backoff.Reset()
   145  		r.lgr.Debugf("branchControlReplica[%s]: sucessfully replicated branch control permissions.", r.client.remote)
   146  		r.replicatedVersion = version
   147  	}
   148  }
   149  
   150  func (r *branchControlReplica) wait() {
   151  	if r.waitNotify != nil {
   152  		r.waitNotify()
   153  	}
   154  	if r.isCaughtUp() {
   155  		attempt := r.progressNotifier.BeginAttempt()
   156  		r.progressNotifier.RecordSuccess(attempt)
   157  	}
   158  	r.cond.Wait()
   159  }
   160  
   161  func (r *branchControlReplica) isCaughtUp() bool {
   162  	return r.version == r.replicatedVersion || r.role != RolePrimary
   163  }
   164  
   165  func (r *branchControlReplica) setFastFailReplicationWait(v bool) {
   166  	r.mu.Lock()
   167  	defer r.mu.Unlock()
   168  	r.fastFailReplicationWait = v
   169  }
   170  
   171  func (r *branchControlReplica) setWaitNotify(notify func()) bool {
   172  	r.mu.Lock()
   173  	defer r.mu.Unlock()
   174  	if notify != nil {
   175  		if r.waitNotify != nil {
   176  			return false
   177  		}
   178  		notify()
   179  	}
   180  	r.waitNotify = notify
   181  	return true
   182  }
   183  
   184  func (r *branchControlReplica) GracefulStop() {
   185  	r.mu.Lock()
   186  	defer r.mu.Unlock()
   187  	r.shutdown = true
   188  	r.cond.Broadcast()
   189  }
   190  
   191  func (r *branchControlReplica) setRole(role Role) {
   192  	r.mu.Lock()
   193  	defer r.mu.Unlock()
   194  	r.role = role
   195  	r.nextAttempt = time.Time{}
   196  	r.fastFailReplicationWait = false
   197  	r.cond.Broadcast()
   198  }
   199  
   200  func (p *branchControlReplication) setRole(role Role) {
   201  	if role == RolePrimary {
   202  		cur := p.bcController.Serialized.Load()
   203  		if cur == nil {
   204  			p.UpdateBranchControlContents(context.Background(), []byte{}, nil)
   205  		} else {
   206  			p.UpdateBranchControlContents(context.Background(), *cur, nil)
   207  		}
   208  	}
   209  	for _, r := range p.replicas {
   210  		r.setRole(role)
   211  	}
   212  }
   213  
   214  func (p *branchControlReplication) Run() {
   215  	var wg sync.WaitGroup
   216  	for _, r := range p.replicas {
   217  		r := r
   218  		wg.Add(1)
   219  		go func() {
   220  			defer wg.Done()
   221  			r.Run()
   222  		}()
   223  	}
   224  	wg.Wait()
   225  }
   226  
   227  func (p *branchControlReplication) GracefulStop() {
   228  	for _, r := range p.replicas {
   229  		r.GracefulStop()
   230  	}
   231  }
   232  
   233  func (p *branchControlReplication) UpdateBranchControlContents(ctx context.Context, contents []byte, rsc *doltdb.ReplicationStatusController) {
   234  	p.mu.Lock()
   235  	defer p.mu.Unlock()
   236  	p.current = contents
   237  	p.version += 1
   238  
   239  	var j int
   240  	if rsc != nil {
   241  		j = len(rsc.Wait)
   242  		rsc.Wait = append(rsc.Wait, make([]func(ctx context.Context) error, len(p.replicas))...)
   243  		rsc.NotifyWaitFailed = append(rsc.NotifyWaitFailed, make([]func(), len(p.replicas))...)
   244  	}
   245  	for i, r := range p.replicas {
   246  		w := r.UpdateContents(p.current, p.version)
   247  		if rsc != nil {
   248  			rsc.Wait[i+j] = w
   249  			rsc.NotifyWaitFailed[i+j] = func() {}
   250  		}
   251  	}
   252  }
   253  
   254  func (p *branchControlReplication) waitForReplication(timeout time.Duration) ([]graceTransitionResult, error) {
   255  	p.mu.Lock()
   256  	replicas := make([]*branchControlReplica, len(p.replicas))
   257  	copy(replicas, p.replicas)
   258  	res := make([]graceTransitionResult, len(replicas))
   259  	for i := range res {
   260  		res[i].database = "dolt_branch_control"
   261  		res[i].remote = replicas[i].client.remote
   262  		res[i].remoteUrl = replicas[i].client.httpUrl()
   263  	}
   264  	var wg sync.WaitGroup
   265  	wg.Add(len(replicas))
   266  	for li, lr := range replicas {
   267  		i := li
   268  		r := lr
   269  		ok := r.setWaitNotify(func() {
   270  			// called with r.mu locked.
   271  			if !res[i].caughtUp {
   272  				if r.isCaughtUp() {
   273  					res[i].caughtUp = true
   274  					wg.Done()
   275  				} else {
   276  				}
   277  			}
   278  		})
   279  		if !ok {
   280  			for j := li - 1; j >= 0; j-- {
   281  				replicas[j].setWaitNotify(nil)
   282  			}
   283  			return nil, errors.New("cluster: dolt_branch_control replication: could not wait for replication. Concurrent waiters conflicted with each other.")
   284  		}
   285  	}
   286  	p.mu.Unlock()
   287  
   288  	done := make(chan struct{})
   289  	go func() {
   290  		wg.Wait()
   291  		close(done)
   292  	}()
   293  	select {
   294  	case <-done:
   295  	case <-time.After(timeout):
   296  	}
   297  
   298  	p.mu.Lock()
   299  	defer p.mu.Unlock()
   300  	for _, r := range replicas {
   301  		r.setWaitNotify(nil)
   302  	}
   303  
   304  	// Make certain we don't leak the wg.Wait goroutine in the failure case.
   305  	// At this point, none of the callbacks will ever be called again and
   306  	// ch.setWaitNotify grabs a lock and so establishes the happens before.
   307  	for _, b := range res {
   308  		if !b.caughtUp {
   309  			wg.Done()
   310  		}
   311  	}
   312  	<-done
   313  
   314  	return res, nil
   315  }