github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/scheduler.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"container/list"
    15  	"context"
    16  	"fmt"
    17  	"sync"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    20  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    21  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    22  )
    23  
// rangeIDChunkSize is the number of range IDs that a single rangeIDChunk can
// hold; it is the length of the chunk's backing array.
const rangeIDChunkSize = 1000
    25  
    26  type rangeIDChunk struct {
    27  	// Valid contents are buf[rd:wr], read at buf[rd], write at buf[wr].
    28  	buf    [rangeIDChunkSize]roachpb.RangeID
    29  	rd, wr int
    30  }
    31  
    32  func (c *rangeIDChunk) PushBack(id roachpb.RangeID) bool {
    33  	if c.WriteCap() == 0 {
    34  		return false
    35  	}
    36  	c.buf[c.wr] = id
    37  	c.wr++
    38  	return true
    39  }
    40  
    41  func (c *rangeIDChunk) PopFront() (roachpb.RangeID, bool) {
    42  	if c.Len() == 0 {
    43  		return 0, false
    44  	}
    45  	id := c.buf[c.rd]
    46  	c.rd++
    47  	return id, true
    48  }
    49  
    50  func (c *rangeIDChunk) WriteCap() int {
    51  	return len(c.buf) - c.wr
    52  }
    53  
    54  func (c *rangeIDChunk) Len() int {
    55  	return c.wr - c.rd
    56  }
    57  
// rangeIDQueue is a chunked queue of range IDs. Instead of a separate list
// element for every range ID, it uses a rangeIDChunk to hold many range IDs,
// amortizing the allocation/GC cost. Using a chunk queue avoids any copying
// that would occur if a slice were used (the copying would occur on slice
// reallocation).
//
// chunks is a list whose element values are *rangeIDChunk; IDs are pushed
// onto the chunk at the back of the list and popped from the chunk at the
// front. len is the total number of range IDs currently held across all
// chunks.
type rangeIDQueue struct {
	chunks list.List
	len    int
}
    67  
    68  func (q *rangeIDQueue) PushBack(id roachpb.RangeID) {
    69  	if q.chunks.Len() == 0 || q.back().WriteCap() == 0 {
    70  		q.chunks.PushBack(&rangeIDChunk{})
    71  	}
    72  	q.len++
    73  	if !q.back().PushBack(id) {
    74  		panic(fmt.Sprintf(
    75  			"unable to push rangeID to chunk: len=%d, cap=%d",
    76  			q.back().Len(), q.back().WriteCap()))
    77  	}
    78  }
    79  
    80  func (q *rangeIDQueue) PopFront() (roachpb.RangeID, bool) {
    81  	if q.len == 0 {
    82  		return 0, false
    83  	}
    84  	frontElem := q.chunks.Front()
    85  	front := frontElem.Value.(*rangeIDChunk)
    86  	id, ok := front.PopFront()
    87  	if !ok {
    88  		panic("encountered empty chunk")
    89  	}
    90  	q.len--
    91  	if front.Len() == 0 && front.WriteCap() == 0 {
    92  		q.chunks.Remove(frontElem)
    93  	}
    94  	return id, true
    95  }
    96  
    97  func (q *rangeIDQueue) Len() int {
    98  	return q.len
    99  }
   100  
   101  func (q *rangeIDQueue) back() *rangeIDChunk {
   102  	return q.chunks.Back().Value.(*rangeIDChunk)
   103  }
   104  
// raftProcessor is the set of per-range callbacks that a raftScheduler worker
// invokes after dequeuing a range ID. Each method receives the ID of the
// range to operate on.
type raftProcessor interface {
	// Process a raft.Ready struct containing entries and messages that are
	// ready to read, be saved to stable storage, committed, or sent to other
	// peers.
	processReady(context.Context, roachpb.RangeID)
	// Process all queued messages for the specified range.
	// Return true if the range should be queued for ready processing.
	processRequestQueue(context.Context, roachpb.RangeID) bool
	// Process a raft tick for the specified range.
	// Return true if the range should be queued for ready processing.
	processTick(context.Context, roachpb.RangeID) bool
}
   117  
// raftScheduleState is a bit set describing the scheduling state of a single
// range ID within a raftScheduler. The flags below occupy distinct bits and
// are combined with bitwise OR.
type raftScheduleState int

const (
	// stateQueued marks a range ID that is already tracked by the scheduler
	// (waiting in the queue or being processed by a worker), so that it is
	// not added to the queue a second time.
	stateQueued raftScheduleState = 1 << iota
	// stateRaftReady requests a processReady call for the range.
	stateRaftReady
	// stateRaftRequest requests a processRequestQueue call for the range.
	stateRaftRequest
	// stateRaftTick requests a processTick call for the range.
	stateRaftTick
)
   126  
// raftScheduler hands per-range raft work (request queue processing, ticks
// and ready processing) to a fixed number of workers.
//
// processor receives the callbacks for each dequeued range ID and numWorkers
// is the number of workers launched by Start. The fields inside mu are
// accessed with mu.Mutex held: cond (built on that same mutex) is used to
// wake idle workers, queue holds the range IDs awaiting processing, state
// records the pending raftScheduleState flags of every tracked range ID, and
// stopped is set once the stopper passed to Start signals shutdown. done is
// the wait group that Wait uses to block until every worker has returned.
type raftScheduler struct {
	processor  raftProcessor
	numWorkers int

	mu struct {
		syncutil.Mutex
		cond    *sync.Cond
		queue   rangeIDQueue
		state   map[roachpb.RangeID]raftScheduleState
		stopped bool
	}

	done sync.WaitGroup
}
   141  
   142  func newRaftScheduler(
   143  	metrics *StoreMetrics, processor raftProcessor, numWorkers int,
   144  ) *raftScheduler {
   145  	s := &raftScheduler{
   146  		processor:  processor,
   147  		numWorkers: numWorkers,
   148  	}
   149  	s.mu.cond = sync.NewCond(&s.mu.Mutex)
   150  	s.mu.state = make(map[roachpb.RangeID]raftScheduleState)
   151  	return s
   152  }
   153  
   154  func (s *raftScheduler) Start(ctx context.Context, stopper *stop.Stopper) {
   155  	stopper.RunWorker(ctx, func(ctx context.Context) {
   156  		<-stopper.ShouldStop()
   157  		s.mu.Lock()
   158  		s.mu.stopped = true
   159  		s.mu.Unlock()
   160  		s.mu.cond.Broadcast()
   161  	})
   162  
   163  	s.done.Add(s.numWorkers)
   164  	for i := 0; i < s.numWorkers; i++ {
   165  		stopper.RunWorker(ctx, func(ctx context.Context) {
   166  			s.worker(ctx)
   167  		})
   168  	}
   169  }
   170  
   171  func (s *raftScheduler) Wait(context.Context) {
   172  	s.done.Wait()
   173  }
   174  
   175  func (s *raftScheduler) worker(ctx context.Context) {
   176  	defer s.done.Done()
   177  
   178  	// We use a sync.Cond for worker notification instead of a buffered
   179  	// channel. Buffered channels have internal overhead for maintaining the
   180  	// buffer even when the elements are empty. And the buffer isn't necessary as
   181  	// the raftScheduler work is already buffered on the internal queue. Lastly,
   182  	// signaling a sync.Cond is significantly faster than selecting and sending
   183  	// on a buffered channel.
   184  
   185  	s.mu.Lock()
   186  	for {
   187  		var id roachpb.RangeID
   188  		for {
   189  			if s.mu.stopped {
   190  				s.mu.Unlock()
   191  				return
   192  			}
   193  			var ok bool
   194  			if id, ok = s.mu.queue.PopFront(); ok {
   195  				break
   196  			}
   197  			s.mu.cond.Wait()
   198  		}
   199  
   200  		// Grab and clear the existing state for the range ID. Note that we leave
   201  		// the range ID marked as "queued" so that a concurrent Enqueue* will not
   202  		// queue the range ID again.
   203  		state := s.mu.state[id]
   204  		s.mu.state[id] = stateQueued
   205  		s.mu.Unlock()
   206  
   207  		// Process requests first. This avoids a scenario where a tick and a
   208  		// "quiesce" message are processed in the same iteration and intervening
   209  		// raft ready processing unquiesces the replica because the tick triggers
   210  		// an election.
   211  		if state&stateRaftRequest != 0 {
   212  			// processRequestQueue returns true if the range should perform ready
   213  			// processing. Do not reorder this below the call to processReady.
   214  			if s.processor.processRequestQueue(ctx, id) {
   215  				state |= stateRaftReady
   216  			}
   217  		}
   218  		if state&stateRaftTick != 0 {
   219  			// processRaftTick returns true if the range should perform ready
   220  			// processing. Do not reorder this below the call to processReady.
   221  			if s.processor.processTick(ctx, id) {
   222  				state |= stateRaftReady
   223  			}
   224  		}
   225  		if state&stateRaftReady != 0 {
   226  			s.processor.processReady(ctx, id)
   227  		}
   228  
   229  		s.mu.Lock()
   230  		state = s.mu.state[id]
   231  		if state == stateQueued {
   232  			// No further processing required by the range ID, clear it from the
   233  			// state map.
   234  			delete(s.mu.state, id)
   235  		} else {
   236  			// There was a concurrent call to one of the Enqueue* methods. Queue the
   237  			// range ID for further processing.
   238  			s.mu.queue.PushBack(id)
   239  			s.mu.cond.Signal()
   240  		}
   241  	}
   242  }
   243  
   244  func (s *raftScheduler) enqueue1Locked(addState raftScheduleState, id roachpb.RangeID) int {
   245  	prevState := s.mu.state[id]
   246  	if prevState&addState == addState {
   247  		return 0
   248  	}
   249  	var queued int
   250  	newState := prevState | addState
   251  	if newState&stateQueued == 0 {
   252  		newState |= stateQueued
   253  		queued++
   254  		s.mu.queue.PushBack(id)
   255  	}
   256  	s.mu.state[id] = newState
   257  	return queued
   258  }
   259  
   260  func (s *raftScheduler) enqueue1(addState raftScheduleState, id roachpb.RangeID) int {
   261  	s.mu.Lock()
   262  	count := s.enqueue1Locked(addState, id)
   263  	s.mu.Unlock()
   264  	return count
   265  }
   266  
   267  func (s *raftScheduler) enqueueN(addState raftScheduleState, ids ...roachpb.RangeID) int {
   268  	// Enqueue the ids in chunks to avoid hold raftScheduler.mu for too long.
   269  	const enqueueChunkSize = 128
   270  
   271  	var count int
   272  	s.mu.Lock()
   273  	for i, id := range ids {
   274  		count += s.enqueue1Locked(addState, id)
   275  		if (i+1)%enqueueChunkSize == 0 {
   276  			s.mu.Unlock()
   277  			s.mu.Lock()
   278  		}
   279  	}
   280  	s.mu.Unlock()
   281  	return count
   282  }
   283  
   284  func (s *raftScheduler) signal(count int) {
   285  	if count >= s.numWorkers {
   286  		s.mu.cond.Broadcast()
   287  	} else {
   288  		for i := 0; i < count; i++ {
   289  			s.mu.cond.Signal()
   290  		}
   291  	}
   292  }
   293  
   294  func (s *raftScheduler) EnqueueRaftReady(id roachpb.RangeID) {
   295  	s.signal(s.enqueue1(stateRaftReady, id))
   296  }
   297  
   298  func (s *raftScheduler) EnqueueRaftRequest(id roachpb.RangeID) {
   299  	s.signal(s.enqueue1(stateRaftRequest, id))
   300  }
   301  
   302  func (s *raftScheduler) EnqueueRaftTick(ids ...roachpb.RangeID) {
   303  	s.signal(s.enqueueN(stateRaftTick, ids...))
   304  }