github.com/m3db/m3@v1.5.0/src/dbnode/storage/index_insert_queue.go

// Copyright (c) 2018 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package storage

import (
	"errors"
	"strconv"
	"sync"
	"time"

	"github.com/m3db/m3/src/dbnode/namespace"
	"github.com/m3db/m3/src/dbnode/storage/index"
	"github.com/m3db/m3/src/dbnode/ts/writes"
	"github.com/m3db/m3/src/x/clock"
	xsync "github.com/m3db/m3/src/x/sync"

	"github.com/uber-go/tally"
)

var (
	errIndexInsertQueueNotOpen             = errors.New("index insert queue is not open")
	errIndexInsertQueueAlreadyOpenOrClosed = errors.New("index insert queue already open or is closed")
)

type nsIndexInsertQueueState int

const (
	nsIndexInsertQueueStateNotOpen nsIndexInsertQueueState = iota
	nsIndexInsertQueueStateOpen
	nsIndexInsertQueueStateClosed

	// TODO(prateek): runtime options for this stuff
	defaultIndexBatchBackoff = 2 * time.Millisecond

	indexResetAllInsertsEvery = 3 * time.Minute
)

type nsIndexInsertQueue struct {
	sync.RWMutex

	namespaceMetadata namespace.Metadata

	state nsIndexInsertQueueState

	// rate limits
	indexBatchBackoff time.Duration

	// active batch pending execution
	currBatch *nsIndexInsertBatch

	indexBatchFn nsIndexInsertBatchFn
	nowFn        clock.NowFn
	sleepFn      func(time.Duration)
	coreFn       xsync.CoreFn
	notifyInsert chan struct{}
	closeCh      chan struct{}

	scope tally.Scope

	metrics nsIndexInsertQueueMetrics
}

type newNamespaceIndexInsertQueueFn func(
	nsIndexInsertBatchFn, namespace.Metadata, clock.NowFn, xsync.CoreFn, tally.Scope) namespaceIndexInsertQueue

// newNamespaceIndexInsertQueue returns a new index insert queue.
// Note: there is no limit on the index insert queue since any item making
// it into the index insert queue must first pass through the shard insert
// queue, which has its own limits in place.
// Any error returned from this queue would cause the series to not be indexed,
// and there is no way to return such an error to the client over the network
// (unlike the shard insert queue, where an error is returned all the way back
// to the DB node client).
// FOLLOWUP(prateek): subsequent PR to wire up rate limiting to runtime.Options
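//
// A rough usage sketch (illustrative only; construction of the write batch
// and namespace metadata is elided):
//
//	q := newNamespaceIndexInsertQueue(indexBatchFn, md, nowFn, coreFn, scope)
//	if err := q.Start(); err != nil { /* handle error */ }
//	wg, err := q.InsertBatch(writeBatch) // enqueue; wg.Wait() blocks until
//	                                     // the batch has been handed to indexBatchFn
//	defer q.Stop()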
func newNamespaceIndexInsertQueue(
	indexBatchFn nsIndexInsertBatchFn,
	namespaceMetadata namespace.Metadata,
	nowFn clock.NowFn,
	coreFn xsync.CoreFn,
	scope tally.Scope,
) namespaceIndexInsertQueue {
	subscope := scope.SubScope("insert-queue")
	q := &nsIndexInsertQueue{
		namespaceMetadata: namespaceMetadata,
		indexBatchBackoff: defaultIndexBatchBackoff,
		indexBatchFn:      indexBatchFn,
		nowFn:             nowFn,
		sleepFn:           time.Sleep,
		coreFn:            coreFn,
		// NB(r): Use 2 * num cores so that each per-CPU-core insert queue
		// can always enqueue a notification without it being lost.
		notifyInsert: make(chan struct{}, 2*xsync.NumCores()),
		closeCh:      make(chan struct{}, 1),
		scope:        subscope,
		metrics:      newNamespaceIndexInsertQueueMetrics(subscope),
	}
	q.currBatch = q.newBatch(newBatchOptions{instrumented: true})
	return q
}

type newBatchOptions struct {
	instrumented bool
}

func (q *nsIndexInsertQueue) newBatch(opts newBatchOptions) *nsIndexInsertBatch {
	scope := tally.NoopScope
	if opts.instrumented {
		scope = q.scope
	}
	return newNsIndexInsertBatch(q.namespaceMetadata, q.nowFn, scope)
}

func (q *nsIndexInsertQueue) insertLoop() {
	defer func() {
		close(q.closeCh)
	}()

	var lastInsert time.Time
	batch := q.newBatch(newBatchOptions{})
	for range q.notifyInsert {
		// Check if inserting too fast
		elapsedSinceLastInsert := q.nowFn().Sub(lastInsert)

		// Rotate batches
		var (
			state   nsIndexInsertQueueState
			backoff time.Duration
		)
		q.Lock()
		state = q.state
		if elapsedSinceLastInsert < q.indexBatchBackoff {
			// Need to back off before rotating and inserting
			backoff = q.indexBatchBackoff - elapsedSinceLastInsert
		}
		q.Unlock()

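		// For example, with the default 2ms backoff, if the previous batch
		// finished only 0.5ms ago the loop sleeps ~1.5ms before rotating,
		// letting concurrent writers accumulate into the current batch
		// rather than rotating once per insert.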
		if backoff > 0 {
			q.sleepFn(backoff)
		}

		// Rotate after backoff
		batchWg := q.currBatch.Rotate(batch)

		all := batch.AllInserts()
		if all.Len() > 0 {
			q.indexBatchFn(all)
		}

		batchWg.Done()

		lastInsert = q.nowFn()

		if state != nsIndexInsertQueueStateOpen {
			return // Break if the queue is closed
		}
	}
}

func (q *nsIndexInsertQueue) InsertBatch(
	batch *index.WriteBatch,
) (*sync.WaitGroup, error) {
	batchLen := batch.Len()

	// Choose the queue relevant to the current CPU index.
	// Note: since insertsByCPUCore is allocated when nsIndexInsertBatch is
	// constructed and never modified afterwards, it is safe to read
	// concurrently (but, obviously, not to modify).
	inserts := q.currBatch.insertsByCPUCore[q.coreFn()]
	inserts.Lock()
	firstInsert := len(inserts.shardInserts) == 0
	inserts.shardInserts = append(inserts.shardInserts, batch)
	wg := inserts.wg
	inserts.Unlock()

	// Notify the insert loop; this is only required for the first insert
	// for this CPU core.
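	// The insert loop drains every per-core slice when it rotates the batch,
	// so a single notification per core per batch cycle is sufficient.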
	if firstInsert {
		select {
		case q.notifyInsert <- struct{}{}:
		default:
			// Loop busy, already ready to consume notification.
		}
	}

	q.metrics.numPending.Inc(int64(batchLen))
	return wg, nil
}

func (q *nsIndexInsertQueue) InsertPending(
	pending []writes.PendingIndexInsert,
) (*sync.WaitGroup, error) {
	batchLen := len(pending)

	// Choose the queue relevant to the current CPU index.
	// Note: since insertsByCPUCore is allocated when nsIndexInsertBatch is
	// constructed and never modified afterwards, it is safe to read
	// concurrently (but, obviously, not to modify).
	inserts := q.currBatch.insertsByCPUCore[q.coreFn()]
	inserts.Lock()
	firstInsert := len(inserts.batchInserts) == 0
	inserts.batchInserts = append(inserts.batchInserts, pending...)
	wg := inserts.wg
	inserts.Unlock()

	// Notify the insert loop; this is only required for the first insert
	// for this CPU core.
	if firstInsert {
		select {
		case q.notifyInsert <- struct{}{}:
		default:
			// Loop busy, already ready to consume notification.
		}
	}

	q.metrics.numPending.Inc(int64(batchLen))
	return wg, nil
}

func (q *nsIndexInsertQueue) Start() error {
	q.Lock()
	defer q.Unlock()

	if q.state != nsIndexInsertQueueStateNotOpen {
		return errIndexInsertQueueAlreadyOpenOrClosed
	}

	q.state = nsIndexInsertQueueStateOpen
	go q.insertLoop()
	return nil
}

func (q *nsIndexInsertQueue) Stop() error {
	q.Lock()

	if q.state != nsIndexInsertQueueStateOpen {
		q.Unlock()
		return errIndexInsertQueueNotOpen
	}

	q.state = nsIndexInsertQueueStateClosed
	q.Unlock()

	// Final flush
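	// Nudge the insert loop one last time so any already enqueued inserts are
	// rotated and indexed; the loop then observes the closed state, returns,
	// and closes closeCh.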
	select {
	case q.notifyInsert <- struct{}{}:
	default:
		// Loop busy, already ready to consume notification
	}

	// Wait until the insert loop goroutine is done.
	<-q.closeCh

	return nil
}

type nsIndexInsertBatchFn func(inserts *index.WriteBatch)

type nsIndexInsertBatch struct {
	namespace namespace.Metadata
	nowFn     clock.NowFn
	wg        *sync.WaitGroup
	// Note: since insertsByCPUCore is allocated when nsIndexInsertBatch is
	// constructed and never modified afterwards, it is safe to read
	// concurrently (but, obviously, not to modify).
	insertsByCPUCore    []*nsIndexInsertsByCPUCore
	allInserts          *index.WriteBatch
	allInsertsLastReset time.Time
}

type nsIndexInsertsByCPUCore struct {
	sync.Mutex
	shardInserts []*index.WriteBatch
	batchInserts []writes.PendingIndexInsert
	wg           *sync.WaitGroup
	metrics      nsIndexInsertsByCPUCoreMetrics
}

type nsIndexInsertsByCPUCoreMetrics struct {
	rotateInsertsShard   tally.Counter
	rotateInsertsPending tally.Counter
}

func newNamespaceIndexInsertsByCPUCoreMetrics(
	cpuIndex int,
	scope tally.Scope,
) nsIndexInsertsByCPUCoreMetrics {
	scope = scope.Tagged(map[string]string{
		"cpu-index": strconv.Itoa(cpuIndex),
	})

	const rotate = "rotate-inserts"
	return nsIndexInsertsByCPUCoreMetrics{
		rotateInsertsShard: scope.Tagged(map[string]string{
			"rotate-type": "shard-insert",
		}).Counter(rotate),
		rotateInsertsPending: scope.Tagged(map[string]string{
			"rotate-type": "pending-insert",
		}).Counter(rotate),
	}
}

func newNsIndexInsertBatch(
	namespace namespace.Metadata,
	nowFn clock.NowFn,
	scope tally.Scope,
) *nsIndexInsertBatch {
	b := &nsIndexInsertBatch{
		namespace: namespace,
		nowFn:     nowFn,
	}
	numCores := xsync.NumCores()
	for i := 0; i < numCores; i++ {
		b.insertsByCPUCore = append(b.insertsByCPUCore, &nsIndexInsertsByCPUCore{
			metrics: newNamespaceIndexInsertsByCPUCoreMetrics(i, scope),
		})
	}

	b.allocateAllInserts()
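	// Rotating with a nil target simply installs the initial wait group on
	// each per-core slot before the batch is first used.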
	b.Rotate(nil)
	return b
}

func (b *nsIndexInsertBatch) allocateAllInserts() {
	b.allInserts = index.NewWriteBatch(index.WriteBatchOptions{
		IndexBlockSize: b.namespace.Options().IndexOptions().BlockSize(),
	})
	b.allInsertsLastReset = b.nowFn()
}

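// AllInserts combines the per-core shard and pending inserts into the shared
// allInserts write batch. It resets and reuses that batch, so it is intended
// to be called only from the single insert loop goroutine.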
func (b *nsIndexInsertBatch) AllInserts() *index.WriteBatch {
	b.allInserts.Reset()
	for _, inserts := range b.insertsByCPUCore {
		inserts.Lock()
		for _, shardInserts := range inserts.shardInserts {
			b.allInserts.AppendAll(shardInserts)
		}
		for _, insert := range inserts.batchInserts {
			b.allInserts.Append(insert.Entry, insert.Document)
		}
		inserts.Unlock()
	}
	return b.allInserts
}

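// Rotate moves the accumulated per-core inserts into target (when non-nil),
// installs a fresh wait group for subsequent writers, and returns the previous
// wait group; the caller signals Done on it once the rotated inserts have been
// indexed.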
func (b *nsIndexInsertBatch) Rotate(target *nsIndexInsertBatch) *sync.WaitGroup {
	prevWg := b.wg

	// We always expect to be waiting for an index insert.
	b.wg = &sync.WaitGroup{}
	b.wg.Add(1)

	// Rotate to target if we need to.
	for idx, inserts := range b.insertsByCPUCore {
		if target == nil {
			// No target to rotate with.
			inserts.Lock()
			// Reset
			inserts.shardInserts = inserts.shardInserts[:0]
			inserts.batchInserts = inserts.batchInserts[:0]
			// Use new wait group.
			inserts.wg = b.wg
			inserts.Unlock()
			continue
		}

		// First prepare the target to take the current batch's inserts.
		targetInserts := target.insertsByCPUCore[idx]
		targetInserts.Lock()

		// Reset the target inserts since we'll take a ref to them in a second.
		for i := range targetInserts.shardInserts {
			// TODO(prateek): if we start pooling `[]index.WriteBatchEntry`, then we could return to the pool here.
			targetInserts.shardInserts[i] = nil
		}
		prevTargetShardInserts := targetInserts.shardInserts[:0]

		// memset optimization
		var zero writes.PendingIndexInsert
		for i := range targetInserts.batchInserts {
			targetInserts.batchInserts[i] = zero
		}
		prevTargetBatchInserts := targetInserts.batchInserts[:0]

		// Lock the current batch inserts now ready to rotate to the target.
		inserts.Lock()

		// Update current slice refs to take target's inserts.
		targetInserts.shardInserts = inserts.shardInserts
		targetInserts.batchInserts = inserts.batchInserts
		targetInserts.wg = inserts.wg

		// Reuse the target's old slices.
		inserts.shardInserts = prevTargetShardInserts
		inserts.batchInserts = prevTargetBatchInserts

		// Use new wait group.
		inserts.wg = b.wg

		// Unlock as early as possible for writes to keep enqueuing.
		inserts.Unlock()

		numTargetInsertsShard := len(targetInserts.shardInserts)
		numTargetInsertsPending := len(targetInserts.batchInserts)

		// Now can unlock target inserts too.
		targetInserts.Unlock()

		if n := numTargetInsertsShard; n > 0 {
			inserts.metrics.rotateInsertsShard.Inc(int64(n))
		}
		if n := numTargetInsertsPending; n > 0 {
			inserts.metrics.rotateInsertsPending.Inc(int64(n))
		}
	}

	if b.nowFn().Sub(b.allInsertsLastReset) > indexResetAllInsertsEvery {
		// NB(r): Sometimes this can grow very high, so we reset it relatively frequently.
		b.allocateAllInserts()
	}

	return prevWg
}

type nsIndexInsertQueueMetrics struct {
	numPending tally.Counter
}

func newNamespaceIndexInsertQueueMetrics(
	scope tally.Scope,
) nsIndexInsertQueueMetrics {
	subScope := scope.SubScope("index-queue")
	return nsIndexInsertQueueMetrics{
		numPending: subScope.Counter("num-pending"),
	}
}