github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/block/wired_list.go (about)

     1  // Copyright (c) 2018 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  // The wired list is the primary data structure that is used to support the LRU
    22  // caching policy. It is a global (per-database) structure that is shared
    23  // between all namespaces, shards, and series. It is responsible for determining
    24  // which blocks should be kept "wired" (cached) in memory, and which should be
    25  // closed and fetched again from disk if they need to be retrieved in the future.
    26  //
    27  // The WiredList is basically a specialized LRU, except that it doesn't store the
    28  // data itself, it just keeps track of which data is currently in memory and makes
    29  // decisions about which data to remove from memory. Updating the Wired List is
    30  // asynchronous: callers put an operation to modify the list into a channel and
    31  // a background goroutine pulls from that channel and performs updates to the
    32  // list which may include removing items from memory ("unwiring" blocks).
    33  //
    34  // The WiredList itself does not allocate a per-entry datastructure to keep track
    35  // of what is active and what is not. Instead, it creates a "virtual list" on top
    36  // of the existing blocks that are in memory by manipulating struct-level pointers
    37  // on the DatabaseBlocks which are "owned" by the list. In other words, the
    38  // DatabaseBlocks are scattered among numerous namespaces/shards/series, but they
    39  // exist in virtual sorted order via the prev/next pointers they contain, but
    40  // which are only manipulated by the WiredList.
    41  //
    42  // The WiredList ONLY keeps track of blocks that are read from disk. Blocks that
    43  // are created by rotating recently-written data out of buffers and into new
    44  // DatabaseBlocks are managed by the background ticks of the series. The background
    45  // tick will avoid closing blocks that were read from disk, and a block will never
    46  // be provided to the WiredList if it wasn't read from disk. This prevents tricky
    47  // ownership semantics where both the background tick and the WiredList are
    48  // competing for ownership / trying to close the same blocks.
    49  
    50  package block
    51  
    52  import (
    53  	"errors"
    54  	"sync"
    55  	"sync/atomic"
    56  	"time"
    57  
    58  	"github.com/m3db/m3/src/dbnode/runtime"
    59  	"github.com/m3db/m3/src/x/clock"
    60  	"github.com/m3db/m3/src/x/instrument"
    61  
    62  	"github.com/uber-go/tally"
    63  	"go.uber.org/zap"
    64  )
    65  
const (
	// defaultWiredListEventsChannelSize is the capacity of the updates channel
	// used when WiredListOptions.EventsChannelSize is not a positive value.
	defaultWiredListEventsChannelSize = 65536
	// wiredListSampleGaugesEvery is the number of processed updates between
	// refreshes of the "unwireable" and "limit" gauges.
	wiredListSampleGaugesEvery        = 100
)
    70  
var (
	// errAlreadyStarted is returned by Start when the updates channel already
	// exists, i.e. the wired list is currently running.
	errAlreadyStarted = errors.New("wired list already started")
	// errAlreadyStopped is returned by Stop when there is no updates channel,
	// i.e. the wired list is not currently running.
	errAlreadyStopped = errors.New("wired list already stopped")
)
    75  
// WiredList is a database block wired list.
//
// The list is circular and is threaded through the blocks themselves using
// their next/prev pointers, anchored at the root sentinel. Blocks at the front
// (root.next()) are evicted first; blocks are (re)inserted at the back.
type WiredList struct {
	// mu guards updatesCh and doneCh: Start and Stop take the write lock, the
	// update methods take the read lock while sending on updatesCh.
	mu sync.RWMutex

	// nowFn supplies the current time for the entered-list timestamp and the
	// evicted-after-duration metric.
	nowFn clock.NowFn

	// Max wired blocks, must use atomic store and load to access.
	maxWired int64

	// root is the sentinel node of the list; NewWiredList links it to itself
	// to represent an empty list.
	root          dbBlock
	// length is the number of blocks currently linked into the list; it is
	// incremented by insertAfter and decremented by remove.
	length        int
	// updatesChSize is the buffer capacity used when Start creates updatesCh.
	updatesChSize int
	// updatesCh carries blocks awaiting processing; it is nil whenever the
	// wired list is not running.
	updatesCh     chan DatabaseBlock
	// doneCh is signalled by the background goroutine once updatesCh has been
	// closed and drained.
	doneCh        chan struct{}

	metrics wiredListMetrics
	// iOpts is used when reporting invariant violations.
	iOpts   instrument.Options
}
    94  
// wiredListMetrics groups the metrics emitted by the WiredList.
type wiredListMetrics struct {
	// unwireable is periodically updated with the number of blocks in the list.
	unwireable           tally.Gauge
	// limit is periodically updated with the current max wired blocks value.
	limit                tally.Gauge
	// evicted is incremented each time a block is evicted from the list.
	evicted              tally.Counter
	// pushedBack is incremented when a block already in the list is updated.
	pushedBack           tally.Counter
	// inserted is incremented when a block not yet in the list is added.
	inserted             tally.Counter
	// evictedAfterDuration records how long a block was in the list before
	// it was evicted.
	evictedAfterDuration tally.Timer
}
   103  
   104  func newWiredListMetrics(scope tally.Scope) wiredListMetrics {
   105  	return wiredListMetrics{
   106  		// Keeps track of how many blocks are in the list
   107  		unwireable: scope.Gauge("unwireable"),
   108  		limit:      scope.Gauge("limit"),
   109  		// Incremented when a block is evicted
   110  		evicted: scope.Counter("evicted"),
   111  		// Incremented when a block is "pushed back" in the list, I.E
   112  		// it was already in the list
   113  		pushedBack: scope.Counter("pushed-back"),
   114  		// Incremented when a block is inserted into the list, I.E
   115  		// it wasn't already present
   116  		inserted: scope.Counter("inserted"),
   117  		// Measure how much time blocks spend in the list before being evicted
   118  		evictedAfterDuration: scope.Timer("evicted-after-duration"),
   119  	}
   120  }
   121  
// WiredListOptions is the options struct for the WiredList constructor.
type WiredListOptions struct {
	// RuntimeOptionsManager is the manager the new WiredList registers itself
	// with as a listener for runtime option changes.
	RuntimeOptionsManager runtime.OptionsManager
	// InstrumentOptions provides the metrics scope and is retained for
	// reporting invariant violations.
	InstrumentOptions     instrument.Options
	// ClockOptions provides the NowFn used by the WiredList.
	ClockOptions          clock.Options
	// EventsChannelSize is the buffer size of the updates channel; values
	// less than or equal to zero select defaultWiredListEventsChannelSize.
	EventsChannelSize     int
}
   129  
   130  // NewWiredList returns a new database block wired list.
   131  func NewWiredList(opts WiredListOptions) *WiredList {
   132  	scope := opts.InstrumentOptions.MetricsScope().
   133  		SubScope("wired-list")
   134  	l := &WiredList{
   135  		nowFn:   opts.ClockOptions.NowFn(),
   136  		metrics: newWiredListMetrics(scope),
   137  		iOpts:   opts.InstrumentOptions,
   138  	}
   139  	if opts.EventsChannelSize > 0 {
   140  		l.updatesChSize = opts.EventsChannelSize
   141  	} else {
   142  		l.updatesChSize = defaultWiredListEventsChannelSize
   143  	}
   144  	l.root.setNext(&l.root)
   145  	l.root.setPrev(&l.root)
   146  	opts.RuntimeOptionsManager.RegisterListener(l)
   147  	return l
   148  }
   149  
   150  // SetRuntimeOptions sets the current runtime options to
   151  // be consumed by the wired list
   152  func (l *WiredList) SetRuntimeOptions(value runtime.Options) {
   153  	atomic.StoreInt64(&l.maxWired, int64(value.MaxWiredBlocks()))
   154  }
   155  
   156  // Start starts processing the wired list
   157  func (l *WiredList) Start() error {
   158  	l.mu.Lock()
   159  	defer l.mu.Unlock()
   160  	if l.updatesCh != nil {
   161  		return errAlreadyStarted
   162  	}
   163  
   164  	l.updatesCh = make(chan DatabaseBlock, l.updatesChSize)
   165  	l.doneCh = make(chan struct{}, 1)
   166  	go func() {
   167  		i := 0
   168  		for v := range l.updatesCh {
   169  			l.processUpdateBlock(v)
   170  			if i%wiredListSampleGaugesEvery == 0 {
   171  				l.metrics.unwireable.Update(float64(l.length))
   172  				l.metrics.limit.Update(float64(atomic.LoadInt64(&l.maxWired)))
   173  			}
   174  			i++
   175  		}
   176  		l.doneCh <- struct{}{}
   177  	}()
   178  
   179  	return nil
   180  }
   181  
   182  // Stop stops processing the wired list
   183  func (l *WiredList) Stop() error {
   184  	l.mu.Lock()
   185  	defer l.mu.Unlock()
   186  
   187  	if l.updatesCh == nil {
   188  		return errAlreadyStopped
   189  	}
   190  
   191  	close(l.updatesCh)
   192  	<-l.doneCh
   193  
   194  	l.updatesCh = nil
   195  	close(l.doneCh)
   196  	l.doneCh = nil
   197  
   198  	return nil
   199  }
   200  
   201  // BlockingUpdate places the block into the channel of blocks which are waiting to notify the
   202  // wired list that they were accessed. All updates must be processed through this channel
   203  // to force synchronization.
   204  //
   205  // We use a channel and a background processing goroutine to reduce blocking / lock contention.
   206  func (l *WiredList) BlockingUpdate(v DatabaseBlock) {
   207  	// Fast path, don't use defer (in Go 1.14 this won't matter anymore since
   208  	// defer is basically compile time for simple callsites).
   209  	l.mu.RLock()
   210  	if l.updatesCh == nil {
   211  		l.mu.RUnlock()
   212  		return
   213  	}
   214  	l.updatesCh <- v
   215  	l.mu.RUnlock()
   216  }
   217  
   218  // NonBlockingUpdate will attempt to put the block in the events channel, but will not block
   219  // if the channel is full. Used in cases where a blocking update could trigger deadlock with
   220  // the WiredList itself.
   221  func (l *WiredList) NonBlockingUpdate(v DatabaseBlock) bool {
   222  	l.mu.RLock()
   223  	defer l.mu.RUnlock()
   224  
   225  	if l.updatesCh == nil {
   226  		return false
   227  	}
   228  
   229  	select {
   230  	case l.updatesCh <- v:
   231  		return true
   232  	default:
   233  		return false
   234  	}
   235  }
   236  
   237  // processUpdateBlock inspects a block that has been modified or read recently
   238  // and determines what outcome its state should have on the wired list.
   239  func (l *WiredList) processUpdateBlock(v DatabaseBlock) {
   240  	entry := v.wiredListEntry()
   241  
   242  	// In some cases the WiredList can receive blocks that are closed. This can happen if a block is
   243  	// in the updatesCh (because it was read) but also already in the WiredList, and while its still
   244  	// in the updatesCh, it is evicted from the wired list to make room for some other block that is
   245  	// being processed. The eviction of the block will close it, but the enqueued update is still in
   246  	// the updateCh even though its an update for a closed block. For the same reason, the wired list
   247  	// can receive blocks that were not retrieved from disk because the closed block was returned to
   248  	// a pool and then re-used.
   249  	unwireable := !entry.closed && entry.wasRetrievedFromDisk
   250  
   251  	// If a block is still unwireable then its worth keeping track of in the wired list
   252  	// so we push it back.
   253  	if unwireable {
   254  		l.pushBack(v)
   255  		return
   256  	}
   257  
   258  	// If a block is not unwireable there is no point in keeping track of it in the WiredList,
   259  	// so we remove it or don't add it in the first place. This works because the remove method
   260  	// is a noop for blocks that aren't already in the WiredList and the pushBack method used
   261  	// above is the only way for blocks to be added.
   262  	l.remove(v)
   263  }
   264  
   265  func (l *WiredList) insertAfter(v, at DatabaseBlock) {
   266  	now := l.nowFn()
   267  
   268  	n := at.next()
   269  	at.setNext(v)
   270  	v.setPrev(at)
   271  	v.setNext(n)
   272  	n.setPrev(v)
   273  	l.length++
   274  
   275  	maxWired := int(atomic.LoadInt64(&l.maxWired))
   276  	if maxWired <= 0 {
   277  		// Not enforcing max wired blocks
   278  		return
   279  	}
   280  
   281  	// Try to unwire all blocks possible
   282  	bl := l.root.next()
   283  	for l.length > maxWired && bl != &l.root {
   284  		entry := bl.wiredListEntry()
   285  		if !entry.wasRetrievedFromDisk {
   286  			// This should never happen because processUpdateBlock performs the same
   287  			// check, and a block should never be pooled in-between those steps because
   288  			// the wired list is supposed to have sole ownership over that lifecycle and
   289  			// is single-threaded.
   290  			instrument.EmitAndLogInvariantViolation(l.iOpts, func(l *zap.Logger) {
   291  				l.With(
   292  					zap.Time("blockStart", entry.startTime.ToTime()),
   293  					zap.Bool("closed", entry.closed),
   294  					zap.Bool("wasRetrievedFromDisk", entry.wasRetrievedFromDisk),
   295  				).Error("wired list tried to process a block that was not retrieved from disk")
   296  			})
   297  
   298  		}
   299  
   300  		// Evict the block before closing it so that callers of series.ReadEncoded()
   301  		// don't get errors about trying to read from a closed block.
   302  		if onEvict := bl.OnEvictedFromWiredList(); onEvict != nil {
   303  			if entry.seriesID == nil {
   304  				// Entry should always have a series ID attached
   305  				instrument.EmitAndLogInvariantViolation(l.iOpts, func(l *zap.Logger) {
   306  					l.With(
   307  						zap.Time("blockStart", entry.startTime.ToTime()),
   308  						zap.Bool("closed", entry.closed),
   309  						zap.Bool("wasRetrievedFromDisk", entry.wasRetrievedFromDisk),
   310  					).Error("wired list entry does not have seriesID set")
   311  				})
   312  
   313  			} else {
   314  				onEvict.OnEvictedFromWiredList(entry.seriesID, entry.startTime)
   315  			}
   316  		}
   317  
   318  		// bl.CloseIfFromDisk() will return the block to the pool. In order to avoid
   319  		// races with the pool itself, we capture the value of the next block and
   320  		// remove the block from the wired list before we close it.
   321  		nextBl := bl.next()
   322  		l.remove(bl)
   323  		if wasFromDisk := bl.CloseIfFromDisk(); !wasFromDisk {
   324  			// Should never happen
   325  			instrument.EmitAndLogInvariantViolation(l.iOpts, func(l *zap.Logger) {
   326  				l.With(
   327  					zap.Time("blockStart", entry.startTime.ToTime()),
   328  					zap.Bool("closed", entry.closed),
   329  					zap.Bool("wasRetrievedFromDisk", entry.wasRetrievedFromDisk),
   330  				).Error("wired list tried to close a block that was not from disk")
   331  			})
   332  		}
   333  
   334  		l.metrics.evicted.Inc(1)
   335  
   336  		enteredListAt := time.Unix(0, bl.enteredListAtUnixNano())
   337  		l.metrics.evictedAfterDuration.Record(now.Sub(enteredListAt))
   338  
   339  		bl = nextBl
   340  	}
   341  }
   342  
   343  func (l *WiredList) remove(v DatabaseBlock) {
   344  	if !l.exists(v) {
   345  		// Already removed
   346  		return
   347  	}
   348  	v.prev().setNext(v.next())
   349  	v.next().setPrev(v.prev())
   350  	v.setNext(nil) // avoid memory leaks
   351  	v.setPrev(nil) // avoid memory leaks
   352  	l.length--
   353  }
   354  
   355  func (l *WiredList) pushBack(v DatabaseBlock) {
   356  	if l.exists(v) {
   357  		l.metrics.pushedBack.Inc(1)
   358  		l.moveToBack(v)
   359  		return
   360  	}
   361  
   362  	l.metrics.inserted.Inc(1)
   363  	l.insertAfter(v, l.root.prev())
   364  	v.setEnteredListAtUnixNano(l.nowFn().UnixNano())
   365  }
   366  
   367  func (l *WiredList) moveToBack(v DatabaseBlock) {
   368  	if !l.exists(v) || l.root.prev() == v {
   369  		return
   370  	}
   371  	l.remove(v)
   372  	l.insertAfter(v, l.root.prev())
   373  }
   374  
   375  func (l *WiredList) exists(v DatabaseBlock) bool {
   376  	return v.next() != nil || v.prev() != nil
   377  }