// Copyright (c) 2018 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// The wired list is the primary data structure that is used to support the LRU
// caching policy. It is a global (per-database) structure that is shared
// between all namespaces, shards, and series. It is responsible for determining
// which blocks should be kept "wired" (cached) in memory, and which should be
// closed and fetched again from disk if they need to be retrieved in the future.
//
// The WiredList is basically a specialized LRU, except that it doesn't store the
// data itself, it just keeps track of which data is currently in memory and makes
// decisions about which data to remove from memory. Updating the WiredList is
// asynchronous: callers put an operation to modify the list into a channel and
// a background goroutine pulls from that channel and performs updates to the
// list, which may include removing items from memory ("unwiring" blocks).
//
// The WiredList itself does not allocate a per-entry data structure to keep track
// of what is active and what is not. Instead, it creates a "virtual list" on top
// of the existing blocks that are in memory by manipulating struct-level pointers
// on the DatabaseBlocks which are "owned" by the list. In other words, the
// DatabaseBlocks are scattered among numerous namespaces/shards/series, but they
// exist in a virtual sorted order via the prev/next pointers they contain, which
// are only manipulated by the WiredList.
//
// The WiredList ONLY keeps track of blocks that are read from disk. Blocks that
// are created by rotating recently-written data out of buffers and into new
// DatabaseBlocks are managed by the background ticks of the series. The background
// tick will avoid closing blocks that were read from disk, and a block will never
// be provided to the WiredList if it wasn't read from disk. This prevents tricky
// ownership semantics where both the background tick and the WiredList are
// competing for ownership / trying to close the same blocks.

package block

import (
	"errors"
	"sync"
	"sync/atomic"
	"time"

	"github.com/m3db/m3/src/dbnode/runtime"
	"github.com/m3db/m3/src/x/clock"
	"github.com/m3db/m3/src/x/instrument"

	"github.com/uber-go/tally"
	"go.uber.org/zap"
)

const (
	// defaultWiredListEventsChannelSize is the buffer size of the updates
	// channel used when WiredListOptions.EventsChannelSize is not positive.
	defaultWiredListEventsChannelSize = 65536
	// wiredListSampleGaugesEvery is the number of processed updates between
	// refreshes of the "unwireable" and "limit" gauges.
	wiredListSampleGaugesEvery = 100
)

var (
	// errAlreadyStarted is returned by Start when the list is already running.
	errAlreadyStarted = errors.New("wired list already started")
	// errAlreadyStopped is returned by Stop when the list is not running.
	errAlreadyStopped = errors.New("wired list already stopped")
)

// WiredList is a database block wired list.
77 type WiredList struct { 78 mu sync.RWMutex 79 80 nowFn clock.NowFn 81 82 // Max wired blocks, must use atomic store and load to access. 83 maxWired int64 84 85 root dbBlock 86 length int 87 updatesChSize int 88 updatesCh chan DatabaseBlock 89 doneCh chan struct{} 90 91 metrics wiredListMetrics 92 iOpts instrument.Options 93 } 94 95 type wiredListMetrics struct { 96 unwireable tally.Gauge 97 limit tally.Gauge 98 evicted tally.Counter 99 pushedBack tally.Counter 100 inserted tally.Counter 101 evictedAfterDuration tally.Timer 102 } 103 104 func newWiredListMetrics(scope tally.Scope) wiredListMetrics { 105 return wiredListMetrics{ 106 // Keeps track of how many blocks are in the list 107 unwireable: scope.Gauge("unwireable"), 108 limit: scope.Gauge("limit"), 109 // Incremented when a block is evicted 110 evicted: scope.Counter("evicted"), 111 // Incremented when a block is "pushed back" in the list, I.E 112 // it was already in the list 113 pushedBack: scope.Counter("pushed-back"), 114 // Incremented when a block is inserted into the list, I.E 115 // it wasn't already present 116 inserted: scope.Counter("inserted"), 117 // Measure how much time blocks spend in the list before being evicted 118 evictedAfterDuration: scope.Timer("evicted-after-duration"), 119 } 120 } 121 122 // WiredListOptions is the options struct for the WiredList constructor. 123 type WiredListOptions struct { 124 RuntimeOptionsManager runtime.OptionsManager 125 InstrumentOptions instrument.Options 126 ClockOptions clock.Options 127 EventsChannelSize int 128 } 129 130 // NewWiredList returns a new database block wired list. 131 func NewWiredList(opts WiredListOptions) *WiredList { 132 scope := opts.InstrumentOptions.MetricsScope(). 
133 SubScope("wired-list") 134 l := &WiredList{ 135 nowFn: opts.ClockOptions.NowFn(), 136 metrics: newWiredListMetrics(scope), 137 iOpts: opts.InstrumentOptions, 138 } 139 if opts.EventsChannelSize > 0 { 140 l.updatesChSize = opts.EventsChannelSize 141 } else { 142 l.updatesChSize = defaultWiredListEventsChannelSize 143 } 144 l.root.setNext(&l.root) 145 l.root.setPrev(&l.root) 146 opts.RuntimeOptionsManager.RegisterListener(l) 147 return l 148 } 149 150 // SetRuntimeOptions sets the current runtime options to 151 // be consumed by the wired list 152 func (l *WiredList) SetRuntimeOptions(value runtime.Options) { 153 atomic.StoreInt64(&l.maxWired, int64(value.MaxWiredBlocks())) 154 } 155 156 // Start starts processing the wired list 157 func (l *WiredList) Start() error { 158 l.mu.Lock() 159 defer l.mu.Unlock() 160 if l.updatesCh != nil { 161 return errAlreadyStarted 162 } 163 164 l.updatesCh = make(chan DatabaseBlock, l.updatesChSize) 165 l.doneCh = make(chan struct{}, 1) 166 go func() { 167 i := 0 168 for v := range l.updatesCh { 169 l.processUpdateBlock(v) 170 if i%wiredListSampleGaugesEvery == 0 { 171 l.metrics.unwireable.Update(float64(l.length)) 172 l.metrics.limit.Update(float64(atomic.LoadInt64(&l.maxWired))) 173 } 174 i++ 175 } 176 l.doneCh <- struct{}{} 177 }() 178 179 return nil 180 } 181 182 // Stop stops processing the wired list 183 func (l *WiredList) Stop() error { 184 l.mu.Lock() 185 defer l.mu.Unlock() 186 187 if l.updatesCh == nil { 188 return errAlreadyStopped 189 } 190 191 close(l.updatesCh) 192 <-l.doneCh 193 194 l.updatesCh = nil 195 close(l.doneCh) 196 l.doneCh = nil 197 198 return nil 199 } 200 201 // BlockingUpdate places the block into the channel of blocks which are waiting to notify the 202 // wired list that they were accessed. All updates must be processed through this channel 203 // to force synchronization. 204 // 205 // We use a channel and a background processing goroutine to reduce blocking / lock contention. 
206 func (l *WiredList) BlockingUpdate(v DatabaseBlock) { 207 // Fast path, don't use defer (in Go 1.14 this won't matter anymore since 208 // defer is basically compile time for simple callsites). 209 l.mu.RLock() 210 if l.updatesCh == nil { 211 l.mu.RUnlock() 212 return 213 } 214 l.updatesCh <- v 215 l.mu.RUnlock() 216 } 217 218 // NonBlockingUpdate will attempt to put the block in the events channel, but will not block 219 // if the channel is full. Used in cases where a blocking update could trigger deadlock with 220 // the WiredList itself. 221 func (l *WiredList) NonBlockingUpdate(v DatabaseBlock) bool { 222 l.mu.RLock() 223 defer l.mu.RUnlock() 224 225 if l.updatesCh == nil { 226 return false 227 } 228 229 select { 230 case l.updatesCh <- v: 231 return true 232 default: 233 return false 234 } 235 } 236 237 // processUpdateBlock inspects a block that has been modified or read recently 238 // and determines what outcome its state should have on the wired list. 239 func (l *WiredList) processUpdateBlock(v DatabaseBlock) { 240 entry := v.wiredListEntry() 241 242 // In some cases the WiredList can receive blocks that are closed. This can happen if a block is 243 // in the updatesCh (because it was read) but also already in the WiredList, and while its still 244 // in the updatesCh, it is evicted from the wired list to make room for some other block that is 245 // being processed. The eviction of the block will close it, but the enqueued update is still in 246 // the updateCh even though its an update for a closed block. For the same reason, the wired list 247 // can receive blocks that were not retrieved from disk because the closed block was returned to 248 // a pool and then re-used. 249 unwireable := !entry.closed && entry.wasRetrievedFromDisk 250 251 // If a block is still unwireable then its worth keeping track of in the wired list 252 // so we push it back. 
253 if unwireable { 254 l.pushBack(v) 255 return 256 } 257 258 // If a block is not unwireable there is no point in keeping track of it in the WiredList, 259 // so we remove it or don't add it in the first place. This works because the remove method 260 // is a noop for blocks that aren't already in the WiredList and the pushBack method used 261 // above is the only way for blocks to be added. 262 l.remove(v) 263 } 264 265 func (l *WiredList) insertAfter(v, at DatabaseBlock) { 266 now := l.nowFn() 267 268 n := at.next() 269 at.setNext(v) 270 v.setPrev(at) 271 v.setNext(n) 272 n.setPrev(v) 273 l.length++ 274 275 maxWired := int(atomic.LoadInt64(&l.maxWired)) 276 if maxWired <= 0 { 277 // Not enforcing max wired blocks 278 return 279 } 280 281 // Try to unwire all blocks possible 282 bl := l.root.next() 283 for l.length > maxWired && bl != &l.root { 284 entry := bl.wiredListEntry() 285 if !entry.wasRetrievedFromDisk { 286 // This should never happen because processUpdateBlock performs the same 287 // check, and a block should never be pooled in-between those steps because 288 // the wired list is supposed to have sole ownership over that lifecycle and 289 // is single-threaded. 290 instrument.EmitAndLogInvariantViolation(l.iOpts, func(l *zap.Logger) { 291 l.With( 292 zap.Time("blockStart", entry.startTime.ToTime()), 293 zap.Bool("closed", entry.closed), 294 zap.Bool("wasRetrievedFromDisk", entry.wasRetrievedFromDisk), 295 ).Error("wired list tried to process a block that was not retrieved from disk") 296 }) 297 298 } 299 300 // Evict the block before closing it so that callers of series.ReadEncoded() 301 // don't get errors about trying to read from a closed block. 
302 if onEvict := bl.OnEvictedFromWiredList(); onEvict != nil { 303 if entry.seriesID == nil { 304 // Entry should always have a series ID attached 305 instrument.EmitAndLogInvariantViolation(l.iOpts, func(l *zap.Logger) { 306 l.With( 307 zap.Time("blockStart", entry.startTime.ToTime()), 308 zap.Bool("closed", entry.closed), 309 zap.Bool("wasRetrievedFromDisk", entry.wasRetrievedFromDisk), 310 ).Error("wired list entry does not have seriesID set") 311 }) 312 313 } else { 314 onEvict.OnEvictedFromWiredList(entry.seriesID, entry.startTime) 315 } 316 } 317 318 // bl.CloseIfFromDisk() will return the block to the pool. In order to avoid 319 // races with the pool itself, we capture the value of the next block and 320 // remove the block from the wired list before we close it. 321 nextBl := bl.next() 322 l.remove(bl) 323 if wasFromDisk := bl.CloseIfFromDisk(); !wasFromDisk { 324 // Should never happen 325 instrument.EmitAndLogInvariantViolation(l.iOpts, func(l *zap.Logger) { 326 l.With( 327 zap.Time("blockStart", entry.startTime.ToTime()), 328 zap.Bool("closed", entry.closed), 329 zap.Bool("wasRetrievedFromDisk", entry.wasRetrievedFromDisk), 330 ).Error("wired list tried to close a block that was not from disk") 331 }) 332 } 333 334 l.metrics.evicted.Inc(1) 335 336 enteredListAt := time.Unix(0, bl.enteredListAtUnixNano()) 337 l.metrics.evictedAfterDuration.Record(now.Sub(enteredListAt)) 338 339 bl = nextBl 340 } 341 } 342 343 func (l *WiredList) remove(v DatabaseBlock) { 344 if !l.exists(v) { 345 // Already removed 346 return 347 } 348 v.prev().setNext(v.next()) 349 v.next().setPrev(v.prev()) 350 v.setNext(nil) // avoid memory leaks 351 v.setPrev(nil) // avoid memory leaks 352 l.length-- 353 } 354 355 func (l *WiredList) pushBack(v DatabaseBlock) { 356 if l.exists(v) { 357 l.metrics.pushedBack.Inc(1) 358 l.moveToBack(v) 359 return 360 } 361 362 l.metrics.inserted.Inc(1) 363 l.insertAfter(v, l.root.prev()) 364 v.setEnteredListAtUnixNano(l.nowFn().UnixNano()) 365 } 366 
367 func (l *WiredList) moveToBack(v DatabaseBlock) { 368 if !l.exists(v) || l.root.prev() == v { 369 return 370 } 371 l.remove(v) 372 l.insertAfter(v, l.root.prev()) 373 } 374 375 func (l *WiredList) exists(v DatabaseBlock) bool { 376 return v.next() != nil || v.prev() != nil 377 }