github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/mediator.go

// Copyright (c) 2016 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package storage

import (
	"errors"
	"sync"
	"time"

	"github.com/m3db/m3/src/dbnode/persist/fs"
	"github.com/m3db/m3/src/dbnode/persist/fs/commitlog"
	"github.com/m3db/m3/src/x/clock"
	"github.com/m3db/m3/src/x/instrument"
	xtime "github.com/m3db/m3/src/x/time"

	"github.com/uber-go/tally"
	"go.uber.org/zap"
)

type (
	mediatorState int
)

const (
	fileOpCheckInterval = time.Second

	mediatorNotOpen mediatorState = iota
	mediatorOpen
	mediatorClosed
)

var (
	errMediatorAlreadyOpen                  = errors.New("mediator is already open")
	errMediatorNotOpen                      = errors.New("mediator is not open")
	errMediatorAlreadyClosed                = errors.New("mediator is already closed")
	errMediatorTimeTriedToProgressBackwards = errors.New("mediator time tried to progress backwards")
)

type mediatorMetrics struct {
	bootstrapStatus tally.Gauge
	cleanupStatus   tally.Gauge
	flushStatus     tally.Gauge
	repairStatus    tally.Gauge
}

func newMediatorMetrics(scope tally.Scope) mediatorMetrics {
	return mediatorMetrics{
		bootstrapStatus: scope.Gauge("bootstrapped"),
		cleanupStatus:   scope.Gauge("cleanup"),
		flushStatus:     scope.Gauge("flush"),
		repairStatus:    scope.Gauge("repair"),
	}
}

type mediator struct {
	sync.RWMutex
	database database
	databaseBootstrapManager
	databaseFileSystemManager
	databaseColdFlushManager
	databaseTickManager

	opts                Options
	nowFn               clock.NowFn
	sleepFn             sleepFn
	metrics             mediatorMetrics
	state               mediatorState
	mediatorTimeBarrier mediatorTimeBarrier
	closedCh            chan struct{}
	tickInterval        time.Duration
	fileOpsProcesses    []FileOpsProcess
	backgroundProcesses []BackgroundProcess
}
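// Note: mediator implements much of its interface by embedding the bootstrap,
// filesystem, cold flush, and tick managers above, so calls like m.Tick(...)
// resolve to the embedded databaseTickManager. A minimal sketch of this Go
// method-promotion pattern (hypothetical types, not from this file):
//
//	type ticker struct{}
//
//	func (ticker) Tick() {}
//
//	type wrapper struct{ ticker } // wrapper gains Tick(), promoted from ticker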
// TODO(r): Consider renaming "databaseMediator" to "databaseCoordinator"
// when we have time (now is not that time).
func newMediator(database database, commitlog commitlog.CommitLog, opts Options) (databaseMediator, error) {
	var (
		iOpts = opts.InstrumentOptions()
		scope = iOpts.MetricsScope()
		nowFn = opts.ClockOptions().NowFn()
	)
	d := &mediator{
		database:     database,
		opts:         opts,
		nowFn:        opts.ClockOptions().NowFn(),
		sleepFn:      time.Sleep,
		metrics:      newMediatorMetrics(scope),
		state:        mediatorNotOpen,
		closedCh:     make(chan struct{}),
		tickInterval: opts.MediatorTickInterval(),
	}
	fsm := newFileSystemManager(database, commitlog, opts)
	d.databaseFileSystemManager = fsm
	d.fileOpsProcesses = []FileOpsProcess{
		FileOpsProcessFn(d.ongoingFileSystemProcesses),
		FileOpsProcessFn(d.ongoingColdFlushProcesses),
	}
	d.mediatorTimeBarrier = newMediatorTimeBarrier(nowFn, iOpts, len(d.fileOpsProcesses))

	// NB(bodu): Cold flush needs its own persist manager now
	// that it's running in its own thread.
	fsOpts := opts.CommitLogOptions().FilesystemOptions()
	pm, err := fs.NewPersistManager(fsOpts)
	if err != nil {
		return nil, err
	}
	cfm := newColdFlushManager(database, pm, opts)
	d.databaseColdFlushManager = cfm

	d.databaseTickManager = newTickManager(database, opts)
	d.databaseBootstrapManager = newBootstrapManager(database, d, opts)
	return d, nil
}

func (m *mediator) RegisterBackgroundProcess(process BackgroundProcess) error {
	m.Lock()
	defer m.Unlock()

	if m.state != mediatorNotOpen {
		return errMediatorAlreadyOpen
	}

	m.backgroundProcesses = append(m.backgroundProcesses, process)
	return nil
}

func (m *mediator) Open() error {
	m.Lock()
	defer m.Unlock()
	if m.state != mediatorNotOpen {
		return errMediatorAlreadyOpen
	}
	m.state = mediatorOpen

	go m.reportLoop()
	for _, fileOpsProcess := range m.fileOpsProcesses {
		go fileOpsProcess.Start()
	}
	go m.ongoingTick()

	for _, process := range m.backgroundProcesses {
		process.Start()
	}

	return nil
}

func (m *mediator) DisableFileOpsAndWait() {
	fsStatus := m.databaseFileSystemManager.Disable()
	// Even though the cold flush runs separately, it's still
	// considered a filesystem process.
	cfStatus := m.databaseColdFlushManager.Disable()
	for fsStatus == fileOpInProgress {
		m.sleepFn(fileOpCheckInterval)
		fsStatus = m.databaseFileSystemManager.Status()
	}
	for cfStatus == fileOpInProgress {
		m.sleepFn(fileOpCheckInterval)
		cfStatus = m.databaseColdFlushManager.Status()
	}
}

func (m *mediator) EnableFileOps() {
	m.databaseFileSystemManager.Enable()
	// Even though the cold flush runs separately, it's still
	// considered a filesystem process.
	m.databaseColdFlushManager.Enable()
}

func (m *mediator) Report() {
	m.databaseBootstrapManager.Report()
	m.databaseFileSystemManager.Report()
	m.databaseColdFlushManager.Report()

	for _, process := range m.backgroundProcesses {
		process.Report()
	}
}

func (m *mediator) Close() error {
	m.Lock()
	defer m.Unlock()
	if m.state == mediatorNotOpen {
		return errMediatorNotOpen
	}
	if m.state == mediatorClosed {
		return errMediatorAlreadyClosed
	}
	m.state = mediatorClosed
	close(m.closedCh)

	for _, process := range m.backgroundProcesses {
		process.Stop()
	}

	return nil
}
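// A minimal lifecycle sketch (illustrative only; the db and cl values and the
// surrounding error handling are assumptions, not taken from this file):
//
//	med, err := newMediator(db, cl, opts)
//	if err != nil {
//		return err
//	}
//	if err := med.Open(); err != nil { // starts the report, file-op, and tick loops
//		return err
//	}
//	defer med.Close() // stops background processes and signals the loops to exit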
// The mediator mediates the relationship between ticks and warm flushes/snapshots.
//
// For example, the requirements to perform a flush are:
// 1) currentTime > blockStart.Add(blockSize).Add(bufferPast)
// 2) node is not bootstrapping (technically shard is not bootstrapping)
//
// (A worked example of requirement 1 appears after runFileSystemProcesses below.)
//
// Similarly, there is logic in the Tick flow for removing shard flush states from a map so that it doesn't
// grow infinitely for nodes that are not restarted. If the Tick path measured the current time when it made that
// decision instead of using the same measurement that is shared with the flush logic, it might end up removing
// a shard flush state (due to it expiring), but since the flush logic is using a slightly staler timestamp it
// will think that the old block hasn't been flushed (even though it has) and try to flush it even though the data
// is potentially still on disk (if it hasn't been cleaned up yet).
//
// See the comment over mediatorTimeBarrier for more details on how this is implemented.
func (m *mediator) ongoingFileSystemProcesses() {
	for {
		select {
		case <-m.closedCh:
			return
		default:
			m.sleepFn(m.tickInterval)

			// Check if the mediator is already closed.
			if !m.IsOpen() {
				return
			}

			m.runFileSystemProcesses()
		}
	}
}

// The mediator mediates the relationship between ticks and cold flushes/cleanup the same
// way it does for warm flushes/snapshots. We want each cold/warm flush to begin with a
// view of time that is in sync with a tick.
// NB(bodu): Cold flushes and cleanup have been separated out into their own thread to
// avoid blocking snapshots.
func (m *mediator) ongoingColdFlushProcesses() {
	for {
		select {
		case <-m.closedCh:
			return
		default:
			m.sleepFn(m.tickInterval)

			// Check if the mediator is already closed.
			if !m.IsOpen() {
				return
			}

			m.runColdFlushProcesses()
		}
	}
}

func (m *mediator) ongoingTick() {
	var (
		log          = m.opts.InstrumentOptions().Logger()
		mediatorTime = m.mediatorTimeBarrier.initialMediatorTime()
	)
	for {
		select {
		case <-m.closedCh:
			return
		default:
			m.sleepFn(m.tickInterval)

			// Check if the mediator is already closed.
			if !m.IsOpen() {
				return
			}

			// See comment over mediatorTimeBarrier for an explanation of this logic.
			newMediatorTime, err := m.mediatorTimeBarrier.maybeRelease()
			if err != nil {
				log.Error("ongoing tick was unable to release time barrier", zap.Error(err))
				continue
			}
			mediatorTime = newMediatorTime

			// NB(bodu): We may still hit a db closed error here since the db does not wait upon
			// completion of ticks.
			if err := m.Tick(force, mediatorTime); err != nil && err != errDatabaseIsClosed {
				log.Error("error within tick", zap.Error(err))
			}
		}
	}
}

func (m *mediator) runFileSystemProcesses() {
	// See comment over mediatorTimeBarrier for an explanation of this logic.
	mediatorTime := m.mediatorTimeBarrier.fsProcessesWait()
	m.databaseFileSystemManager.Run(mediatorTime)
}
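// Worked example of flush requirement 1 above (the numbers are illustrative
// assumptions, not values from this file): with blockSize = 2h and
// bufferPast = 10m, the block starting at 12:00 covers [12:00, 14:00) and
// only becomes flushable once currentTime passes 14:10.
//
//	// currentTime is assumed in scope for this sketch.
//	blockStart := time.Date(2023, 1, 1, 12, 0, 0, 0, time.UTC)
//	blockSize, bufferPast := 2*time.Hour, 10*time.Minute
//	flushable := currentTime.After(blockStart.Add(blockSize).Add(bufferPast)) // true after 14:10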
func (m *mediator) runColdFlushProcesses() {
	// See comment over mediatorTimeBarrier for an explanation of this logic.
	mediatorTime := m.mediatorTimeBarrier.fsProcessesWait()
	m.databaseColdFlushManager.Run(mediatorTime)
}

func (m *mediator) reportLoop() {
	interval := m.opts.InstrumentOptions().ReportInterval()
	t := time.NewTicker(interval)

	for {
		select {
		case <-t.C:
			m.Report()
		case <-m.closedCh:
			t.Stop()
			return
		}
	}
}

func (m *mediator) IsOpen() bool {
	m.RLock()
	defer m.RUnlock()
	return m.state == mediatorOpen
}
// mediatorTimeBarrier is used to prevent the tick process and the filesystem processes from ever running
// concurrently with an inconsistent view of time. Each time the filesystem processes want to run they first
// register for the next barrier by calling fsProcessesWait(). Once a tick completes it will call maybeRelease(),
// which will detect that the filesystem processes are waiting for the next barrier, at which point it will update
// the mediator time and propagate that information to the filesystem processes via the releaseCh. If the filesystem
// processes are still running when the tick completes, the call to maybeRelease() will just return the same time
// as the previous run and another tick will run with the same timestamp as the previous one.
//
// This cooperation ensures that multiple ticks can run during a single run of filesystem processes (although
// each tick will run with the same startTime), but that if a tick and a run of filesystem processes are executing
// concurrently they will always have the same value for startTime.
//
// Note that this scheme (specifically the tick process calling maybeRelease() and the fs processes waiting instead
// of vice versa) is specifically designed such that the ticking process is never blocked and is constantly running.
// This means that once a run of filesystem processes completes it will always have to wait until the currently
// executing tick completes before performing the next run, but in practice this should not be much of an issue.
//
// Additionally, an independent cold flush process complicates this a bit more in that we have more than one filesystem
// process waiting on the mediator barrier. The invariant here is that warm and cold flushes each always start with a
// view of time consistent with the tick they start on. They don't necessarily need to start on the same tick. See the
// diagram below for an example case.
//
//	 ____________       ___________       _________________
//	| Flush (t0) |     | Tick (t0) |     | Cold Flush (t0) |
//	|            |     |           |     |                 |
//	|            |     |___________|     |                 |
//	|            |      ___________      |                 |
//	|            |     | Tick (t0) |     |                 |
//	|            |     |           |     |                 |
//	|            |     |___________|     |                 |
//	|            |      ___________      |                 |
//	|____________|     | Tick (t0) |     |                 |
//	barrier.wait()     |           |     |                 |
//	                   |___________|     |                 |
//	                 mediatorTime = t1   |                 |
//	                 barrier.release()   |                 |
//	 ____________       ___________      |                 |
//	| Flush (t1) |     | Tick (t1) |     |_________________|
//	|            |     |           |     barrier.wait()
//	|            |     |___________|
//	|            |   mediatorTime = t2
//	|            |   barrier.release()
//	|            |      ___________       _________________
//	|            |     | Tick (t2) |     | Cold Flush (t2) |
//	|____________|     |           |     |                 |
//	barrier.wait()     |___________|     |                 |
//	                 mediatorTime = t3   |                 |
//	                 barrier.release()   |                 |
//	 ____________       ___________      |                 |
//	| Flush (t3) |     | Tick (t3) |     |                 |
//	|            |     |           |     |                 |
//	|            |     |___________|     |                 |
//	|            |      ___________      |                 |
//	|            |     | Tick (t3) |     |                 |
//	|            |     |           |     |                 |
//	|            |     |___________|     |                 |
//	|            |      ___________      |                 |
//	|____________|     | Tick (t3) |     |_________________|
//	barrier.wait()     |           |     barrier.wait()
//	                   |___________|
//	                 mediatorTime = t4
//	                 barrier.release()
//	 ____________       ___________       _________________
//	| Flush (t4) |     | Tick (t4) |     | Cold Flush (t4) |
//	|            |     |           |     |                 |
//	------------------------------------------------------------
type mediatorTimeBarrier struct {
	sync.Mutex
	// Both mediatorTime and numFsProcessesWaiting are protected
	// by the mutex.
	mediatorTime          xtime.UnixNano
	numFsProcessesWaiting int
	numMaxWaiters         int

	nowFn     func() time.Time
	iOpts     instrument.Options
	releaseCh chan xtime.UnixNano
}

// initialMediatorTime should only be used to obtain the initial time for
// the ongoing tick loop. All subsequent updates should come from the
// release method.
func (b *mediatorTimeBarrier) initialMediatorTime() xtime.UnixNano {
	b.Lock()
	defer b.Unlock()
	return b.mediatorTime
}

func (b *mediatorTimeBarrier) fsProcessesWait() xtime.UnixNano {
	b.Lock()
	b.numFsProcessesWaiting++
	b.Unlock()

	// Block until the tick loop observes the waiter count and sends the
	// new mediator time on the unbuffered release channel.
	t := <-b.releaseCh

	b.Lock()
	b.numFsProcessesWaiting--
	b.Unlock()
	return t
}
func (b *mediatorTimeBarrier) maybeRelease() (xtime.UnixNano, error) {
	b.Lock()
	numWaiters := b.numFsProcessesWaiting
	mediatorTime := b.mediatorTime
	b.Unlock()

	if numWaiters == 0 {
		// If there isn't a waiter yet then the filesystem processes may still
		// be ongoing in which case we don't want to release the barrier / update
		// the current time yet. Allow the tick to run again with the same time
		// as before.
		return mediatorTime, nil
	}

	// If the filesystem processes are waiting then update the time and allow
	// both the filesystem processes and the tick to proceed with the new time.
	newMediatorTime := xtime.ToUnixNano(b.nowFn())
	if newMediatorTime.Before(b.mediatorTime) {
		instrument.EmitAndLogInvariantViolation(b.iOpts, func(l *zap.Logger) {
			l.Error(
				"mediator time attempted to move backwards in time",
				zap.Time("prevTime", b.mediatorTime.ToTime()),
				zap.Time("newTime", newMediatorTime.ToTime()))
		})
		return 0, errMediatorTimeTriedToProgressBackwards
	}

	b.mediatorTime = newMediatorTime
	for i := 0; i < numWaiters; i++ {
		b.releaseCh <- b.mediatorTime
	}

	return b.mediatorTime, nil
}

func newMediatorTimeBarrier(nowFn func() time.Time, iOpts instrument.Options, maxWaiters int) mediatorTimeBarrier {
	return mediatorTimeBarrier{
		mediatorTime:  xtime.ToUnixNano(nowFn()),
		nowFn:         nowFn,
		iOpts:         iOpts,
		numMaxWaiters: maxWaiters,
		releaseCh:     make(chan xtime.UnixNano),
	}
}
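// The sketch below (illustrative only, not part of this file) shows how the
// tick loop and a filesystem process typically drive the barrier; barrier is
// assumed to be a *mediatorTimeBarrier, and runTickOnce/runFsOnce are
// hypothetical stand-ins for Tick and Run:
//
//	// Tick loop: never blocks on the filesystem processes.
//	go func() {
//		t := barrier.initialMediatorTime()
//		for {
//			newT, err := barrier.maybeRelease()
//			if err != nil {
//				continue // mirror ongoingTick: skip this tick on a barrier error
//			}
//			t = newT       // same time as before if no fs process was waiting
//			runTickOnce(t) // hypothetical stand-in for m.Tick(force, t)
//		}
//	}()
//
//	// Filesystem process: blocks until the next tick releases the barrier,
//	// guaranteeing it runs with the same view of time as that tick.
//	go func() {
//		for {
//			t := barrier.fsProcessesWait()
//			runFsOnce(t) // hypothetical stand-in for the manager's Run
//		}
//	}()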