github.com/m3db/m3@v1.5.0/src/dbnode/storage/mediator.go (about)

     1  // Copyright (c) 2016 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package storage
    22  
    23  import (
    24  	"errors"
    25  	"sync"
    26  	"time"
    27  
    28  	"github.com/m3db/m3/src/dbnode/persist/fs"
    29  	"github.com/m3db/m3/src/dbnode/persist/fs/commitlog"
    30  	"github.com/m3db/m3/src/x/clock"
    31  	"github.com/m3db/m3/src/x/instrument"
    32  	xtime "github.com/m3db/m3/src/x/time"
    33  
    34  	"github.com/uber-go/tally"
    35  	"go.uber.org/zap"
    36  )
    37  
    38  type (
    39  	mediatorState int
    40  )
    41  
const (
	// fileOpCheckInterval is how often DisableFileOpsAndWait polls the
	// file op managers while waiting for in-flight operations to finish.
	fileOpCheckInterval = time.Second

	// NB: fileOpCheckInterval occupies the first spec of this const
	// block, so iota is already 1 when mediatorNotOpen is declared.
	// The states are therefore 1, 2 and 3, which means the zero value
	// of mediatorState is deliberately not a valid state.
	mediatorNotOpen mediatorState = iota
	mediatorOpen
	mediatorClosed
)
    49  
var (
	// Lifecycle errors returned by RegisterBackgroundProcess, Open and Close.
	errMediatorAlreadyOpen   = errors.New("mediator is already open")
	errMediatorNotOpen       = errors.New("mediator is not open")
	errMediatorAlreadyClosed = errors.New("mediator is already closed")

	// Returned by mediatorTimeBarrier.maybeRelease when the clock reads
	// earlier than the previously published mediator time.
	errMediatorTimeTriedToProgressBackwards = errors.New("mediator time tried to progress backwards")
)
    56  
// mediatorMetrics holds status gauges for each of the subprocesses the
// mediator coordinates; they are emitted via mediator.Report.
type mediatorMetrics struct {
	bootstrapStatus tally.Gauge
	cleanupStatus   tally.Gauge
	flushStatus     tally.Gauge
	repairStatus    tally.Gauge
}
    63  
    64  func newMediatorMetrics(scope tally.Scope) mediatorMetrics {
    65  	return mediatorMetrics{
    66  		bootstrapStatus: scope.Gauge("bootstrapped"),
    67  		cleanupStatus:   scope.Gauge("cleanup"),
    68  		flushStatus:     scope.Gauge("flush"),
    69  		repairStatus:    scope.Gauge("repair"),
    70  	}
    71  }
    72  
// mediator coordinates the database's background subprocesses — ticking,
// bootstrapping, warm flush/snapshot and cold flush/cleanup — and uses
// mediatorTimeBarrier to keep ticks and filesystem processes operating
// on a consistent view of time.
type mediator struct {
	sync.RWMutex
	database database
	// Embedded managers; the mediator delegates calls such as
	// Bootstrap, Run and Tick directly to them.
	databaseBootstrapManager
	databaseFileSystemManager
	databaseColdFlushManager
	databaseTickManager

	opts    Options
	nowFn   clock.NowFn
	sleepFn sleepFn
	metrics mediatorMetrics
	// state is guarded by the embedded RWMutex.
	state               mediatorState
	mediatorTimeBarrier mediatorTimeBarrier
	// closedCh is closed by Close to signal every loop started in Open
	// to exit.
	closedCh            chan struct{}
	tickInterval        time.Duration
	fileOpsProcesses    []FileOpsProcess
	backgroundProcesses []BackgroundProcess
}
    92  
    93  // TODO(r): Consider renaming "databaseMediator" to "databaseCoordinator"
    94  // when we have time (now is not that time).
    95  func newMediator(database database, commitlog commitlog.CommitLog, opts Options) (databaseMediator, error) {
    96  	var (
    97  		iOpts = opts.InstrumentOptions()
    98  		scope = iOpts.MetricsScope()
    99  		nowFn = opts.ClockOptions().NowFn()
   100  	)
   101  	d := &mediator{
   102  		database:     database,
   103  		opts:         opts,
   104  		nowFn:        opts.ClockOptions().NowFn(),
   105  		sleepFn:      time.Sleep,
   106  		metrics:      newMediatorMetrics(scope),
   107  		state:        mediatorNotOpen,
   108  		closedCh:     make(chan struct{}),
   109  		tickInterval: opts.MediatorTickInterval(),
   110  	}
   111  	fsm := newFileSystemManager(database, commitlog, opts)
   112  	d.databaseFileSystemManager = fsm
   113  	d.fileOpsProcesses = []FileOpsProcess{
   114  		FileOpsProcessFn(d.ongoingFileSystemProcesses),
   115  		FileOpsProcessFn(d.ongoingColdFlushProcesses),
   116  	}
   117  	d.mediatorTimeBarrier = newMediatorTimeBarrier(nowFn, iOpts, len(d.fileOpsProcesses))
   118  
   119  	// NB(bodu): Cold flush needs its own persist manager now
   120  	// that its running in its own thread.
   121  	fsOpts := opts.CommitLogOptions().FilesystemOptions()
   122  	pm, err := fs.NewPersistManager(fsOpts)
   123  	if err != nil {
   124  		return nil, err
   125  	}
   126  	cfm := newColdFlushManager(database, pm, opts)
   127  	d.databaseColdFlushManager = cfm
   128  
   129  	d.databaseTickManager = newTickManager(database, opts)
   130  	d.databaseBootstrapManager = newBootstrapManager(database, d, opts)
   131  	return d, nil
   132  }
   133  
   134  func (m *mediator) RegisterBackgroundProcess(process BackgroundProcess) error {
   135  	m.Lock()
   136  	defer m.Unlock()
   137  
   138  	if m.state != mediatorNotOpen {
   139  		return errMediatorAlreadyOpen
   140  	}
   141  
   142  	m.backgroundProcesses = append(m.backgroundProcesses, process)
   143  	return nil
   144  }
   145  
   146  func (m *mediator) Open() error {
   147  	m.Lock()
   148  	defer m.Unlock()
   149  	if m.state != mediatorNotOpen {
   150  		return errMediatorAlreadyOpen
   151  	}
   152  	m.state = mediatorOpen
   153  
   154  	go m.reportLoop()
   155  	for _, fileOpsProcess := range m.fileOpsProcesses {
   156  		go fileOpsProcess.Start()
   157  	}
   158  	go m.ongoingTick()
   159  
   160  	for _, process := range m.backgroundProcesses {
   161  		process.Start()
   162  	}
   163  
   164  	return nil
   165  }
   166  
   167  func (m *mediator) DisableFileOpsAndWait() {
   168  	fsStatus := m.databaseFileSystemManager.Disable()
   169  	// Even though the cold flush runs separately, its still
   170  	// considered a fs process.
   171  	cfStatus := m.databaseColdFlushManager.Disable()
   172  	for fsStatus == fileOpInProgress {
   173  		m.sleepFn(fileOpCheckInterval)
   174  		fsStatus = m.databaseFileSystemManager.Status()
   175  	}
   176  	for cfStatus == fileOpInProgress {
   177  		m.sleepFn(fileOpCheckInterval)
   178  		cfStatus = m.databaseColdFlushManager.Status()
   179  	}
   180  }
   181  
   182  func (m *mediator) EnableFileOps() {
   183  	m.databaseFileSystemManager.Enable()
   184  	// Even though the cold flush runs separately, its still
   185  	// considered a fs process.
   186  	m.databaseColdFlushManager.Enable()
   187  }
   188  
   189  func (m *mediator) Report() {
   190  	m.databaseBootstrapManager.Report()
   191  	m.databaseFileSystemManager.Report()
   192  	m.databaseColdFlushManager.Report()
   193  
   194  	for _, process := range m.backgroundProcesses {
   195  		process.Report()
   196  	}
   197  }
   198  
   199  func (m *mediator) Close() error {
   200  	m.Lock()
   201  	defer m.Unlock()
   202  	if m.state == mediatorNotOpen {
   203  		return errMediatorNotOpen
   204  	}
   205  	if m.state == mediatorClosed {
   206  		return errMediatorAlreadyClosed
   207  	}
   208  	m.state = mediatorClosed
   209  	close(m.closedCh)
   210  
   211  	for _, process := range m.backgroundProcesses {
   212  		process.Stop()
   213  	}
   214  
   215  	return nil
   216  }
   217  
   218  // The mediator mediates the relationship between ticks and warm flushes/snapshots.
   219  //
   220  // For example, the requirements to perform a flush are:
   221  // 		1) currentTime > blockStart.Add(blockSize).Add(bufferPast)
   222  // 		2) node is not bootstrapping (technically shard is not bootstrapping)
   223  //
   224  // Similarly, there is logic in the Tick flow for removing shard flush states from a map so that it doesn't
   225  // grow infinitely for nodes that are not restarted. If the Tick path measured the current time when it made that
   226  // decision instead of using the same measurement that is shared with the flush logic, it might end up removing
   227  // a shard flush state (due to it expiring), but since the flush logic is using a slightly more stale timestamp it
   228  // will think that the old block hasn't been flushed (even thought it has) and try to flush it even though the data
   229  // is potentially still on disk (if it hasn't been cleaned up yet).
   230  //
   231  // See comment over mediatorTimeBarrier for more details on how this is implemented.
   232  func (m *mediator) ongoingFileSystemProcesses() {
   233  	for {
   234  		select {
   235  		case <-m.closedCh:
   236  			return
   237  		default:
   238  			m.sleepFn(m.tickInterval)
   239  
   240  			// Check if the mediator is already closed.
   241  			if !m.IsOpen() {
   242  				return
   243  			}
   244  
   245  			m.runFileSystemProcesses()
   246  		}
   247  	}
   248  }
   249  
   250  // The mediator mediates the relationship between ticks and cold flushes/cleanup the same way it does for warm flushes/snapshots.
   251  // We want to begin each cold/warm flush with an in sync view of time as a tick.
   252  // NB(bodu): Cold flushes and cleanup have been separated out into it's own thread to avoid blocking snapshots.
   253  func (m *mediator) ongoingColdFlushProcesses() {
   254  	for {
   255  		select {
   256  		case <-m.closedCh:
   257  			return
   258  		default:
   259  			m.sleepFn(m.tickInterval)
   260  
   261  			// Check if the mediator is already closed.
   262  			if !m.IsOpen() {
   263  				return
   264  			}
   265  
   266  			m.runColdFlushProcesses()
   267  		}
   268  	}
   269  }
   270  
// ongoingTick runs the tick loop until the mediator is closed. Each
// iteration sleeps for the tick interval, asks the time barrier whether
// the filesystem processes are waiting (in which case mediator time is
// advanced and the barrier released), then runs a tick at the resulting
// mediator time.
func (m *mediator) ongoingTick() {
	var (
		log          = m.opts.InstrumentOptions().Logger()
		mediatorTime = m.mediatorTimeBarrier.initialMediatorTime()
	)
	for {
		select {
		case <-m.closedCh:
			return
		default:
			m.sleepFn(m.tickInterval)

			// Check if the mediator is already closed.
			if !m.IsOpen() {
				return
			}

			// See comment over mediatorTimeBarrier for an explanation of this logic.
			newMediatorTime, err := m.mediatorTimeBarrier.maybeRelease()
			if err != nil {
				// Skip this tick rather than ticking with a bad timestamp.
				log.Error("ongoing tick was unable to release time barrier", zap.Error(err))
				continue
			}
			mediatorTime = newMediatorTime

			// NB(bodu): We may still hit a db closed error here since the db does not wait upon
			// completion of ticks.
			if err := m.Tick(force, mediatorTime); err != nil && err != errDatabaseIsClosed {
				log.Error("error within tick", zap.Error(err))
			}
		}
	}
}
   304  
   305  func (m *mediator) runFileSystemProcesses() {
   306  	// See comment over mediatorTimeBarrier for an explanation of this logic.
   307  	mediatorTime := m.mediatorTimeBarrier.fsProcessesWait()
   308  	m.databaseFileSystemManager.Run(mediatorTime)
   309  }
   310  
   311  func (m *mediator) runColdFlushProcesses() {
   312  	// See comment over mediatorTimeBarrier for an explanation of this logic.
   313  	mediatorTime := m.mediatorTimeBarrier.fsProcessesWait()
   314  	m.databaseColdFlushManager.Run(mediatorTime)
   315  }
   316  
   317  func (m *mediator) reportLoop() {
   318  	interval := m.opts.InstrumentOptions().ReportInterval()
   319  	t := time.NewTicker(interval)
   320  
   321  	for {
   322  		select {
   323  		case <-t.C:
   324  			m.Report()
   325  		case <-m.closedCh:
   326  			t.Stop()
   327  			return
   328  		}
   329  	}
   330  }
   331  
   332  func (m *mediator) IsOpen() bool {
   333  	m.RLock()
   334  	defer m.RUnlock()
   335  	return m.state == mediatorOpen
   336  }
   337  
   338  // mediatorTimeBarrier is used to prevent the tick process and the filesystem processes from ever running
   339  // concurrently with an inconsistent view of time. Each time the filesystem processes want to run they first
   340  // register for the next barrier by calling fsProcessesWait(). Once a tick completes it will call maybeRelease()
   341  // which will detect that the filesystem processes are waiting for the next barrier at which point it will update
   342  // the mediator time and propagate that information to the filesystem processes via the releaseCh. If the filesystem
   343  // processes are still running when the tick completes, the call to maybeRelease() will just return the same time
   344  // as the previous run and another tick will run with the same timestamp as the previous one.
   345  //
   346  // This cooperation ensures that multiple ticks can run during a single run of filesystem processes (although
   347  // each tick will run with the same startTime), but that if a tick and run of filesystem processes are executing
   348  // concurrently they will always have the same value for startTime.
   349  //
   350  // Note that this scheme (specifically the tick process calling maybeRelease() and the fs processes waiting instead
   351  // of vice versa) is specifically designed such that the ticking process is never blocked and is constantly running.
   352  // This means that once a run of filesystem processes completes it will always have to wait until the currently
   353  // executing tick completes before performing the next run, but in practice this should not be much of an issue.
   354  //
   355  // Additionally, an independent cold flush process complicates this a bit more in that we have more than one filesystem
   356  // process waiting on the mediator barrier. The invariant here is that both warm and cold flushes always start on a tick
   357  // with a consistent view of time as the tick it is on. They don't necessarily need to start on the same tick. See the
   358  // diagram below for an example case.
   359  //
   360  //  ____________       ___________          _________________
   361  // | Flush (t0) |     | Tick (t0) |        | Cold Flush (t0) |
   362  // |            |     |           |        |                 |
   363  // |            |     |___________|        |                 |
   364  // |            |      ___________         |                 |
   365  // |            |     | Tick (t0) |        |                 |
   366  // |            |     |           |        |                 |
   367  // |            |     |___________|        |                 |
   368  // |            |      ___________         |                 |
   369  // |____________|     | Tick (t0) |        |                 |
   370  //  barrier.wait()    |           |        |                 |
   371  //                    |___________|        |                 |
   372  //                    mediatorTime = t1    |                 |
   373  //                    barrier.release()    |                 |
   374  //  ____________       ___________         |                 |
   375  // | Flush (t1) |     | Tick (t1) |        |_________________|
   376  // |            |     |           |         barrier.wait()
   377  // |            |     |___________|
   378  // |            |      mediatorTime = t2
   379  // |            |      barrier.release()
   380  // |            |       ___________         _________________
   381  // |            |      | Tick (t2) |       | Cold Flush (t2) |
   382  // |____________|      |           |       |                 |
   383  //  barrier.wait()     |___________|       |                 |
   384  //                     mediatorTime = t3   |                 |
   385  //                     barrier.release()   |                 |
   386  //   ____________       ___________        |                 |
   387  //  | Flush (t3) |     | Tick (t3) |       |                 |
   388  //  |            |     |           |       |                 |
   389  //  |            |     |___________|       |                 |
   390  //  |            |      ___________        |                 |
   391  //  |            |     | Tick (t3) |       |                 |
   392  //  |            |     |           |       |                 |
   393  //  |            |     |___________|       |                 |
   394  //  |            |      ___________        |                 |
   395  //  |____________|     | Tick (t3) |       |_________________|
   396  //   barrier.wait()    |           |        barrier.wait()
   397  //                     |___________|
   398  //                     mediatorTime = t4
   399  //                     barrier.release()
   400  //   ____________       ___________         _________________
   401  //  | Flush (t4) |     | Tick (t4) |       | Cold Flush (t4) |
   402  //  |            |     |           |       |                 |
   403  // ------------------------------------------------------------
type mediatorTimeBarrier struct {
	sync.Mutex
	// Both mediatorTime and numFsProcessesWaiting are protected
	// by the mutex.
	mediatorTime          xtime.UnixNano
	numFsProcessesWaiting int
	// numMaxWaiters is the number of file ops processes that may wait
	// on the barrier; set at construction and immutable afterwards.
	numMaxWaiters int

	nowFn func() time.Time
	iOpts instrument.Options
	// releaseCh is unbuffered; maybeRelease performs one send per
	// process that was registered as waiting at release time.
	releaseCh chan xtime.UnixNano
}
   416  
   417  // initialMediatorTime should only be used to obtain the initial time for
   418  // the ongoing tick loop. All subsequent updates should come from the
   419  // release method.
   420  func (b *mediatorTimeBarrier) initialMediatorTime() xtime.UnixNano {
   421  	b.Lock()
   422  	defer b.Unlock()
   423  	return b.mediatorTime
   424  }
   425  
   426  func (b *mediatorTimeBarrier) fsProcessesWait() xtime.UnixNano {
   427  	b.Lock()
   428  	b.numFsProcessesWaiting++
   429  	b.Unlock()
   430  
   431  	t := <-b.releaseCh
   432  
   433  	b.Lock()
   434  	b.numFsProcessesWaiting--
   435  	b.Unlock()
   436  	return t
   437  }
   438  
   439  func (b *mediatorTimeBarrier) maybeRelease() (xtime.UnixNano, error) {
   440  	b.Lock()
   441  	numWaiters := b.numFsProcessesWaiting
   442  	mediatorTime := b.mediatorTime
   443  	b.Unlock()
   444  
   445  	if numWaiters == 0 {
   446  		// If there isn't a waiter yet then the filesystem processes may still
   447  		// be ongoing in which case we don't want to release the barrier / update
   448  		// the current time yet. Allow the tick to run again with the same time
   449  		// as before.
   450  		return mediatorTime, nil
   451  	}
   452  
   453  	// If the filesystem processes are waiting then update the time and allow
   454  	// both the filesystem processes and the tick to proceed with the new time.
   455  	newMediatorTime := xtime.ToUnixNano(b.nowFn())
   456  	if newMediatorTime.Before(b.mediatorTime) {
   457  		instrument.EmitAndLogInvariantViolation(b.iOpts, func(l *zap.Logger) {
   458  			l.Error(
   459  				"mediator time attempted to move backwards in time",
   460  				zap.Time("prevTime", b.mediatorTime.ToTime()),
   461  				zap.Time("newTime", newMediatorTime.ToTime()))
   462  		})
   463  		return 0, errMediatorTimeTriedToProgressBackwards
   464  	}
   465  
   466  	b.mediatorTime = newMediatorTime
   467  	for i := 0; i < numWaiters; i++ {
   468  		b.releaseCh <- b.mediatorTime
   469  	}
   470  
   471  	return b.mediatorTime, nil
   472  }
   473  
   474  func newMediatorTimeBarrier(nowFn func() time.Time, iOpts instrument.Options, maxWaiters int) mediatorTimeBarrier {
   475  	return mediatorTimeBarrier{
   476  		mediatorTime:  xtime.ToUnixNano(nowFn()),
   477  		nowFn:         nowFn,
   478  		iOpts:         iOpts,
   479  		numMaxWaiters: maxWaiters,
   480  		releaseCh:     make(chan xtime.UnixNano),
   481  	}
   482  }