github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ruler/storage/instance/manager.go (about)

     1  // This directory was copied and adapted from https://github.com/grafana/agent/tree/main/pkg/metrics.
     2  // We cannot vendor the agent in since the agent vendors loki in, which would cause a cyclic dependency.
     3  // NOTE: many changes have been made to the original code for our use-case.
     4  package instance
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/go-kit/log"
    14  	"github.com/go-kit/log/level"
    15  	"github.com/prometheus/prometheus/storage"
    16  
    17  	util_log "github.com/grafana/loki/pkg/util/log"
    18  )
    19  
    20  var (
    21  	// DefaultBasicManagerConfig is the default config for the BasicManager.
    22  	DefaultBasicManagerConfig = BasicManagerConfig{
    23  		InstanceRestartBackoff: 5 * time.Second,
    24  	}
    25  )
    26  
// Manager represents a set of methods for manipulating running instances at
// runtime. In this file BasicManager is the concrete implementation and
// MockManager is a function-backed implementation for tests.
type Manager interface {
	// GetInstance retrieves a ManagedInstance by name. BasicManager returns
	// an error when no instance with that name is registered.
	GetInstance(name string) (ManagedInstance, error)

	// ListInstances returns all currently managed instances running
	// within the Manager. The key will be the instance name from their config.
	ListInstances() map[string]ManagedInstance

	// ListConfigs returns the config objects associated with a managed
	// instance. The key will be the Name field from Config.
	ListConfigs() map[string]Config

	// ApplyConfig creates a new Config or updates an existing Config if
	// one with Config.Name already exists.
	ApplyConfig(Config) error

	// DeleteConfig deletes a given managed instance based on its Config.Name.
	// BasicManager returns an error when no such config exists.
	DeleteConfig(name string) error

	// Ready indicates if all instances are ready for processing.
	Ready() bool

	// InstanceReady indicates if an instance is ready for processing.
	// BasicManager reports false for a name it does not manage.
	InstanceReady(name string) bool

	// Stop stops the Manager and all managed instances.
	Stop()
}
    57  
// ManagedInstance is implemented by Instance. It is defined as an interface
// for the sake of testing from Manager implementations.
type ManagedInstance interface {
	// Ready reports whether the instance is ready for processing.
	// managedProcess.Stop only calls Stop on instances that report Ready.
	Ready() bool
	// Run runs the instance. BasicManager calls it in a loop: a nil or
	// context.Canceled error ends the loop, any other error leads to a
	// restart after a backoff.
	Run(ctx context.Context) error
	// Update applies a new Config to the instance. BasicManager.ApplyConfig
	// restarts the instance when the returned error matches ErrInvalidUpdate
	// and gives up on any other non-nil error.
	Update(c Config) error
	// StorageDirectory returns a directory path; presumably where the
	// instance keeps its data — confirm against the Instance implementation.
	StorageDirectory() string
	// Appender returns a Prometheus storage.Appender for the instance.
	Appender(ctx context.Context) storage.Appender
	// Stop stops the instance and reports any error encountered doing so.
	Stop() error
	// Tenant returns the tenant identifier; it is logged under the "user"
	// key when stopping the instance fails.
	Tenant() string
}
    69  
// BasicManagerConfig controls the operations of a BasicManager.
type BasicManagerConfig struct {
	// InstanceRestartBackoff is how long the manager waits before running an
	// instance again after it stopped with an error.
	InstanceRestartBackoff time.Duration
}
    74  
// BasicManager implements the Manager interface. It directly launches
// instances and performs no extra processing.
//
// Other implementations of Manager usually wrap a BasicManager.
type BasicManager struct {
	// cfgMut guards cfg; logger and metrics are set once by NewBasicManager.
	cfgMut  sync.Mutex
	cfg     BasicManagerConfig
	logger  log.Logger
	metrics *Metrics

	// mut guards processes, which maps Config.Name to the process running
	// the instance for that config.
	//
	// Take care when locking mut: if you hold onto a lock of mut while calling
	// Stop on a process, you will deadlock.
	//
	// NOTE(review): ApplyConfig does call Stop while holding mut. With the
	// code in this file that works because the goroutine started by
	// spawnProcess closes done before it locks mut — confirm before
	// reordering either side.
	mut       sync.Mutex
	processes map[string]*managedProcess

	// launch builds the ManagedInstance for a Config; it is invoked by
	// spawnProcess.
	launch Factory
}
    92  
// managedProcess represents a goroutine running a ManagedInstance. cancel
// requests that the goroutine should shutdown. done will be closed after the
// goroutine exits.
type managedProcess struct {
	// cfg is the Config the instance was launched with, replaced by
	// ApplyConfig after a successful dynamic update.
	cfg    Config
	inst   ManagedInstance
	cancel context.CancelFunc
	done   chan bool
}
   102  
   103  func (p managedProcess) Stop() {
   104  	if p.inst.Ready() { // Only stop initialized instances to avoid panic
   105  		if err := p.inst.Stop(); err != nil {
   106  			level.Error(util_log.Logger).Log("msg", "error while stopping instance", "user", p.inst.Tenant(), "err", err)
   107  		}
   108  	}
   109  
   110  	p.cancel()
   111  	<-p.done
   112  }
   113  
// Factory should return an unstarted instance given some config. BasicManager
// calls it once per spawned process and starts the returned instance itself
// by calling Run; a non-nil error aborts the spawn.
type Factory func(c Config) (ManagedInstance, error)
   116  
   117  // NewBasicManager creates a new BasicManager. The launch function will be
   118  // invoked any time a new Config is applied.
   119  //
   120  // The lifecycle of any ManagedInstance returned by the launch function will
   121  // be handled by the BasicManager. Instances will be automatically restarted
   122  // if stopped, updated if the config changes, or removed when the Config is
   123  // deleted.
   124  func NewBasicManager(cfg BasicManagerConfig, metrics *Metrics, logger log.Logger, launch Factory) *BasicManager {
   125  	return &BasicManager{
   126  		cfg:       cfg,
   127  		metrics:   metrics,
   128  		logger:    logger,
   129  		processes: make(map[string]*managedProcess),
   130  		launch:    launch,
   131  	}
   132  }
   133  
   134  // UpdateManagerConfig updates the BasicManagerConfig.
   135  func (m *BasicManager) UpdateManagerConfig(c BasicManagerConfig) {
   136  	m.cfgMut.Lock()
   137  	defer m.cfgMut.Unlock()
   138  	m.cfg = c
   139  }
   140  
   141  // GetInstance returns the given instance by name.
   142  func (m *BasicManager) GetInstance(name string) (ManagedInstance, error) {
   143  	m.mut.Lock()
   144  	defer m.mut.Unlock()
   145  
   146  	process, ok := m.processes[name]
   147  	if !ok {
   148  		return nil, fmt.Errorf("instance %s does not exist", name)
   149  	}
   150  	return process.inst, nil
   151  }
   152  
   153  // ListInstances returns the current active instances managed by BasicManager.
   154  func (m *BasicManager) ListInstances() map[string]ManagedInstance {
   155  	m.mut.Lock()
   156  	defer m.mut.Unlock()
   157  
   158  	res := make(map[string]ManagedInstance, len(m.processes))
   159  	for name, process := range m.processes {
   160  		res[name] = process.inst
   161  	}
   162  	return res
   163  }
   164  
   165  // ListConfigs lists the current active configs managed by BasicManager.
   166  func (m *BasicManager) ListConfigs() map[string]Config {
   167  	m.mut.Lock()
   168  	defer m.mut.Unlock()
   169  
   170  	res := make(map[string]Config, len(m.processes))
   171  	for name, process := range m.processes {
   172  		res[name] = process.cfg
   173  	}
   174  	return res
   175  }
   176  
   177  // ApplyConfig takes a Config and either starts a new managed instance or
   178  // updates an existing managed instance. The value for Name in c is used to
   179  // uniquely identify the Config and determine whether the Config has an
   180  // existing associated managed instance.
   181  func (m *BasicManager) ApplyConfig(c Config) error {
   182  	m.mut.Lock()
   183  	defer m.mut.Unlock()
   184  
   185  	// If the config already exists, we need to update it.
   186  	proc, ok := m.processes[c.Name]
   187  	if ok {
   188  		err := proc.inst.Update(c)
   189  
   190  		// If the instance could not be dynamically updated, we need to force the
   191  		// update by restarting it. If it failed for another reason, something
   192  		// serious went wrong and we'll completely give up without stopping the
   193  		// existing job.
   194  		if errors.Is(err, ErrInvalidUpdate{}) {
   195  			level.Info(m.logger).Log("msg", "could not dynamically update instance, will manually restart", "instance", c.Name, "reason", err)
   196  
   197  			// NOTE: we don't return here; we fall through to spawn the new instance.
   198  			proc.Stop()
   199  		} else if err != nil {
   200  			return fmt.Errorf("failed to update instance %s: %w", c.Name, err)
   201  		} else {
   202  			level.Info(m.logger).Log("msg", "dynamically updated instance", "instance", c.Name)
   203  
   204  			proc.cfg = c
   205  			return nil
   206  		}
   207  	}
   208  
   209  	// Spawn a new process for the new config.
   210  	err := m.spawnProcess(c)
   211  	if err != nil {
   212  		return err
   213  	}
   214  
   215  	m.metrics.RunningInstances.Inc()
   216  	return nil
   217  }
   218  
   219  func (m *BasicManager) spawnProcess(c Config) error {
   220  	inst, err := m.launch(c)
   221  	if err != nil {
   222  		return err
   223  	}
   224  
   225  	ctx, cancel := context.WithCancel(context.Background())
   226  	done := make(chan bool)
   227  
   228  	proc := &managedProcess{
   229  		cancel: cancel,
   230  		done:   done,
   231  		cfg:    c,
   232  		inst:   inst,
   233  	}
   234  	m.processes[c.Name] = proc
   235  
   236  	go func() {
   237  		m.runProcess(ctx, c.Name, inst)
   238  		close(done)
   239  
   240  		// Now that the process has stopped, we can remove it from our managed
   241  		// list.
   242  		//
   243  		// However, it's possible that a new Config may have been applied and
   244  		// overwrote the initial value in our map. We only want to delete the
   245  		// process from the map if it hasn't changed from what we initially
   246  		// set it to.
   247  		//
   248  		// We only use the instance for comparing (which will never change) because
   249  		// the instance may have dynamically been given a new config since this
   250  		// goroutine started.
   251  		m.mut.Lock()
   252  		if storedProc, exist := m.processes[c.Name]; exist && storedProc.inst == inst {
   253  			delete(m.processes, c.Name)
   254  		}
   255  		m.mut.Unlock()
   256  
   257  		m.metrics.RunningInstances.Dec()
   258  	}()
   259  
   260  	return nil
   261  }
   262  
   263  // runProcess runs and instance and keeps it alive until it is explicitly stopped
   264  // by cancelling the context.
   265  func (m *BasicManager) runProcess(ctx context.Context, name string, inst ManagedInstance) {
   266  	for {
   267  		err := inst.Run(ctx)
   268  		if err != nil && err != context.Canceled {
   269  			backoff := m.instanceRestartBackoff()
   270  
   271  			m.metrics.AbnormalExits.WithLabelValues(name).Inc()
   272  			level.Error(m.logger).Log("msg", "instance stopped abnormally, restarting after backoff period", "err", err, "backoff", backoff, "instance", name)
   273  			time.Sleep(backoff)
   274  		} else {
   275  			level.Info(m.logger).Log("msg", "stopped instance", "instance", name)
   276  			break
   277  		}
   278  	}
   279  }
   280  
   281  func (m *BasicManager) instanceRestartBackoff() time.Duration {
   282  	m.cfgMut.Lock()
   283  	defer m.cfgMut.Unlock()
   284  	return m.cfg.InstanceRestartBackoff
   285  }
   286  
   287  // DeleteConfig removes a managed instance by its config name. Returns an error
   288  // if there is no such managed instance with the given name.
   289  func (m *BasicManager) DeleteConfig(name string) error {
   290  	m.mut.Lock()
   291  	proc, ok := m.processes[name]
   292  	if !ok {
   293  		m.mut.Unlock()
   294  		return errors.New("config does not exist")
   295  	}
   296  	m.mut.Unlock()
   297  
   298  	// spawnProcess is responsible for removing the process from the map after it
   299  	// stops so we don't need to delete anything from m.processes here.
   300  	proc.Stop()
   301  	return nil
   302  }
   303  
   304  // Ready indicates if all instances are ready for processing.
   305  func (m *BasicManager) Ready() bool {
   306  	m.mut.Lock()
   307  	defer m.mut.Unlock()
   308  
   309  	for _, process := range m.processes {
   310  		if process.inst == nil {
   311  			return false
   312  		}
   313  
   314  		if !process.inst.Ready() {
   315  			return false
   316  		}
   317  	}
   318  
   319  	return true
   320  }
   321  
   322  // InstanceReady indicates if an instance is ready for processing.
   323  func (m *BasicManager) InstanceReady(name string) bool {
   324  	inst, err := m.GetInstance(name)
   325  	if err != nil {
   326  		return false
   327  	}
   328  
   329  	return inst.Ready()
   330  }
   331  
   332  // Stop stops the BasicManager and stops all active processes for configs.
   333  func (m *BasicManager) Stop() {
   334  	var wg sync.WaitGroup
   335  
   336  	// We don't need to change m.processes here; processes remove themselves
   337  	// from the map (in spawnProcess).
   338  	m.mut.Lock()
   339  	wg.Add(len(m.processes))
   340  	for _, proc := range m.processes {
   341  		go func(proc *managedProcess) {
   342  			proc.Stop()
   343  			wg.Done()
   344  		}(proc)
   345  	}
   346  	m.mut.Unlock()
   347  
   348  	wg.Wait()
   349  }
   350  
// MockManager exposes methods of the Manager interface as struct fields.
// Useful for tests.
//
// Each field backs the Manager method of the same name; calling a method
// whose field is nil panics. Ready and InstanceReady have no backing field
// and always return true.
type MockManager struct {
	GetInstanceFunc   func(name string) (ManagedInstance, error)
	ListInstancesFunc func() map[string]ManagedInstance
	ListConfigsFunc   func() map[string]Config
	ApplyConfigFunc   func(Config) error
	DeleteConfigFunc  func(name string) error
	StopFunc          func()
}
   361  
// Ready implements Manager. The mock always reports ready.
func (m MockManager) Ready() bool {
	return true
}
   365  
// InstanceReady implements Manager. The mock reports every instance as
// ready, regardless of name.
func (m MockManager) InstanceReady(name string) bool {
	return true
}
   369  
   370  // GetInstance implements Manager.
   371  func (m MockManager) GetInstance(name string) (ManagedInstance, error) {
   372  	if m.GetInstanceFunc != nil {
   373  		return m.GetInstanceFunc(name)
   374  	}
   375  	panic("GetInstanceFunc not implemented")
   376  }
   377  
   378  // ListInstances implements Manager.
   379  func (m MockManager) ListInstances() map[string]ManagedInstance {
   380  	if m.ListInstancesFunc != nil {
   381  		return m.ListInstancesFunc()
   382  	}
   383  	panic("ListInstancesFunc not implemented")
   384  }
   385  
   386  // ListConfigs implements Manager.
   387  func (m MockManager) ListConfigs() map[string]Config {
   388  	if m.ListConfigsFunc != nil {
   389  		return m.ListConfigsFunc()
   390  	}
   391  	panic("ListConfigsFunc not implemented")
   392  }
   393  
   394  // ApplyConfig implements Manager.
   395  func (m MockManager) ApplyConfig(c Config) error {
   396  	if m.ApplyConfigFunc != nil {
   397  		return m.ApplyConfigFunc(c)
   398  	}
   399  	panic("ApplyConfigFunc not implemented")
   400  }
   401  
   402  // DeleteConfig implements Manager.
   403  func (m MockManager) DeleteConfig(name string) error {
   404  	if m.DeleteConfigFunc != nil {
   405  		return m.DeleteConfigFunc(name)
   406  	}
   407  	panic("DeleteConfigFunc not implemented")
   408  }
   409  
   410  // Stop implements Manager.
   411  func (m MockManager) Stop() {
   412  	if m.StopFunc != nil {
   413  		m.StopFunc()
   414  		return
   415  	}
   416  	panic("StopFunc not implemented")
   417  }