github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/providers/governor/streams/governor.go (about)

     1  // Copyright 2020-2022 The NATS Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  // http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  //
    15  // Copyright (c) 2022, R.I. Pienaar and the Choria Project contributors
    16  //
    17  // SPDX-License-Identifier: Apache-2.0
    18  
    19  // Package governor controls the concurrency of a network wide process
    20  //
    21  // Using this one can, for example, create CRON jobs that can trigger
    22  // 100s or 1000s concurrently but where most will wait for a set limit
    23  // to complete.  In effect limiting the overall concurrency of these
    24  // executions.
    25  //
    26  // To do this a Stream is created that has a maximum message limit and
    27  // that will reject new entries when full.
    28  //
    29  // Workers will try to place themselves in the Stream, they do their work
    30  // if they succeed and remove themselves from the Stream once they are done.
    31  //
    32  // As a fail safe the stack will evict entries after a set time based on
    33  // Stream max age.
    34  //
    35  // A manager is included to create, observe and edit these streams and the
    36  // choria CLI has a new command built on this library: choria governor
    37  package governor
    38  
    39  import (
    40  	"context"
    41  	"errors"
    42  	"fmt"
    43  	"sort"
    44  	"strings"
    45  	"sync"
    46  	"time"
    47  
    48  	"github.com/choria-io/go-choria/backoff"
    49  	iu "github.com/choria-io/go-choria/internal/util"
    50  	"github.com/google/go-cmp/cmp"
    51  	"github.com/nats-io/jsm.go"
    52  	"github.com/nats-io/nats.go"
    53  )
    54  
    55  // DefaultInterval is the default sleep between tries to obtain a slot,
        // New() and NewManager() use it unless overridden with WithInterval()
    56  const DefaultInterval = 250 * time.Millisecond
    57  
    58  // Finisher signals that work is completed releasing the slot on the stack.
        //
        // It is returned by Governor.Start() once a slot was obtained, a non nil
        // error means the entry could not be removed from the Governor stream.
    59  type Finisher func() error
    60  
    61  // Governor controls concurrency of distributed processes using a named governor stream
    62  type Governor interface {
    63  	// Start attempts to get a spot in the Governor, gives up on context, call Finisher to signal end of work
        	// name is published as the body of the entry and seq is the stream sequence assigned to it
    64  	Start(ctx context.Context, name string) (fin Finisher, seq uint64, err error)
    65  	// Connection is the NATS connection used to communicate
    66  	Connection() *nats.Conn
    67  }
    68  
    69  // Logger is a custom logger, configure one using WithLogger(). When none is
        // configured nothing is logged.
    70  type Logger interface {
    71  	Debugf(format string, a ...any)
    72  	Infof(format string, a ...any)
    73  	Warnf(format string, a ...any)
    74  	Errorf(format string, a ...any)
    75  }
    76  
    77  // Manager controls concurrent executions of work distributed throughout a nats network by using
    78  // a stream as a capped stack where workers reserve a slot and later release the slot
    79  type Manager interface {
    80  	// Limit is the configured maximum entries in the Governor
    81  	Limit() int64
    82  	// MaxAge is the time after which entries will be evicted
    83  	MaxAge() time.Duration
    84  	// Name is the Governor name
    85  	Name() string
    86  	// Replicas is how many data replicas are kept of the data
    87  	Replicas() int
    88  	// SetLimit configures the maximum entries in the Governor and takes immediate effect
    89  	SetLimit(uint64) error
    90  	// SetMaxAge configures the maximum age of entries, takes immediate effect
    91  	SetMaxAge(time.Duration) error
    92  	// SetSubject configures the underlying NATS subject the Governor listens on for entry campaigns
    93  	SetSubject(subj string) error
    94  	// Stream is the underlying JetStream stream
    95  	Stream() *jsm.Stream
    96  	// Subject is the subject the Governor listens on for entry campaigns
    97  	Subject() string
    98  	// Reset resets the governor removing all current entries from it
    99  	Reset() error
   100  	// Active is the number of active entries in the Governor
   101  	Active() (uint64, error)
   102  	// Evict removes an entry from the Governor given its unique id, returns the name that was on that entry
   103  	Evict(entry uint64) (name string, err error)
   104  	// LastActive returns the time the last entry was added to the Governor, can be zero time when no entries were added
   105  	LastActive() (time.Time, error)
   106  	// Connection is the NATS connection used to communicate
   107  	Connection() *nats.Conn
   108  }
   109  
        // errRetry is returned by a single campaign attempt when no slot could be
        // obtained and trying again later is safe
   110  var errRetry = errors.New("retryable error")
   111  
        // jsGMgr is the JetStream backed implementation of both Governor and Manager
   112  type jsGMgr struct {
   113  	name     string
        	// stream is the name of the stream backing the governor, see streamName()
   114  	stream   string
   115  	maxAge   time.Duration
   116  	limit    uint64
   117  	mgr      *jsm.Manager
   118  	nc       *nats.Conn
        	// str is the loaded stream, set by loadOrCreate() which only NewManager() calls
   119  	str      *jsm.Stream
        	// subj is the subject campaigns are published to, see streamSubject()
   120  	subj     string
   121  	replicas int
        	// running is true while a slot obtained by Start() has not been released
   122  	running  bool
        	// noCreate is set by NewManager() when the limit is 0, the stream must then already exist
   123  	noCreate bool
        	// noLeave is set by WithoutLeavingOnCompletion()
   124  	noLeave  bool
   125  
   126  	logger Logger
        	// cint is the interval between campaign tries
   127  	cint   time.Duration
        	// bo is an optional backoff policy used to adjust the interval between tries
   128  	bo     *backoff.Policy
   129  
   130  	mu sync.Mutex
   131  }
   132  
   133  func NewManager(name string, limit uint64, maxAge time.Duration, replicas uint, nc *nats.Conn, update bool, opts ...Option) (Manager, error) {
   134  	mgr, err := jsm.New(nc)
   135  	if err != nil {
   136  		return nil, err
   137  	}
   138  
   139  	gov := &jsGMgr{
   140  		name:     name,
   141  		maxAge:   maxAge,
   142  		limit:    limit,
   143  		mgr:      mgr,
   144  		nc:       nc,
   145  		replicas: int(replicas),
   146  		cint:     DefaultInterval,
   147  	}
   148  
   149  	for _, opt := range opts {
   150  		opt(gov)
   151  	}
   152  
   153  	if limit == 0 {
   154  		gov.noCreate = true
   155  	}
   156  
   157  	gov.stream = gov.streamName()
   158  	gov.subj = gov.streamSubject()
   159  
   160  	err = gov.loadOrCreate(update)
   161  	if err != nil {
   162  		return nil, err
   163  	}
   164  
   165  	return gov, nil
   166  }
   167  
        // Option configures optional behaviors of a governor, pass them to New() or NewManager()
   168  type Option func(mgr *jsGMgr)
   169  
   170  // WithLogger configures the logger to use, no logging when none is given
   171  func WithLogger(log Logger) Option {
   172  	return func(mgr *jsGMgr) {
   173  		mgr.logger = log
   174  	}
   175  }
   176  
   177  // WithBackoff sets a backoff policy for gradually reducing try interval
   178  func WithBackoff(p backoff.Policy) Option {
   179  	return func(mgr *jsGMgr) {
   180  		mgr.bo = &p
   181  	}
   182  }
   183  
   184  // WithInterval sets the interval between tries
   185  func WithInterval(i time.Duration) Option {
   186  	return func(mgr *jsGMgr) {
   187  		mgr.cint = i
   188  	}
   189  }
   190  
   191  // WithSubject configures a specific subject for the governor to act on
   192  func WithSubject(s string) Option {
   193  	return func(mgr *jsGMgr) {
   194  		mgr.subj = s
   195  	}
   196  }
   197  
   198  // WithoutLeavingOnCompletion prevents removal from the governor after execution
   199  func WithoutLeavingOnCompletion() Option {
   200  	return func(mgr *jsGMgr) {
   201  		mgr.noLeave = true
   202  	}
   203  }
   204  
   205  func New(name string, nc *nats.Conn, opts ...Option) Governor {
   206  	mgr, err := jsm.New(nc)
   207  	if err != nil {
   208  		return nil
   209  	}
   210  
   211  	gov := &jsGMgr{
   212  		name: name,
   213  		mgr:  mgr,
   214  		nc:   nc,
   215  		cint: DefaultInterval,
   216  	}
   217  
   218  	for _, opt := range opts {
   219  		opt(gov)
   220  	}
   221  
   222  	gov.stream = gov.streamName()
   223  	gov.subj = gov.streamSubject()
   224  
   225  	return gov
   226  }
   227  
   228  func (g *jsGMgr) streamSubject() string {
   229  	if g.subj != "" {
   230  		return g.subj
   231  	}
   232  
   233  	return fmt.Sprintf("$GOVERNOR.campaign.%s", g.name)
   234  }
   235  
   236  func (g *jsGMgr) streamName() string {
   237  	if g.stream != "" {
   238  		return g.stream
   239  	}
   240  
   241  	return StreamName(g.name)
   242  }
   243  
   244  func StreamName(governor string) string {
   245  	return fmt.Sprintf("GOVERNOR_%s", governor)
   246  }
   247  
   248  func List(nc *nats.Conn, collective string) ([]string, error) {
   249  	mgr, err := jsm.New(nc)
   250  	if err != nil {
   251  		return nil, err
   252  	}
   253  
   254  	known, err := mgr.StreamNames(&jsm.StreamNamesFilter{
   255  		Subject: iu.GovernorSubject("*", collective),
   256  	})
   257  	if err != nil {
   258  		return nil, err
   259  	}
   260  
   261  	for i := 0; i < len(known); i++ {
   262  		known[i] = strings.TrimPrefix(known[i], "GOVERNOR_")
   263  	}
   264  
   265  	sort.Strings(known)
   266  
   267  	return known, nil
   268  }
   269  func (g *jsGMgr) Start(ctx context.Context, name string) (Finisher, uint64, error) {
   270  	g.mu.Lock()
   271  	defer g.mu.Unlock()
   272  
   273  	if g.running {
   274  		return nil, 0, fmt.Errorf("already running")
   275  	}
   276  
   277  	g.running = true
   278  	seq := uint64(0)
   279  	tries := 0
   280  
   281  	try := func() error {
   282  		ctx, cancel := context.WithTimeout(ctx, time.Second)
   283  		defer cancel()
   284  
   285  		g.Debugf("Publishing to %s", g.subj)
   286  		m, err := g.nc.RequestWithContext(ctx, g.subj, []byte(name))
   287  		if err != nil {
   288  			g.Errorf("Publishing to governor %s via %s failed: %s", g.name, g.subj, err)
   289  			return err
   290  		}
   291  
   292  		res, err := jsm.ParsePubAck(m)
   293  		if err != nil {
   294  			// jetstream sent us a puback error, this is retryable in the case of governors
   295  			if jsm.IsNatsError(err, 10077) {
   296  				g.Debugf("Could not obtain a slot: %v", err)
   297  				return errRetry
   298  			}
   299  
   300  			g.Errorf("Invalid pub ack: %s", err)
   301  			return err
   302  		}
   303  
   304  		seq = res.Sequence
   305  
   306  		g.Infof("Got a slot on %s with sequence %d", g.name, seq)
   307  
   308  		return nil
   309  	}
   310  
   311  	closer := func() error {
   312  		if seq == 0 {
   313  			return nil
   314  		}
   315  
   316  		g.mu.Lock()
   317  		defer g.mu.Unlock()
   318  		if !g.running {
   319  			return nil
   320  		}
   321  
   322  		g.running = false
   323  
   324  		if g.noLeave {
   325  			g.Infof("Not evicting self from %s based on configuration directive", g.name)
   326  			return nil
   327  		}
   328  
   329  		g.Infof("Removing self from %s sequence %d", g.name, seq)
   330  		err := g.mgr.DeleteStreamMessage(g.stream, seq, true)
   331  		if err != nil {
   332  			g.Errorf("Could not remove self from %s: %s", g.name, err)
   333  			return fmt.Errorf("could not remove seq %d: %s", seq, err)
   334  		}
   335  
   336  		return nil
   337  	}
   338  
   339  	g.Debugf("Starting to campaign every %v for a slot on %s using %s", g.cint, g.name, g.subj)
   340  
   341  	// we try to enter the governor and if it fails in a way thats safe to retry
   342  	// we will do so else we exit.
   343  	//
   344  	// We need to handle thins like context timeout, bucket not found etc specifically
   345  	// as hard errors since, especially context timeout, it does not mean the message did
   346  	// not enter the governor, it just means something went wrong, perhaps in getting the
   347  	// ok reply.  In the case where the message did reach the governor but the reply could
   348  	// not be processed we will retry again and again potentially filling the governor.
   349  	err := try()
   350  	if err == nil {
   351  		return closer, seq, nil
   352  	} else if err != errRetry {
   353  		return nil, 0, err
   354  	}
   355  
   356  	ticker := time.NewTicker(g.cint)
   357  
   358  	for {
   359  		select {
   360  		case <-ticker.C:
   361  			tries++
   362  
   363  			err = try()
   364  			if err == nil {
   365  				return closer, seq, nil
   366  			} else if err != errRetry {
   367  				return nil, 0, err
   368  			}
   369  
   370  			if g.bo != nil {
   371  				delay := g.bo.Duration(tries)
   372  				g.Debugf("Retrying after %v", delay)
   373  				ticker.Reset(delay)
   374  			}
   375  
   376  		case <-ctx.Done():
   377  			g.Infof("Stopping campaigns against %s due to context timeout after %d tries", g.name, tries)
   378  			ticker.Stop()
   379  			return nil, 0, ctx.Err()
   380  		}
   381  	}
   382  }
   383  
        // Reset purges the underlying stream, removing all current entries from the governor
   384  func (g *jsGMgr) Reset() error {
   385  	return g.str.Purge()
   386  }
   387  func (g *jsGMgr) Stream() *jsm.Stream    { return g.str }
   388  func (g *jsGMgr) Limit() int64           { return g.str.MaxMsgs() }
   389  func (g *jsGMgr) MaxAge() time.Duration  { return g.str.MaxAge() }
   390  func (g *jsGMgr) Subject() string        { return g.str.Subjects()[0] }
   391  func (g *jsGMgr) Replicas() int          { return g.str.Replicas() }
   392  func (g *jsGMgr) Connection() *nats.Conn { return g.nc }
   393  func (g *jsGMgr) Name() string           { return g.name }
   394  func (g *jsGMgr) Evict(entry uint64) (string, error) {
   395  	msg, err := g.str.ReadMessage(entry)
   396  	if err != nil {
   397  		return "", err
   398  	}
   399  
   400  	return string(msg.Data), g.str.DeleteMessage(entry)
   401  }
   402  
   403  func (g *jsGMgr) Active() (uint64, error) {
   404  	nfo, err := g.str.Information()
   405  	if err != nil {
   406  		return 0, err
   407  	}
   408  
   409  	return nfo.State.Msgs, nil
   410  }
   411  
   412  func (g *jsGMgr) LastActive() (time.Time, error) {
   413  	nfo, err := g.str.Information()
   414  	if err != nil {
   415  		return time.Time{}, err
   416  	}
   417  
   418  	return nfo.State.LastTime, nil
   419  }
   420  
   421  func (g *jsGMgr) SetSubject(subj string) error {
   422  	g.mu.Lock()
   423  	g.subj = subj
   424  	g.mu.Unlock()
   425  
   426  	return g.updateConfig()
   427  }
   428  
   429  func (g *jsGMgr) SetLimit(limit uint64) error {
   430  	g.mu.Lock()
   431  	g.limit = limit
   432  	g.mu.Unlock()
   433  
   434  	return g.updateConfig()
   435  }
   436  
   437  func (g *jsGMgr) SetMaxAge(age time.Duration) error {
   438  	g.mu.Lock()
   439  	g.maxAge = age
   440  	g.mu.Unlock()
   441  
   442  	return g.updateConfig()
   443  }
   444  
   445  func (g *jsGMgr) updateConfig() error {
   446  	g.mu.Lock()
   447  	defer g.mu.Unlock()
   448  
   449  	if g.str.MaxAge() != g.maxAge || g.str.MaxMsgs() != int64(g.limit) || !cmp.Equal([]string{g.streamSubject()}, g.str.Subjects()) || g.str.Replicas() != g.replicas {
   450  		err := g.str.UpdateConfiguration(g.str.Configuration(), g.streamOpts()...)
   451  		if err != nil {
   452  			return fmt.Errorf("stream update failed: %s", err)
   453  		}
   454  	}
   455  
   456  	return nil
   457  }
   458  
   459  func (g *jsGMgr) streamOpts() []jsm.StreamOption {
   460  	opts := []jsm.StreamOption{
   461  		jsm.StreamDescription(fmt.Sprintf("Concurrency Governor %s", g.name)),
   462  		jsm.MaxAge(g.maxAge),
   463  		jsm.MaxMessages(int64(g.limit)),
   464  		jsm.Subjects(g.subj),
   465  		jsm.Replicas(g.replicas),
   466  		jsm.LimitsRetention(),
   467  		jsm.FileStorage(),
   468  		jsm.DiscardNew(),
   469  		jsm.DuplicateWindow(0),
   470  	}
   471  
   472  	if g.replicas > 0 {
   473  		opts = append(opts, jsm.Replicas(g.replicas))
   474  	}
   475  
   476  	return opts
   477  }
   478  
   479  func (g *jsGMgr) loadOrCreate(update bool) error {
   480  	opts := g.streamOpts()
   481  
   482  	if g.noCreate {
   483  		has, err := g.mgr.IsKnownStream(g.stream)
   484  		if err != nil {
   485  			return err
   486  		}
   487  
   488  		if !has {
   489  			return fmt.Errorf("unknown governor")
   490  		}
   491  	}
   492  
   493  	str, err := g.mgr.LoadOrNewStream(g.stream, opts...)
   494  	if err != nil {
   495  		return err
   496  	}
   497  
   498  	g.str = str
   499  
   500  	if update {
   501  		g.updateConfig()
   502  	}
   503  
   504  	return nil
   505  }
   506  
   507  func (g *jsGMgr) Debugf(format string, a ...any) {
   508  	if g.logger != nil {
   509  		g.logger.Debugf(format, a...)
   510  	}
   511  }
   512  
   513  func (g *jsGMgr) Infof(format string, a ...any) {
   514  	if g.logger != nil {
   515  		g.logger.Infof(format, a...)
   516  	}
   517  }
   518  
   519  func (g *jsGMgr) Warnf(format string, a ...any) {
   520  	if g.logger != nil {
   521  		g.logger.Warnf(format, a...)
   522  	}
   523  }
   524  
   525  func (g *jsGMgr) Errorf(format string, a ...any) {
   526  	if g.logger != nil {
   527  		g.logger.Errorf(format, a...)
   528  	}
   529  }