github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/providers/governor/streams/governor.go (about) 1 // Copyright 2020-2022 The NATS Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // Copyright (c) 2022, R.I. Pienaar and the Choria Project contributors 16 // 17 // SPDX-License-Identifier: Apache-2.0 18 19 // Package governor controls the concurrency of a network wide process 20 // 21 // Using this one can, for example, create CRON jobs that can trigger 22 // 100s or 1000s concurrently but where most will wait for a set limit 23 // to complete. In effect limiting the overall concurrency of these 24 // execution. 25 // 26 // To do this a Stream is created that has a maximum message limit and 27 // that will reject new entries when full. 28 // 29 // Workers will try to place themselves in the Stream, they do their work 30 // if they succeed and remove themselves from the Stream once they are done. 31 // 32 // As a fail safe the stack will evict entries after a set time based on 33 // Stream max age. 34 // 35 // A manager is included to create, observe and edit these streams and the 36 // choria CLI has a new command build on this library: choria governor 37 package governor 38 39 import ( 40 "context" 41 "errors" 42 "fmt" 43 "sort" 44 "strings" 45 "sync" 46 "time" 47 48 "github.com/choria-io/go-choria/backoff" 49 iu "github.com/choria-io/go-choria/internal/util" 50 "github.com/google/go-cmp/cmp" 51 "github.com/nats-io/jsm.go" 52 "github.com/nats-io/nats.go" 53 ) 54 55 // DefaultInterval default sleep between tries, set with WithInterval() 56 const DefaultInterval = 250 * time.Millisecond 57 58 // Finisher signals that work is completed releasing the slot on the stack 59 type Finisher func() error 60 61 // Governor controls concurrency of distributed processes using a named governor stream 62 type Governor interface { 63 // Start attempts to get a spot in the Governor, gives up on context, call Finisher to signal end of work 64 Start(ctx context.Context, name string) (fin Finisher, seq uint64, err error) 65 // Connection is the NATS connection used to communicate 66 Connection() *nats.Conn 67 } 68 69 // Logger is a custom logger 70 type Logger interface { 71 Debugf(format string, a ...any) 72 Infof(format string, a ...any) 73 Warnf(format string, a ...any) 74 Errorf(format string, a ...any) 75 } 76 77 // Manager controls concurrent executions of work distributed throughout a nats network by using 78 // a stream as a capped stack where workers reserve a slot and later release the slot 79 type Manager interface { 80 // Limit is the configured maximum entries in the Governor 81 Limit() int64 82 // MaxAge is the time after which entries will be evicted 83 MaxAge() time.Duration 84 // Name is the Governor name 85 Name() string 86 // Replicas is how many data replicas are kept of the data 87 Replicas() int 88 // SetLimit configures the maximum entries in the Governor and takes immediate effect 89 SetLimit(uint64) error 90 // SetMaxAge configures the maximum age of entries, takes immediate effect 91 SetMaxAge(time.Duration) error 92 // SetSubject configures the underlying NATS subject the Governor listens on for entry campaigns 93 SetSubject(subj string) error 94 // Stream is the underlying JetStream stream 95 Stream() *jsm.Stream 96 // Subject is the subject the Governor listens on for entry campaigns 97 Subject() string 98 // Reset resets the governor removing all current entries from it 99 Reset() error 100 // Active is the number of active entries in the Governor 101 Active() (uint64, error) 102 // Evict removes an entry from the Governor given its unique id, returns the name that was on that entry 103 Evict(entry uint64) (name string, err error) 104 // LastActive returns the the since entry was added to the Governor, can be zero time when no entries were added 105 LastActive() (time.Time, error) 106 // Connection is the NATS connection used to communicate 107 Connection() *nats.Conn 108 } 109 110 var errRetry = errors.New("retryable error") 111 112 type jsGMgr struct { 113 name string 114 stream string 115 maxAge time.Duration 116 limit uint64 117 mgr *jsm.Manager 118 nc *nats.Conn 119 str *jsm.Stream 120 subj string 121 replicas int 122 running bool 123 noCreate bool 124 noLeave bool 125 126 logger Logger 127 cint time.Duration 128 bo *backoff.Policy 129 130 mu sync.Mutex 131 } 132 133 func NewManager(name string, limit uint64, maxAge time.Duration, replicas uint, nc *nats.Conn, update bool, opts ...Option) (Manager, error) { 134 mgr, err := jsm.New(nc) 135 if err != nil { 136 return nil, err 137 } 138 139 gov := &jsGMgr{ 140 name: name, 141 maxAge: maxAge, 142 limit: limit, 143 mgr: mgr, 144 nc: nc, 145 replicas: int(replicas), 146 cint: DefaultInterval, 147 } 148 149 for _, opt := range opts { 150 opt(gov) 151 } 152 153 if limit == 0 { 154 gov.noCreate = true 155 } 156 157 gov.stream = gov.streamName() 158 gov.subj = gov.streamSubject() 159 160 err = gov.loadOrCreate(update) 161 if err != nil { 162 return nil, err 163 } 164 165 return gov, nil 166 } 167 168 type Option func(mgr *jsGMgr) 169 170 // WithLogger configures the logger to use, no logging when none is given 171 func WithLogger(log Logger) Option { 172 return func(mgr *jsGMgr) { 173 mgr.logger = log 174 } 175 } 176 177 // WithBackoff sets a backoff policy for gradually reducing try interval 178 func WithBackoff(p backoff.Policy) Option { 179 return func(mgr *jsGMgr) { 180 mgr.bo = &p 181 } 182 } 183 184 // WithInterval sets the interval between tries 185 func WithInterval(i time.Duration) Option { 186 return func(mgr *jsGMgr) { 187 mgr.cint = i 188 } 189 } 190 191 // WithSubject configures a specific subject for the governor to act on 192 func WithSubject(s string) Option { 193 return func(mgr *jsGMgr) { 194 mgr.subj = s 195 } 196 } 197 198 // WithoutLeavingOnCompletion prevents removal from the governor after execution 199 func WithoutLeavingOnCompletion() Option { 200 return func(mgr *jsGMgr) { 201 mgr.noLeave = true 202 } 203 } 204 205 func New(name string, nc *nats.Conn, opts ...Option) Governor { 206 mgr, err := jsm.New(nc) 207 if err != nil { 208 return nil 209 } 210 211 gov := &jsGMgr{ 212 name: name, 213 mgr: mgr, 214 nc: nc, 215 cint: DefaultInterval, 216 } 217 218 for _, opt := range opts { 219 opt(gov) 220 } 221 222 gov.stream = gov.streamName() 223 gov.subj = gov.streamSubject() 224 225 return gov 226 } 227 228 func (g *jsGMgr) streamSubject() string { 229 if g.subj != "" { 230 return g.subj 231 } 232 233 return fmt.Sprintf("$GOVERNOR.campaign.%s", g.name) 234 } 235 236 func (g *jsGMgr) streamName() string { 237 if g.stream != "" { 238 return g.stream 239 } 240 241 return StreamName(g.name) 242 } 243 244 func StreamName(governor string) string { 245 return fmt.Sprintf("GOVERNOR_%s", governor) 246 } 247 248 func List(nc *nats.Conn, collective string) ([]string, error) { 249 mgr, err := jsm.New(nc) 250 if err != nil { 251 return nil, err 252 } 253 254 known, err := mgr.StreamNames(&jsm.StreamNamesFilter{ 255 Subject: iu.GovernorSubject("*", collective), 256 }) 257 if err != nil { 258 return nil, err 259 } 260 261 for i := 0; i < len(known); i++ { 262 known[i] = strings.TrimPrefix(known[i], "GOVERNOR_") 263 } 264 265 sort.Strings(known) 266 267 return known, nil 268 } 269 func (g *jsGMgr) Start(ctx context.Context, name string) (Finisher, uint64, error) { 270 g.mu.Lock() 271 defer g.mu.Unlock() 272 273 if g.running { 274 return nil, 0, fmt.Errorf("already running") 275 } 276 277 g.running = true 278 seq := uint64(0) 279 tries := 0 280 281 try := func() error { 282 ctx, cancel := context.WithTimeout(ctx, time.Second) 283 defer cancel() 284 285 g.Debugf("Publishing to %s", g.subj) 286 m, err := g.nc.RequestWithContext(ctx, g.subj, []byte(name)) 287 if err != nil { 288 g.Errorf("Publishing to governor %s via %s failed: %s", g.name, g.subj, err) 289 return err 290 } 291 292 res, err := jsm.ParsePubAck(m) 293 if err != nil { 294 // jetstream sent us a puback error, this is retryable in the case of governors 295 if jsm.IsNatsError(err, 10077) { 296 g.Debugf("Could not obtain a slot: %v", err) 297 return errRetry 298 } 299 300 g.Errorf("Invalid pub ack: %s", err) 301 return err 302 } 303 304 seq = res.Sequence 305 306 g.Infof("Got a slot on %s with sequence %d", g.name, seq) 307 308 return nil 309 } 310 311 closer := func() error { 312 if seq == 0 { 313 return nil 314 } 315 316 g.mu.Lock() 317 defer g.mu.Unlock() 318 if !g.running { 319 return nil 320 } 321 322 g.running = false 323 324 if g.noLeave { 325 g.Infof("Not evicting self from %s based on configuration directive", g.name) 326 return nil 327 } 328 329 g.Infof("Removing self from %s sequence %d", g.name, seq) 330 err := g.mgr.DeleteStreamMessage(g.stream, seq, true) 331 if err != nil { 332 g.Errorf("Could not remove self from %s: %s", g.name, err) 333 return fmt.Errorf("could not remove seq %d: %s", seq, err) 334 } 335 336 return nil 337 } 338 339 g.Debugf("Starting to campaign every %v for a slot on %s using %s", g.cint, g.name, g.subj) 340 341 // we try to enter the governor and if it fails in a way thats safe to retry 342 // we will do so else we exit. 343 // 344 // We need to handle thins like context timeout, bucket not found etc specifically 345 // as hard errors since, especially context timeout, it does not mean the message did 346 // not enter the governor, it just means something went wrong, perhaps in getting the 347 // ok reply. In the case where the message did reach the governor but the reply could 348 // not be processed we will retry again and again potentially filling the governor. 349 err := try() 350 if err == nil { 351 return closer, seq, nil 352 } else if err != errRetry { 353 return nil, 0, err 354 } 355 356 ticker := time.NewTicker(g.cint) 357 358 for { 359 select { 360 case <-ticker.C: 361 tries++ 362 363 err = try() 364 if err == nil { 365 return closer, seq, nil 366 } else if err != errRetry { 367 return nil, 0, err 368 } 369 370 if g.bo != nil { 371 delay := g.bo.Duration(tries) 372 g.Debugf("Retrying after %v", delay) 373 ticker.Reset(delay) 374 } 375 376 case <-ctx.Done(): 377 g.Infof("Stopping campaigns against %s due to context timeout after %d tries", g.name, tries) 378 ticker.Stop() 379 return nil, 0, ctx.Err() 380 } 381 } 382 } 383 384 func (g *jsGMgr) Reset() error { 385 return g.str.Purge() 386 } 387 func (g *jsGMgr) Stream() *jsm.Stream { return g.str } 388 func (g *jsGMgr) Limit() int64 { return g.str.MaxMsgs() } 389 func (g *jsGMgr) MaxAge() time.Duration { return g.str.MaxAge() } 390 func (g *jsGMgr) Subject() string { return g.str.Subjects()[0] } 391 func (g *jsGMgr) Replicas() int { return g.str.Replicas() } 392 func (g *jsGMgr) Connection() *nats.Conn { return g.nc } 393 func (g *jsGMgr) Name() string { return g.name } 394 func (g *jsGMgr) Evict(entry uint64) (string, error) { 395 msg, err := g.str.ReadMessage(entry) 396 if err != nil { 397 return "", err 398 } 399 400 return string(msg.Data), g.str.DeleteMessage(entry) 401 } 402 403 func (g *jsGMgr) Active() (uint64, error) { 404 nfo, err := g.str.Information() 405 if err != nil { 406 return 0, err 407 } 408 409 return nfo.State.Msgs, nil 410 } 411 412 func (g *jsGMgr) LastActive() (time.Time, error) { 413 nfo, err := g.str.Information() 414 if err != nil { 415 return time.Time{}, err 416 } 417 418 return nfo.State.LastTime, nil 419 } 420 421 func (g *jsGMgr) SetSubject(subj string) error { 422 g.mu.Lock() 423 g.subj = subj 424 g.mu.Unlock() 425 426 return g.updateConfig() 427 } 428 429 func (g *jsGMgr) SetLimit(limit uint64) error { 430 g.mu.Lock() 431 g.limit = limit 432 g.mu.Unlock() 433 434 return g.updateConfig() 435 } 436 437 func (g *jsGMgr) SetMaxAge(age time.Duration) error { 438 g.mu.Lock() 439 g.maxAge = age 440 g.mu.Unlock() 441 442 return g.updateConfig() 443 } 444 445 func (g *jsGMgr) updateConfig() error { 446 g.mu.Lock() 447 defer g.mu.Unlock() 448 449 if g.str.MaxAge() != g.maxAge || g.str.MaxMsgs() != int64(g.limit) || !cmp.Equal([]string{g.streamSubject()}, g.str.Subjects()) || g.str.Replicas() != g.replicas { 450 err := g.str.UpdateConfiguration(g.str.Configuration(), g.streamOpts()...) 451 if err != nil { 452 return fmt.Errorf("stream update failed: %s", err) 453 } 454 } 455 456 return nil 457 } 458 459 func (g *jsGMgr) streamOpts() []jsm.StreamOption { 460 opts := []jsm.StreamOption{ 461 jsm.StreamDescription(fmt.Sprintf("Concurrency Governor %s", g.name)), 462 jsm.MaxAge(g.maxAge), 463 jsm.MaxMessages(int64(g.limit)), 464 jsm.Subjects(g.subj), 465 jsm.Replicas(g.replicas), 466 jsm.LimitsRetention(), 467 jsm.FileStorage(), 468 jsm.DiscardNew(), 469 jsm.DuplicateWindow(0), 470 } 471 472 if g.replicas > 0 { 473 opts = append(opts, jsm.Replicas(g.replicas)) 474 } 475 476 return opts 477 } 478 479 func (g *jsGMgr) loadOrCreate(update bool) error { 480 opts := g.streamOpts() 481 482 if g.noCreate { 483 has, err := g.mgr.IsKnownStream(g.stream) 484 if err != nil { 485 return err 486 } 487 488 if !has { 489 return fmt.Errorf("unknown governor") 490 } 491 } 492 493 str, err := g.mgr.LoadOrNewStream(g.stream, opts...) 494 if err != nil { 495 return err 496 } 497 498 g.str = str 499 500 if update { 501 g.updateConfig() 502 } 503 504 return nil 505 } 506 507 func (g *jsGMgr) Debugf(format string, a ...any) { 508 if g.logger != nil { 509 g.logger.Debugf(format, a...) 510 } 511 } 512 513 func (g *jsGMgr) Infof(format string, a ...any) { 514 if g.logger != nil { 515 g.logger.Infof(format, a...) 516 } 517 } 518 519 func (g *jsGMgr) Warnf(format string, a ...any) { 520 if g.logger != nil { 521 g.logger.Warnf(format, a...) 522 } 523 } 524 525 func (g *jsGMgr) Errorf(format string, a ...any) { 526 if g.logger != nil { 527 g.logger.Errorf(format, a...) 528 } 529 }