github.com/grailbio/base@v0.0.11/eventlog/cloudwatch/cloudwatch.go

// Copyright 2020 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.

package cloudwatch

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/awserr"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/cloudwatchlogs"
	"github.com/aws/aws-sdk-go/service/cloudwatchlogs/cloudwatchlogsiface"
	"github.com/grailbio/base/config"
	"github.com/grailbio/base/errors"
	"github.com/grailbio/base/eventlog/internal/marshal"
	"github.com/grailbio/base/log"
	"github.com/grailbio/base/must"
)

// maxBatchSize is the maximum batch size of the events that we send to CloudWatch Logs, "calculated
// as the sum of all event messages in UTF-8, plus 26 bytes for each log event". See:
// https://docs.aws.amazon.com/sdk-for-go/api/service/cloudwatchlogs/#CloudWatchLogs.PutLogEvents
const maxBatchSize = 1048576

// maxSingleMessageSize is the maximum size, in bytes, of the message string
// of a single CloudWatch Logs event.
const maxSingleMessageSize = maxBatchSize - 26
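
// For example, under this limit a single PutLogEvents batch of 1,000 events
// can carry at most 1048576 - 1000*26 = 1022576 bytes of message text across
// all of its messages.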

// syncInterval is the maximum duration that we will wait before sending any
// buffered messages to CloudWatch.
const syncInterval = 1 * time.Second

// eventBufferSize is the size of the channel buffer used to process events.
// When this buffer is full, new events are dropped.
const eventBufferSize = 32768

func init() {
	config.Register("eventer/cloudwatch", func(constr *config.Constructor[*Eventer]) {
		var sess *session.Session
		constr.InstanceVar(&sess, "aws", "aws", "AWS configuration for all CloudWatch calls")
		var group string
		constr.StringVar(&group, "group", "eventlog", "the CloudWatch log group of the stream to which events will be sent")
		var stream string
		constr.StringVar(&stream, "stream", "", "the CloudWatch log stream to which events will be sent")
		constr.Doc = "eventer/cloudwatch configures an eventer that sends events to a CloudWatch log stream"
		constr.New = func() (*Eventer, error) {
			cw := cloudwatchlogs.New(sess)
			if stream == "" {
				// All the information of RFC 3339 but without colons, as stream
				// names cannot have colons.
				const layout = "20060102T150405.000-0700"
				// The default stream name incorporates the current executable
				// name and time for some measure of uniqueness and usefulness.
				stream = strings.Join([]string{readExec(), time.Now().Format(layout)}, "~")
			}
			return NewEventer(cw, group, stream), nil
		}
	})
}
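
// For illustration, the default stream name constructed above for an
// executable named "myprogram" started at 2020-01-02 15:04:05.000 (UTC-7)
// would be "myprogram~20200102T150405.000-0700".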

type (
	// Eventer logs events to CloudWatch Logs.
	Eventer struct {
		client cloudwatchlogsiface.CloudWatchLogsAPI
		group  string
		stream string

		cancel func()

		// eventc is used to send events that are batched and sent to CloudWatch
		// Logs.
		eventc chan event

		// syncc is used to force syncing of events to CloudWatch Logs.
		syncc chan struct{}
		// syncDonec is used to block for syncing.
		syncDonec chan struct{}

		// donec is used to signal that the event processing loop is done.
		donec chan struct{}

		initOnce sync.Once
		initErr  error

		sequenceToken *string

		// loggedFullBuffer prevents consecutive printing of "buffer full" log
		// messages inside Event. We get a full buffer when we are overloaded with
		// many messages. Logging each dropped message is very noisy, so we suppress
		// consecutive logging.
		loggedFullBuffer int32

		now func() time.Time
	}

	opts struct {
		now func() time.Time
	}
	EventerOption func(*opts)
)

type event struct {
	timestamp  time.Time
	typ        string
	fieldPairs []interface{}
}

// OptNow configures NewEventer to obtain timestamps from now. now must be non-nil.
//
// Example use case: reducing precision to make it more difficult to correlate which events
// likely came from the same user (who may have done a few things in one minute, etc.).
func OptNow(now func() time.Time) EventerOption {
	must.True(now != nil)
	return func(o *opts) { o.now = now }
}
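
// A minimal sketch of using OptNow to reduce timestamp precision to one
// minute (client, group, and stream are hypothetical values):
//
//	ev := NewEventer(client, group, stream, OptNow(func() time.Time {
//		return time.Now().Truncate(time.Minute)
//	}))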

// NewEventer returns an *Eventer that logs events to the given CloudWatch
// Logs group and stream. It does not create the group or stream until the
// first event is logged.
func NewEventer(
	client cloudwatchlogsiface.CloudWatchLogsAPI, group, stream string, options ...EventerOption,
) *Eventer {
	opts := opts{now: time.Now}
	for _, option := range options {
		option(&opts)
	}
	eventer := &Eventer{
		client:    client,
		group:     group,
		stream:    stream,
		eventc:    make(chan event, eventBufferSize),
		syncc:     make(chan struct{}),
		syncDonec: make(chan struct{}),
		donec:     make(chan struct{}),
		now:       opts.now,
	}
	var ctx context.Context
	ctx, eventer.cancel = context.WithCancel(context.Background())
	go eventer.loop(ctx)
	return eventer
}
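
// A minimal construction sketch (hypothetical group and stream names),
// assuming ambient AWS credentials and region configuration:
//
//	sess := session.Must(session.NewSession())
//	ev := NewEventer(cloudwatchlogs.New(sess), "eventlog", "my-stream")
//	defer func() { _ = ev.Close() }()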

func (c *Eventer) String() string {
	return fmt.Sprintf("CloudWatch Logs: %s/%s", c.group, c.stream)
}

// Event implements Eventer.
func (c *Eventer) Event(typ string, fieldPairs ...interface{}) {
	select {
	case c.eventc <- event{timestamp: c.now(), typ: typ, fieldPairs: fieldPairs}:
		atomic.StoreInt32(&c.loggedFullBuffer, 0)
	default:
		if atomic.LoadInt32(&c.loggedFullBuffer) == 0 {
			log.Error.Printf("Eventer: dropping log events: buffer full")
			atomic.StoreInt32(&c.loggedFullBuffer, 1)
		}
	}
}
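
// Field pairs are expected to be alternating key/value arguments; they are
// marshaled together with the event type into a single message string. A
// hypothetical call:
//
//	ev.Event("runComplete", "runID", "abc123", "durationSecs", 42)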

// Init initializes the group and stream used by c. It will only attempt
// initialization once, subsequently returning the result of that attempt.
func (c *Eventer) Init(ctx context.Context) error {
	// TODO: Initialize with loadingcache.Value so concurrent Init()s each respect their own
	// context's cancellation.
	c.initOnce.Do(func() {
		defer func() {
			if c.initErr != nil {
				log.Error.Printf("Eventer: failed to initialize event log: %v", c.initErr)
			}
		}()
		var err error
		_, err = c.client.CreateLogGroupWithContext(ctx, &cloudwatchlogs.CreateLogGroupInput{
			LogGroupName: aws.String(c.group),
		})
		if err != nil {
			aerr, ok := err.(awserr.Error)
			if !ok || aerr.Code() != cloudwatchlogs.ErrCodeResourceAlreadyExistsException {
				c.initErr = errors.E(fmt.Sprintf("could not create CloudWatch log group %s", c.group), err)
				return
			}
		}
		_, err = c.client.CreateLogStreamWithContext(ctx, &cloudwatchlogs.CreateLogStreamInput{
			LogGroupName:  aws.String(c.group),
			LogStreamName: aws.String(c.stream),
		})
		if err != nil {
			aerr, ok := err.(awserr.Error)
			if !ok || aerr.Code() != cloudwatchlogs.ErrCodeResourceAlreadyExistsException {
				c.initErr = errors.E(fmt.Sprintf("could not create CloudWatch log stream %s", c.stream), err)
				return
			}
		}
	})
	return c.initErr
}
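
// Init runs lazily before the first PutLogEvents call, but it can also be
// invoked eagerly to surface permission or configuration problems up front.
// A minimal sketch (ev and ctx are assumed to exist):
//
//	if err := ev.Init(ctx); err != nil {
//		// The group/stream could not be created; buffered events will not
//		// be delivered.
//	}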

// sync syncs all buffered events to CloudWatch. This is mostly useful for
// testing.
func (c *Eventer) sync() {
	c.syncc <- struct{}{}
	<-c.syncDonec
}

func (c *Eventer) Close() error {
	c.cancel()
	<-c.donec
	return nil
}

func (c *Eventer) loop(ctx context.Context) {
	var (
		syncTimer      = time.NewTimer(syncInterval)
		inputLogEvents []*cloudwatchlogs.InputLogEvent
		batchSize      int
	)
	sync := func(drainTimer bool) {
		defer func() {
			inputLogEvents = nil
			batchSize = 0
			if !syncTimer.Stop() && drainTimer {
				<-syncTimer.C
			}
			syncTimer.Reset(syncInterval)
		}()
		if len(inputLogEvents) == 0 {
			return
		}
		if err := c.Init(ctx); err != nil {
			return
		}
		response, err := c.client.PutLogEventsWithContext(ctx, &cloudwatchlogs.PutLogEventsInput{
			LogEvents:     inputLogEvents,
			LogGroupName:  aws.String(c.group),
			LogStreamName: aws.String(c.stream),
			SequenceToken: c.sequenceToken,
		})
		if err != nil {
			log.Error.Printf("Eventer: PutLogEvents error: %v", err)
			if aerr, ok := err.(*cloudwatchlogs.InvalidSequenceTokenException); ok {
				c.sequenceToken = aerr.ExpectedSequenceToken
			}
			return
		}
		c.sequenceToken = response.NextSequenceToken
	}
	process := func(e event) {
		s, err := marshal.Marshal(e.typ, e.fieldPairs)
		if err != nil {
			log.Error.Printf("Eventer: dropping log event: %v", err)
			return
		}
		if len(s) > maxSingleMessageSize {
			log.Error.Printf("Eventer: dropping log event: message too large")
			return
		}
		// Each event costs its message length plus 26 bytes of overhead.
		newBatchSize := batchSize + len(s) + 26
		if newBatchSize > maxBatchSize {
			sync(true)
			// sync reset batchSize to zero, so only this event counts
			// toward the new batch.
			newBatchSize = len(s) + 26
		}
		batchSize = newBatchSize
		inputLogEvents = append(inputLogEvents, &cloudwatchlogs.InputLogEvent{
			Message:   aws.String(s),
			Timestamp: aws.Int64(e.timestamp.UnixNano() / 1000000),
		})
	}
	drainEvents := func() {
	drainLoop:
		for {
			select {
			case e := <-c.eventc:
				process(e)
			default:
				break drainLoop
			}
		}
	}
	for {
		select {
		case <-c.syncc:
			drainEvents()
			sync(false)
			c.syncDonec <- struct{}{}
		case <-syncTimer.C:
			sync(false)
		case e := <-c.eventc:
			process(e)
		case <-ctx.Done():
			close(c.eventc)
			for e := range c.eventc {
				process(e)
			}
			sync(true)
			close(c.donec)
			return
		}
	}
}

// readExec returns a sanitized version of the executable name, if it can be
// determined. If not, returns "unknown".
func readExec() string {
	const unknown = "unknown"
	execPath, err := os.Executable()
	if err != nil {
		return unknown
	}
	rawExec := filepath.Base(execPath)
	var sanitized strings.Builder
	for _, r := range rawExec {
		if r == '-' || ('a' <= r && r <= 'z') || ('0' <= r && r <= '9') {
			sanitized.WriteRune(r)
		}
	}
	if sanitized.Len() == 0 {
		return unknown
	}
	return sanitized.String()
}
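
// For example, a hypothetical executable at "/usr/local/bin/bio-pipeline2.5"
// sanitizes to "bio-pipeline25": uppercase letters, dots, underscores, and
// any other characters outside [a-z0-9-] are dropped.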