github.com/grailbio/base@v0.0.11/eventlog/cloudwatch/cloudwatch.go

// Copyright 2020 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.

package cloudwatch

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/awserr"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/cloudwatchlogs"
	"github.com/aws/aws-sdk-go/service/cloudwatchlogs/cloudwatchlogsiface"
	"github.com/grailbio/base/config"
	"github.com/grailbio/base/errors"
	"github.com/grailbio/base/eventlog/internal/marshal"
	"github.com/grailbio/base/log"
	"github.com/grailbio/base/must"
)

// maxBatchSize is the maximum batch size of the events that we send to
// CloudWatch Logs, "calculated as the sum of all event messages in UTF-8,
// plus 26 bytes for each log event". See:
// https://docs.aws.amazon.com/sdk-for-go/api/service/cloudwatchlogs/#CloudWatchLogs.PutLogEvents
const maxBatchSize = 1048576

// maxSingleMessageSize is the maximum size, in bytes, of the message string of
// a single CloudWatch Logs event.
const maxSingleMessageSize = maxBatchSize - 26

// syncInterval is the maximum duration that we will wait before sending any
// buffered messages to CloudWatch.
const syncInterval = 1 * time.Second

// eventBufferSize is the size of the channel buffer used to process events.
// When this buffer is full, new events are dropped.
const eventBufferSize = 32768

func init() {
	config.Register("eventer/cloudwatch", func(constr *config.Constructor[*Eventer]) {
		var sess *session.Session
		constr.InstanceVar(&sess, "aws", "aws", "AWS configuration for all CloudWatch calls")
		var group string
		constr.StringVar(&group, "group", "eventlog", "the CloudWatch log group of the stream to which events will be sent")
		var stream string
		constr.StringVar(&stream, "stream", "", "the CloudWatch log stream to which events will be sent")
		constr.Doc = "eventer/cloudwatch configures an eventer that sends events to a CloudWatch log stream"
		constr.New = func() (*Eventer, error) {
			cw := cloudwatchlogs.New(sess)
			if stream == "" {
				// All the information of RFC 3339 but without colons, as
				// stream names cannot have colons.
				const layout = "20060102T150405.000-0700"
				// The default stream name incorporates the current executable
				// name and time for some measure of uniqueness and usefulness.
				stream = strings.Join([]string{readExec(), time.Now().Format(layout)}, "~")
			}
			return NewEventer(cw, group, stream), nil
		}
	})
}
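
// Illustrative sketch of constructing an Eventer directly, bypassing the
// config profile registered above; the session setup, group/stream names, and
// event fields below are placeholders, and the OptNow option is optional:
//
//	sess := session.Must(session.NewSession())
//	ev := NewEventer(
//		cloudwatchlogs.New(sess), "eventlog", "my-stream",
//		OptNow(func() time.Time { return time.Now().Truncate(time.Minute) }),
//	)
//	defer ev.Close()
//	ev.Event("runStarted", "runID", "r-123", "tries", 1)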

type (
	// Eventer logs events to CloudWatch Logs.
	Eventer struct {
		client cloudwatchlogsiface.CloudWatchLogsAPI
		group  string
		stream string

		cancel func()

		// eventc is used to send events that are batched and sent to
		// CloudWatch Logs.
		eventc chan event

		// syncc is used to force syncing of events to CloudWatch Logs.
		syncc chan struct{}
		// syncDonec is used to block for syncing.
		syncDonec chan struct{}

		// donec is used to signal that the event processing loop is done.
		donec chan struct{}

		initOnce sync.Once
		initErr  error

		sequenceToken *string

		// loggedFullBuffer prevents consecutive printing of "buffer full" log
		// messages inside Event. We get a full buffer when we are overloaded
		// with many messages. Logging each dropped message is very noisy, so
		// we suppress consecutive logging.
		loggedFullBuffer int32

		now func() time.Time
	}

	opts struct {
		now func() time.Time
	}
	EventerOption func(*opts)
)

type event struct {
	timestamp  time.Time
	typ        string
	fieldPairs []interface{}
}

// OptNow configures NewEventer to obtain timestamps from now. now must be
// non-nil.
//
// Example use case: reducing precision to make it more difficult to correlate
// which events likely came from the same user (who may have done a few things
// in one minute, etc.).
func OptNow(now func() time.Time) EventerOption {
	must.True(now != nil)
	return func(o *opts) { o.now = now }
}

// NewEventer returns a new Eventer. It does not create the group or stream
// until the first event is logged.
func NewEventer(
	client cloudwatchlogsiface.CloudWatchLogsAPI, group, stream string, options ...EventerOption,
) *Eventer {
	opts := opts{now: time.Now}
	for _, option := range options {
		option(&opts)
	}
	eventer := &Eventer{
		client:    client,
		group:     group,
		stream:    stream,
		eventc:    make(chan event, eventBufferSize),
		syncc:     make(chan struct{}),
		syncDonec: make(chan struct{}),
		donec:     make(chan struct{}),
		now:       opts.now,
	}
	var ctx context.Context
	ctx, eventer.cancel = context.WithCancel(context.Background())
	go eventer.loop(ctx)
	return eventer
}

func (c *Eventer) String() string {
	return fmt.Sprintf("CloudWatch Logs: %s/%s", c.group, c.stream)
}

// Event implements Eventer.
func (c *Eventer) Event(typ string, fieldPairs ...interface{}) {
	select {
	case c.eventc <- event{timestamp: c.now(), typ: typ, fieldPairs: fieldPairs}:
		atomic.StoreInt32(&c.loggedFullBuffer, 0)
	default:
		if atomic.LoadInt32(&c.loggedFullBuffer) == 0 {
			log.Error.Printf("Eventer: dropping log events: buffer full")
			atomic.StoreInt32(&c.loggedFullBuffer, 1)
		}
	}
}

// Init initializes the group and stream used by c. It will only attempt
// initialization once, subsequently returning the result of that attempt.
func (c *Eventer) Init(ctx context.Context) error {
	// TODO: Initialize with loadingcache.Value so concurrent Init()s each
	// respect their own context's cancellation.
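	// The creation calls below treat ResourceAlreadyExistsException as
	// success, so Init may be run against a pre-existing group and stream; any
	// other error is cached in initErr and returned by every subsequent call.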
	c.initOnce.Do(func() {
		defer func() {
			if c.initErr != nil {
				log.Error.Printf("Eventer: failed to initialize event log: %v", c.initErr)
			}
		}()
		var err error
		_, err = c.client.CreateLogGroupWithContext(ctx, &cloudwatchlogs.CreateLogGroupInput{
			LogGroupName: aws.String(c.group),
		})
		if err != nil {
			aerr, ok := err.(awserr.Error)
			if !ok || aerr.Code() != cloudwatchlogs.ErrCodeResourceAlreadyExistsException {
				c.initErr = errors.E(fmt.Sprintf("could not create CloudWatch log group %s", c.group), err)
				return
			}
		}
		_, err = c.client.CreateLogStreamWithContext(ctx, &cloudwatchlogs.CreateLogStreamInput{
			LogGroupName:  aws.String(c.group),
			LogStreamName: aws.String(c.stream),
		})
		if err != nil {
			aerr, ok := err.(awserr.Error)
			if !ok || aerr.Code() != cloudwatchlogs.ErrCodeResourceAlreadyExistsException {
				c.initErr = errors.E(fmt.Sprintf("could not create CloudWatch log stream %s", c.stream), err)
				return
			}
		}
	})
	return c.initErr
}

// sync syncs all buffered events to CloudWatch. This is mostly useful for
// testing.
func (c *Eventer) sync() {
	c.syncc <- struct{}{}
	<-c.syncDonec
}

func (c *Eventer) Close() error {
	c.cancel()
	<-c.donec
	return nil
}

func (c *Eventer) loop(ctx context.Context) {
	var (
		syncTimer      = time.NewTimer(syncInterval)
		inputLogEvents []*cloudwatchlogs.InputLogEvent
		batchSize      int
	)
	sync := func(drainTimer bool) {
		defer func() {
			inputLogEvents = nil
			batchSize = 0
			if !syncTimer.Stop() && drainTimer {
				<-syncTimer.C
			}
			syncTimer.Reset(syncInterval)
		}()
		if len(inputLogEvents) == 0 {
			return
		}
		if err := c.Init(ctx); err != nil {
			return
		}
		response, err := c.client.PutLogEventsWithContext(ctx, &cloudwatchlogs.PutLogEventsInput{
			LogEvents:     inputLogEvents,
			LogGroupName:  aws.String(c.group),
			LogStreamName: aws.String(c.stream),
			SequenceToken: c.sequenceToken,
		})
		if err != nil {
			log.Error.Printf("Eventer: PutLogEvents error: %v", err)
			if aerr, ok := err.(*cloudwatchlogs.InvalidSequenceTokenException); ok {
				c.sequenceToken = aerr.ExpectedSequenceToken
			}
			return
		}
		c.sequenceToken = response.NextSequenceToken
	}
	process := func(e event) {
		s, err := marshal.Marshal(e.typ, e.fieldPairs)
		if err != nil {
			log.Error.Printf("Eventer: dropping log event: %v", err)
			return
		}
		if len(s) > maxSingleMessageSize {
			log.Error.Printf("Eventer: dropping log event: message too large")
			return
		}
		newBatchSize := batchSize + len(s) + 26
		if newBatchSize > maxBatchSize {
			// Flush the current batch first so that adding this event does not
			// exceed the PutLogEvents batch-size limit.
			sync(true)
		}
		inputLogEvents = append(inputLogEvents, &cloudwatchlogs.InputLogEvent{
			Message:   aws.String(s),
			Timestamp: aws.Int64(e.timestamp.UnixNano() / 1000000),
		})
		batchSize += len(s) + 26
	}
	drainEvents := func() {
	drainLoop:
		for {
			select {
			case e := <-c.eventc:
				process(e)
			default:
				break drainLoop
			}
		}
	}
	for {
		select {
		case <-c.syncc:
			drainEvents()
			sync(false)
			c.syncDonec <- struct{}{}
		case <-syncTimer.C:
			sync(false)
		case e := <-c.eventc:
			process(e)
		case <-ctx.Done():
			close(c.eventc)
			for e := range c.eventc {
				process(e)
			}
			sync(true)
			close(c.donec)
			return
		}
	}
}
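
// For reference: PutLogEvents expects each Timestamp in milliseconds since the
// Unix epoch, hence the UnixNano() / 1000000 conversion in process above, and
// each event counts as len(message)+26 bytes toward maxBatchSize, so, for
// example, a 1,000-byte message consumes 1,026 of the 1,048,576-byte budget.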

// readExec returns a sanitized version of the executable name, if it can be
// determined. If not, returns "unknown".
func readExec() string {
	const unknown = "unknown"
	execPath, err := os.Executable()
	if err != nil {
		return unknown
	}
	rawExec := filepath.Base(execPath)
	var sanitized strings.Builder
	for _, r := range rawExec {
		if r == '-' || ('a' <= r && r <= 'z') || ('0' <= r && r <= '9') {
			sanitized.WriteRune(r)
		}
	}
	if sanitized.Len() == 0 {
		return unknown
	}
	return sanitized.String()
}
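
// Note that readExec keeps only lowercase ASCII letters, digits, and '-';
// other runes, including uppercase letters, are dropped rather than lowercased.
// For example, a hypothetical executable named "My_Tool-2" yields "yool-2",
// which init above joins with a timestamp to form the default stream name.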