// Copyright 2016 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collector

import (
	"bytes"
	"context"
	"time"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	log "go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/sync/parallel"
	"go.chromium.org/luci/common/tsmon/distribution"
	"go.chromium.org/luci/common/tsmon/field"
	"go.chromium.org/luci/common/tsmon/metric"
	tsmon_types "go.chromium.org/luci/common/tsmon/types"
	"go.chromium.org/luci/config"
	"go.chromium.org/luci/logdog/api/logpb"
	"go.chromium.org/luci/logdog/client/pubsubprotocol"
	"go.chromium.org/luci/logdog/common/storage"
	"go.chromium.org/luci/logdog/common/types"
	"go.chromium.org/luci/logdog/server/collector/coordinator"

	"google.golang.org/protobuf/proto"
)

const (
	// DefaultMaxMessageWorkers is the default number of concurrent worker
	// goroutines to employ for a single message.
	DefaultMaxMessageWorkers = 4
)

var (
	// tsBundles tracks the total number of logpb.ButlerLogBundle entries that
	// have been submitted for collection.
	tsBundles = metric.NewCounter("logdog/collector/bundles",
		"The number of individual log entry bundles that have been ingested.",
		nil)
	// tsLogs tracks the number of logpb.LogEntry entries that have been
	// written to intermediate storage.
	tsLogs = metric.NewCounter("logdog/collector/logs",
		"The number of individual log entries that have been ingested.",
		nil)

	// tsBundleSize tracks the size, in bytes, of a given log bundle.
	tsBundleSize = metric.NewCumulativeDistribution("logdog/collector/bundle/size",
		"The size (in bytes) of the bundle.",
		&tsmon_types.MetricMetadata{Units: tsmon_types.Bytes},
		distribution.DefaultBucketer)
	// tsBundleEntriesPerBundle tracks the number of ButlerLogBundle.Entry
	// entries in each bundle that have been collected.
	tsBundleEntriesPerBundle = metric.NewCumulativeDistribution("logdog/collector/bundle/entries_per_bundle",
		"The number of log bundle entries per bundle.",
		nil,
		distribution.DefaultBucketer)

	// tsBundleEntries tracks the total number of ButlerLogBundle.Entry entries
	// that have been collected.
	//
	// The "stream" field is the type of log stream for each tracked bundle entry.
	tsBundleEntries = metric.NewCounter("logdog/collector/bundle/entries",
		"The number of Butler bundle entries pulled.",
		nil,
		field.String("stream"))
	// tsBundleEntryProcessingTime tracks the time, in milliseconds, taken to
	// process a single bundle entry, keyed on the log stream type.
	tsBundleEntryProcessingTime = metric.NewCumulativeDistribution("logdog/collector/bundle/entry/processing_time_ms",
		"The amount of time in milliseconds that a bundle entry takes to process.",
		&tsmon_types.MetricMetadata{Units: tsmon_types.Milliseconds},
		distribution.DefaultBucketer,
		field.String("stream"))
)

// Collector is a stateful object responsible for ingesting LogDog logs,
// registering them with a Coordinator, and stowing them in short-term storage
// for streaming and processing.
//
// A Collector's Close should be called when finished to release any internal
// resources.
type Collector struct {
	// Coordinator is used to interface with the Coordinator client.
	//
	// On production systems, this should be wrapped with a caching client (see
	// the stateCache sub-package) to avoid overwhelming the server.
	Coordinator coordinator.Coordinator

	// Storage is the intermediate storage instance to use.
	Storage storage.Storage

	// StreamStateCacheExpire is the maximum amount of time that a cached stream
	// state entry is valid. If zero, DefaultStreamStateCacheExpire will be used.
	StreamStateCacheExpire time.Duration

	// MaxMessageWorkers is the maximum number of concurrent workers to employ
	// for any given message. If <= 0, DefaultMaxMessageWorkers will be applied.
	MaxMessageWorkers int
}

// Process ingests an encoded ButlerLogBundle message, registering it with
// the LogDog Coordinator and stowing it in a temporary Storage for streaming
// retrieval.
//
// If a transient error occurs during ingest, Process will return an error.
// If no error occurred, or if there was an error with the input data, no error
// will be returned.
func (c *Collector) Process(ctx context.Context, msg []byte) error {
	tsBundles.Add(ctx, 1)
	tsBundleSize.Add(ctx, float64(len(msg)))

	pr := pubsubprotocol.Reader{}
	if err := pr.Read(bytes.NewReader(msg)); err != nil {
		log.WithError(err).Errorf(ctx, "Failed to unpack message.")
		return nil
	}
	if pr.Metadata.ProtoVersion != logpb.Version {
		log.Fields{
			"messageProtoVersion": pr.Metadata.ProtoVersion,
			"currentProtoVersion": logpb.Version,
		}.Errorf(ctx, "Unknown protobuf version.")
		return nil
	}
	if pr.Bundle == nil {
		log.Errorf(ctx, "Protocol message did not contain a Butler bundle.")
		return nil
	}
	ctx = log.SetField(ctx, "prefix", pr.Bundle.Prefix)

	tsBundleEntriesPerBundle.Add(ctx, float64(len(pr.Bundle.Entries)))
	for i, entry := range pr.Bundle.Entries {
		tsBundleEntries.Add(ctx, 1, streamType(entry.Desc))

		// If we're logging INFO or higher, log the ranges that this bundle
		// represents.
		if log.IsLogging(ctx, log.Info) {
			fields := log.Fields{
				"index":   i,
				"project": pr.Bundle.Project,
				"path":    entry.GetDesc().Path(),
			}
			if entry.Terminal {
				fields["terminalIndex"] = entry.TerminalIndex
			}
			if logs := entry.GetLogs(); len(logs) > 0 {
				fields["logStart"] = logs[0].StreamIndex
				fields["logEnd"] = logs[len(logs)-1].StreamIndex
			}

			fields.Infof(ctx, "Processing log bundle entry.")
		}
	}

	lw := bundleHandler{
		msg: msg,
		md:  pr.Metadata,
		b:   pr.Bundle,
	}

	lw.project = lw.b.Project
	if err := config.ValidateProjectName(lw.project); err != nil {
		log.Fields{
			log.ErrorKey: err,
			"project":    lw.b.Project,
		}.Errorf(ctx, "Failed to validate bundle project name.")
		return errors.New("invalid bundle project name")
	}

	if err := types.StreamName(lw.b.Prefix).Validate(); err != nil {
		log.Fields{
			log.ErrorKey: err,
			"prefix":     lw.b.Prefix,
		}.Errorf(ctx, "Failed to validate bundle prefix.")
		return errors.New("invalid bundle prefix")
	}

	// If there are no entries, there is nothing to do.
	if len(pr.Bundle.Entries) == 0 {
		return nil
	}

	// Handle each bundle entry in parallel. We will use a separate work pool
	// here so that top-level bundle dispatch can't deadlock the processing
	// tasks.
	workers := c.MaxMessageWorkers
	if workers <= 0 {
		workers = DefaultMaxMessageWorkers
	}
	return parallel.WorkPool(workers, func(taskC chan<- func() error) {
		for _, be := range pr.Bundle.Entries {
			be := be // capture the loop variable for the task closure

			taskC <- func() error {
				return c.processLogStream(ctx, &bundleEntryHandler{
					bundleHandler: &lw,
					be:            be,
				})
			}
		}
	})
}

// Close releases any internal resources and blocks pending the completion of
// any outstanding operations. After Close, no new Process calls may be made.
func (c *Collector) Close() {
}

// bundleHandler is a cumulative set of read-only state passed around by
// value for log processing.
type bundleHandler struct {
	// msg is the original message bytes.
	msg []byte
	// md is the metadata associated with the overall message.
	md *logpb.ButlerMetadata
	// b is the Butler bundle.
	b *logpb.ButlerLogBundle

	// project is the validated project name.
	project string
}

// bundleEntryHandler extends bundleHandler with per-entry state.
type bundleEntryHandler struct {
	*bundleHandler

	// be is the Bundle entry.
	be *logpb.ButlerLogBundle_Entry
	// path is the constructed path of the stream being processed.
	path types.StreamPath
}

// processLogStream processes an individual set of log messages belonging to
// the same log stream.
func (c *Collector) processLogStream(ctx context.Context, h *bundleEntryHandler) error {
	streamTypeField := streamType(h.be.Desc)
	startTime := clock.Now(ctx)
	defer func() {
		duration := clock.Now(ctx).Sub(startTime)

		// We track processing time in milliseconds.
		tsBundleEntryProcessingTime.Add(ctx, duration.Seconds()*1000, streamTypeField)
	}()

	// If this bundle entry has neither log entries nor a terminal index, it is
	// junk and must be discarded.
	//
	// This is more than a basic optimization, as it enforces that no zero-entry
	// log streams can be ingested. Either some entries exist, or there is a
	// promise of a terminal entry.
	if len(h.be.Logs) == 0 && !h.be.Terminal {
		log.Warningf(ctx, "Bundle entry is non-terminal and contains no logs; discarding.")
		return nil
	}

	secret := types.PrefixSecret(h.b.Secret)
	if err := secret.Validate(); err != nil {
		log.Fields{
			log.ErrorKey:   err,
			"secretLength": len(secret),
		}.Errorf(ctx, "Failed to validate prefix secret.")
		return errors.New("invalid prefix secret")
	}

	// If the descriptor has a Prefix, it must match the bundle's Prefix.
	if p := h.be.Desc.Prefix; p != "" {
		if p != h.b.Prefix {
			log.Fields{
				"bundlePrefix":      h.b.Prefix,
				"bundleEntryPrefix": p,
			}.Errorf(ctx, "Bundle prefix does not match entry prefix.")
			return errors.New("mismatched bundle and entry prefixes")
		}
	} else {
		// Fill in the bundle's Prefix.
		h.be.Desc.Prefix = h.b.Prefix
	}

	if err := h.be.Desc.Validate(true); err != nil {
		log.WithError(err).Errorf(ctx, "Invalid log stream descriptor.")
		return err
	}
	descBytes, err := proto.Marshal(h.be.Desc)
	if err != nil {
		log.WithError(err).Errorf(ctx, "Failed to marshal descriptor.")
		return err
	}

	h.path = types.StreamName(h.be.Desc.Prefix).Join(types.StreamName(h.be.Desc.Name))
	ctx = log.SetFields(ctx, log.Fields{
		"project": h.project,
		"path":    h.path,
	})

	// Confirm that the log entries are valid and contiguous. Serialize the log
	// entries for ingest as we validate them.
	var logData [][]byte
	var blockIndex uint64
	if logs := h.be.Logs; len(logs) > 0 {
		logData = make([][]byte, len(logs))
		blockIndex = logs[0].StreamIndex

		for i, le := range logs {
			// Validate this log entry.
			if err := le.Validate(h.be.Desc); err != nil {
				log.Fields{
					log.ErrorKey: err,
					"index":      le.StreamIndex,
				}.Warningf(ctx, "Discarding invalid log entry.")
				return errors.New("invalid log entry")
			}

			// Validate that this entry is contiguous.
			if le.StreamIndex != blockIndex+uint64(i) {
				log.Fields{
					"index":    i,
					"expected": (blockIndex + uint64(i)),
					"actual":   le.StreamIndex,
				}.Errorf(ctx, "Non-contiguous log entry block in stream.")
				return errors.New("non-contiguous log entry block")
			}

			var err error
			logData[i], err = proto.Marshal(le)
			if err != nil {
				log.Fields{
					log.ErrorKey: err,
					"index":      le.StreamIndex,
				}.Errorf(ctx, "Failed to marshal log entry.")
				return errors.New("failed to marshal log entries")
			}
		}
	}

	// Fetch our cached/remote state. This will replace our state object with
	// the fetched state, so any future calls will need to re-set the Secret
	// value.
	// TODO: Use timeout?
	registerReq := coordinator.LogStreamState{
		Project:       h.project,
		Path:          h.path,
		Secret:        secret,
		ProtoVersion:  h.md.ProtoVersion,
		TerminalIndex: -1,
	}
	if h.be.Terminal {
		registerReq.TerminalIndex = types.MessageIndex(h.be.TerminalIndex)
	}
	state, err := c.Coordinator.RegisterStream(ctx, &registerReq, descBytes)
	if err != nil {
		log.WithError(err).Errorf(ctx, "Failed to get/register current stream state.")
		return err
	}

	// Does the log stream's secret match the expected secret?
	//
	// Note that this check does NOT use the "subtle" package to do
	// time-constant byte comparison, and may leak information about the
	// secret. This is OK,
	// since users cannot interact with this service directly; however, if this
	// code is ever used elsewhere, this should be a consideration.
	if !bytes.Equal([]byte(secret), []byte(state.Secret)) {
		log.Errorf(log.SetFields(ctx, log.Fields{
			"secret":         secret,
			"expectedSecret": state.Secret,
		}), "Log entry has incorrect secret.")
		return nil
	}

	if state.Archived {
		log.Infof(ctx, "Skipping message bundle for archived stream.")
		return nil
	}
	if state.Purged {
		log.Infof(ctx, "Skipping message bundle for purged stream.")
		return nil
	}

	// Update our terminal index if we have one.
	//
	// Note that even if our cached value is marked terminal, we could have
	// failed to push the terminal index to the Coordinator, so we push every
	// terminal index we encounter regardless of cache state.
	if h.be.Terminal {
		tidx := types.MessageIndex(h.be.TerminalIndex)

		// Bundle includes terminal index.
		if state.TerminalIndex < 0 {
			state.TerminalIndex = tidx
		} else if state.TerminalIndex != tidx {
			log.Fields{
				"cachedIndex": state.TerminalIndex,
				"bundleIndex": tidx,
			}.Warningf(ctx, "Cached terminal index disagrees with state.")
		}
	}

	// Perform stream processing operations. We can do these operations in
	// parallel.
	return parallel.FanOutIn(func(taskC chan<- func() error) {
		// Store log data, if any was provided. It has already been validated.
		if len(logData) > 0 {
			taskC <- func() error {
				// Post the log to storage.
				err := c.Storage.Put(ctx, storage.PutRequest{
					Project: h.project,
					Path:    h.path,
					Index:   types.MessageIndex(blockIndex),
					Values:  logData,
				})

				// If the log entry already exists, consider the "put"
				// successful. Storage will return a transient error if one
				// occurred.
				if err != nil && err != storage.ErrExists {
					log.Fields{
						log.ErrorKey: err,
						"blockIndex": blockIndex,
					}.Errorf(ctx, "Failed to load log entry into Storage.")
					return err
				}

				tsLogs.Add(ctx, int64(len(logData)))
				return nil
			}
		}

		// If our bundle entry is terminal, we have an additional task of
		// reporting this to the Coordinator.
		if h.be.Terminal {
			taskC <- func() error {
				// Sentinel task: Update the terminal bundle state.
				treq := coordinator.TerminateRequest{
					Project:       state.Project,
					Path:          state.Path,
					ID:            state.ID,
					Secret:        state.Secret,
					TerminalIndex: types.MessageIndex(h.be.TerminalIndex),
				}

				log.Fields{
					"terminalIndex": state.TerminalIndex,
				}.Infof(ctx, "Received terminal log; updating Coordinator state.")
				if err := c.Coordinator.TerminateStream(ctx, &treq); err != nil {
					log.WithError(err).Errorf(ctx, "Failed to set stream terminal index.")
					return err
				}
				return nil
			}
		}
	})
}

// streamType returns the string name of the log stream type in desc, or
// "UNKNOWN" if desc is nil.
func streamType(desc *logpb.LogStreamDescriptor) string {
	if desc == nil {
		return "UNKNOWN"
	}
	return desc.StreamType.String()
}
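
// Usage sketch (an illustrative assumption, not part of the original source):
// following the contracts documented on Process and Close above, a Pub/Sub
// subscriber loop might drive a Collector roughly as follows. The names
// coordClient, store, and msg are hypothetical stand-ins for a
// coordinator.Coordinator implementation, a storage.Storage implementation,
// and a received Pub/Sub message.
//
//	c := collector.Collector{
//		Coordinator: coordClient, // ideally wrapped with the stateCache client
//		Storage:     store,
//	}
//	defer c.Close()
//
//	if err := c.Process(ctx, msg.Data); err != nil {
//		// A returned error indicates a transient failure; NACK so the
//		// message is redelivered. Malformed input returns nil and is
//		// dropped rather than retried.
//		msg.Nack()
//	} else {
//		msg.Ack()
//	}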