go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/logdog/server/collector/collector.go (about)

     1  // Copyright 2016 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package collector
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"time"
    21  
    22  	"go.chromium.org/luci/common/clock"
    23  	"go.chromium.org/luci/common/errors"
    24  	log "go.chromium.org/luci/common/logging"
    25  	"go.chromium.org/luci/common/sync/parallel"
    26  	"go.chromium.org/luci/common/tsmon/distribution"
    27  	"go.chromium.org/luci/common/tsmon/field"
    28  	"go.chromium.org/luci/common/tsmon/metric"
    29  	tsmon_types "go.chromium.org/luci/common/tsmon/types"
    30  	"go.chromium.org/luci/config"
    31  	"go.chromium.org/luci/logdog/api/logpb"
    32  	"go.chromium.org/luci/logdog/client/pubsubprotocol"
    33  	"go.chromium.org/luci/logdog/common/storage"
    34  	"go.chromium.org/luci/logdog/common/types"
    35  	"go.chromium.org/luci/logdog/server/collector/coordinator"
    36  
    37  	"google.golang.org/protobuf/proto"
    38  )
    39  
    40  const (
    41  	// DefaultMaxMessageWorkers is the default number of concurrent worker
    42  	// goroutones to employ for a single message.
    43  	DefaultMaxMessageWorkers = 4
    44  )
    45  
    46  var (
    47  	// tsBundles tracks the total number of logpb.ButlerLogBundle entries that
    48  	// have been submitted for collection.
    49  	tsBundles = metric.NewCounter("logdog/collector/bundles",
    50  		"The number of individual log entry bundles that have been ingested.",
    51  		nil)
    52  	// tsLogs tracks the number of logpb.LogEntry entries that have been
    53  	// written to intermediate storage.
    54  	tsLogs = metric.NewCounter("logdog/collector/logs",
    55  		"The number of individual log entries that have been ingested.",
    56  		nil)
    57  
    58  	// tsBundleSize tracks the size, in bytes, of a given log bundle.
    59  	tsBundleSize = metric.NewCumulativeDistribution("logdog/collector/bundle/size",
    60  		"The size (in bytes) of the bundle.",
    61  		&tsmon_types.MetricMetadata{Units: tsmon_types.Bytes},
    62  		distribution.DefaultBucketer)
    63  	// tsBundleEntriesPerBundle tracks the number of ButlerLogBundle.Entry entries
    64  	// in each bundle that have been collected.
    65  	tsBundleEntriesPerBundle = metric.NewCumulativeDistribution("logdog/collector/bundle/entries_per_bundle",
    66  		"The number of log bundle entries per bundle.",
    67  		nil,
    68  		distribution.DefaultBucketer)
    69  
    70  	// tsBundleEntries tracks the total number of ButlerLogBundle.Entry entries
    71  	// that have been collected.
    72  	//
    73  	// The "stream" field is the type of log stream for each tracked bundle entry.
    74  	tsBundleEntries = metric.NewCounter("logdog/collector/bundle/entries",
    75  		"The number of Butler bundle entries pulled.",
    76  		nil,
    77  		field.String("stream"))
    78  	tsBundleEntryProcessingTime = metric.NewCumulativeDistribution("logdog/collector/bundle/entry/processing_time_ms",
    79  		"The amount of time in milliseconds that a bundle entry takes to process.",
    80  		&tsmon_types.MetricMetadata{Units: tsmon_types.Milliseconds},
    81  		distribution.DefaultBucketer,
    82  		field.String("stream"))
    83  )
    84  
    85  // Collector is a stateful object responsible for ingesting LogDog logs,
    86  // registering them with a Coordinator, and stowing them in short-term storage
    87  // for streaming and processing.
    88  //
    89  // A Collector's Close should be called when finished to release any internal
    90  // resources.
    91  type Collector struct {
    92  	// Coordinator is used to interface with the Coordinator client.
    93  	//
    94  	// On production systems, this should wrapped with a caching client (see
    95  	// the stateCache sub-package) to avoid overwhelming the server.
    96  	Coordinator coordinator.Coordinator
    97  
    98  	// Storage is the intermediate storage instance to use.
    99  	Storage storage.Storage
   100  
   101  	// StreamStateCacheExpire is the maximum amount of time that a cached stream
   102  	// state entry is valid. If zero, DefaultStreamStateCacheExpire will be used.
   103  	StreamStateCacheExpire time.Duration
   104  
   105  	// MaxMessageWorkers is the maximum number of concurrent workers to employ
   106  	// for any given message. If <= 0, DefaultMaxMessageWorkers will be applied.
   107  	MaxMessageWorkers int
   108  }
   109  
   110  // Process ingests an encoded ButlerLogBundle message, registering it with
   111  // the LogDog Coordinator and stowing it in a temporary Storage for streaming
   112  // retrieval.
   113  //
   114  // If a transient error occurs during ingest, Process will return an error.
   115  // If no error occurred, or if there was an error with the input data, no error
   116  // will be returned.
   117  func (c *Collector) Process(ctx context.Context, msg []byte) error {
   118  	tsBundles.Add(ctx, 1)
   119  	tsBundleSize.Add(ctx, float64(len(msg)))
   120  
   121  	pr := pubsubprotocol.Reader{}
   122  	if err := pr.Read(bytes.NewReader(msg)); err != nil {
   123  		log.WithError(err).Errorf(ctx, "Failed to unpack message.")
   124  		return nil
   125  	}
   126  	if pr.Metadata.ProtoVersion != logpb.Version {
   127  		log.Fields{
   128  			"messageProtoVersion": pr.Metadata.ProtoVersion,
   129  			"currentProtoVersion": logpb.Version,
   130  		}.Errorf(ctx, "Unknown protobuf version.")
   131  		return nil
   132  	}
   133  	if pr.Bundle == nil {
   134  		log.Errorf(ctx, "Protocol message did not contain a Butler bundle.")
   135  		return nil
   136  	}
   137  	ctx = log.SetField(ctx, "prefix", pr.Bundle.Prefix)
   138  
   139  	tsBundleEntriesPerBundle.Add(ctx, float64(len(pr.Bundle.Entries)))
   140  	for i, entry := range pr.Bundle.Entries {
   141  		tsBundleEntries.Add(ctx, 1, streamType(entry.Desc))
   142  
   143  		// If we're logging INFO or higher, log the ranges that this bundle
   144  		// represents.
   145  		if log.IsLogging(ctx, log.Info) {
   146  			fields := log.Fields{
   147  				"index":   i,
   148  				"project": pr.Bundle.Project,
   149  				"path":    entry.GetDesc().Path(),
   150  			}
   151  			if entry.Terminal {
   152  				fields["terminalIndex"] = entry.TerminalIndex
   153  			}
   154  			if logs := entry.GetLogs(); len(logs) > 0 {
   155  				fields["logStart"] = logs[0].StreamIndex
   156  				fields["logEnd"] = logs[len(logs)-1].StreamIndex
   157  			}
   158  
   159  			fields.Infof(ctx, "Processing log bundle entry.")
   160  		}
   161  	}
   162  
   163  	lw := bundleHandler{
   164  		msg: msg,
   165  		md:  pr.Metadata,
   166  		b:   pr.Bundle,
   167  	}
   168  
   169  	lw.project = lw.b.Project
   170  	if err := config.ValidateProjectName(lw.project); err != nil {
   171  		log.Fields{
   172  			log.ErrorKey: err,
   173  			"project":    lw.b.Project,
   174  		}.Errorf(ctx, "Failed to validate bundle project name.")
   175  		return errors.New("invalid bundle project name")
   176  	}
   177  
   178  	if err := types.StreamName(lw.b.Prefix).Validate(); err != nil {
   179  		log.Fields{
   180  			log.ErrorKey: err,
   181  			"prefix":     lw.b.Prefix,
   182  		}.Errorf(ctx, "Failed to validate bundle prefix.")
   183  		return errors.New("invalid bundle prefix")
   184  	}
   185  
   186  	// If there are no entries, there is nothing to do.
   187  	if len(pr.Bundle.Entries) == 0 {
   188  		return nil
   189  	}
   190  
   191  	// Handle each bundle entry in parallel. We will use a separate work pool
   192  	// here so that top-level bundle dispatch can't deadlock the processing tasks.
   193  	workers := c.MaxMessageWorkers
   194  	if workers <= 0 {
   195  		workers = DefaultMaxMessageWorkers
   196  	}
   197  	return parallel.WorkPool(workers, func(taskC chan<- func() error) {
   198  		for _, be := range pr.Bundle.Entries {
   199  			be := be
   200  
   201  			taskC <- func() error {
   202  				return c.processLogStream(ctx, &bundleEntryHandler{
   203  					bundleHandler: &lw,
   204  					be:            be,
   205  				})
   206  			}
   207  		}
   208  	})
   209  }
   210  
   211  // Close releases any internal resources and blocks pending the completion of
   212  // any outstanding operations. After Close, no new Process calls may be made.
   213  func (c *Collector) Close() {
   214  }
   215  
   216  // bundleHandler is a cumulative set of read-only state passed around by
   217  // value for log processing.
   218  type bundleHandler struct {
   219  	// msg is the original message bytes.
   220  	msg []byte
   221  	// md is the metadata associated with the overall message.
   222  	md *logpb.ButlerMetadata
   223  	// b is the Butler bundle.
   224  	b *logpb.ButlerLogBundle
   225  
   226  	// project is the validated project name.
   227  	project string
   228  }
   229  
   230  type bundleEntryHandler struct {
   231  	*bundleHandler
   232  
   233  	// be is the Bundle entry.
   234  	be *logpb.ButlerLogBundle_Entry
   235  	// path is the constructed path of the stream being processed.
   236  	path types.StreamPath
   237  }
   238  
   239  // processLogStream processes an individual set of log messages belonging to the
   240  // same log stream.
   241  func (c *Collector) processLogStream(ctx context.Context, h *bundleEntryHandler) error {
   242  	streamTypeField := streamType(h.be.Desc)
   243  	startTime := clock.Now(ctx)
   244  	defer func() {
   245  		duration := clock.Now(ctx).Sub(startTime)
   246  
   247  		// We track processing time in milliseconds.
   248  		tsBundleEntryProcessingTime.Add(ctx, duration.Seconds()*1000, streamTypeField)
   249  	}()
   250  
   251  	// If this bundle has neither log entries nor a terminal index, it is junk and
   252  	// must be discarded.
   253  	//
   254  	// This is more important than a basic optimization, as it enforces that no
   255  	// zero-entry log streams can be ingested. Either some entries exist, or there
   256  	// is a promise of a terminal entry.
   257  	if len(h.be.Logs) == 0 && !h.be.Terminal {
   258  		log.Warningf(ctx, "Bundle entry is non-terminal and contains no logs; discarding.")
   259  		return nil
   260  	}
   261  
   262  	secret := types.PrefixSecret(h.b.Secret)
   263  	if err := secret.Validate(); err != nil {
   264  		log.Fields{
   265  			log.ErrorKey:   err,
   266  			"secretLength": len(secret),
   267  		}.Errorf(ctx, "Failed to validate prefix secret.")
   268  		return errors.New("invalid prefix secret")
   269  	}
   270  
   271  	// If the descriptor has a Prefix, it must match the bundle's Prefix.
   272  	if p := h.be.Desc.Prefix; p != "" {
   273  		if p != h.b.Prefix {
   274  			log.Fields{
   275  				"bundlePrefix":      h.b.Prefix,
   276  				"bundleEntryPrefix": p,
   277  			}.Errorf(ctx, "Bundle prefix does not match entry prefix.")
   278  			return errors.New("mismatched bundle and entry prefixes")
   279  		}
   280  	} else {
   281  		// Fill in the bundle's Prefix.
   282  		h.be.Desc.Prefix = h.b.Prefix
   283  	}
   284  
   285  	if err := h.be.Desc.Validate(true); err != nil {
   286  		log.WithError(err).Errorf(ctx, "Invalid log stream descriptor.")
   287  		return err
   288  	}
   289  	descBytes, err := proto.Marshal(h.be.Desc)
   290  	if err != nil {
   291  		log.WithError(err).Errorf(ctx, "Failed to marshal descriptor.")
   292  		return err
   293  	}
   294  
   295  	h.path = types.StreamName(h.be.Desc.Prefix).Join(types.StreamName(h.be.Desc.Name))
   296  	ctx = log.SetFields(ctx, log.Fields{
   297  		"project": h.project,
   298  		"path":    h.path,
   299  	})
   300  
   301  	// Confirm that the log entries are valid and contiguous. Serialize the log
   302  	// entries for ingest as we validate them.
   303  	var logData [][]byte
   304  	var blockIndex uint64
   305  	if logs := h.be.Logs; len(logs) > 0 {
   306  		logData = make([][]byte, len(logs))
   307  		blockIndex = logs[0].StreamIndex
   308  
   309  		for i, le := range logs {
   310  			// Validate this log entry.
   311  			if err := le.Validate(h.be.Desc); err != nil {
   312  				log.Fields{
   313  					log.ErrorKey: err,
   314  					"index":      le.StreamIndex,
   315  				}.Warningf(ctx, "Discarding invalid log entry.")
   316  				return errors.New("invalid log entry")
   317  			}
   318  
   319  			// Validate that this entry is contiguous.
   320  			if le.StreamIndex != blockIndex+uint64(i) {
   321  				log.Fields{
   322  					"index":    i,
   323  					"expected": (blockIndex + uint64(i)),
   324  					"actual":   le.StreamIndex,
   325  				}.Errorf(ctx, "Non-contiguous log entry block in stream.")
   326  				return errors.New("non-contiguous log entry block")
   327  			}
   328  
   329  			var err error
   330  			logData[i], err = proto.Marshal(le)
   331  			if err != nil {
   332  				log.Fields{
   333  					log.ErrorKey: err,
   334  					"index":      le.StreamIndex,
   335  				}.Errorf(ctx, "Failed to marshal log entry.")
   336  				return errors.New("failed to marshal log entries")
   337  			}
   338  		}
   339  	}
   340  
   341  	// Fetch our cached/remote state. This will replace our state object with the
   342  	// fetched state, so any future calls will need to re-set the Secret value.
   343  	// TODO: Use timeout?
   344  	registerReq := coordinator.LogStreamState{
   345  		Project:       h.project,
   346  		Path:          h.path,
   347  		Secret:        secret,
   348  		ProtoVersion:  h.md.ProtoVersion,
   349  		TerminalIndex: -1,
   350  	}
   351  	if h.be.Terminal {
   352  		registerReq.TerminalIndex = types.MessageIndex(h.be.TerminalIndex)
   353  	}
   354  	state, err := c.Coordinator.RegisterStream(ctx, &registerReq, descBytes)
   355  	if err != nil {
   356  		log.WithError(err).Errorf(ctx, "Failed to get/register current stream state.")
   357  		return err
   358  	}
   359  
   360  	// Does the log stream's secret match the expected secret?
   361  	//
   362  	// Note that this check does NOT use the "subtle" package to do time-constant
   363  	// byte comparison, and may leak information about the secret. This is OK,
   364  	// since users cannot interact with this service directly; however, if this
   365  	// code is ever used elsewhere, this should be a consideration.
   366  	if !bytes.Equal([]byte(secret), []byte(state.Secret)) {
   367  		log.Errorf(log.SetFields(ctx, log.Fields{
   368  			"secret":         secret,
   369  			"expectedSecret": state.Secret,
   370  		}), "Log entry has incorrect secret.")
   371  		return nil
   372  	}
   373  
   374  	if state.Archived {
   375  		log.Infof(ctx, "Skipping message bundle for archived stream.")
   376  		return nil
   377  	}
   378  	if state.Purged {
   379  		log.Infof(ctx, "Skipping message bundle for purged stream.")
   380  		return nil
   381  	}
   382  
   383  	// Update our terminal index if we have one.
   384  	//
   385  	// Note that even if our cached value is marked terminal, we could have failed
   386  	// to push the terminal index to the Coordinator, so we will not refrain from
   387  	// pushing every terminal index encountered regardless of cache state.
   388  	if h.be.Terminal {
   389  		tidx := types.MessageIndex(h.be.TerminalIndex)
   390  
   391  		// Bundle includes terminal index.
   392  
   393  		if state.TerminalIndex < 0 {
   394  			state.TerminalIndex = tidx
   395  		} else if state.TerminalIndex != tidx {
   396  			log.Fields{
   397  				"cachedIndex": state.TerminalIndex,
   398  				"bundleIndex": tidx,
   399  			}.Warningf(ctx, "Cached terminal index disagrees with state.")
   400  		}
   401  	}
   402  
   403  	// Perform stream processing operations. We can do these operations in
   404  	// parallel.
   405  	return parallel.FanOutIn(func(taskC chan<- func() error) {
   406  		// Store log data, if any was provided. It has already been validated.
   407  		if len(logData) > 0 {
   408  			taskC <- func() error {
   409  				// Post the log to storage.
   410  				err = c.Storage.Put(ctx, storage.PutRequest{
   411  					Project: h.project,
   412  					Path:    h.path,
   413  					Index:   types.MessageIndex(blockIndex),
   414  					Values:  logData,
   415  				})
   416  
   417  				// If the log entry already exists, consider the "put" successful.
   418  				// Storage will return a transient error if one occurred.
   419  				if err != nil && err != storage.ErrExists {
   420  					log.Fields{
   421  						log.ErrorKey: err,
   422  						"blockIndex": blockIndex,
   423  					}.Errorf(ctx, "Failed to load log entry into Storage.")
   424  					return err
   425  				}
   426  
   427  				tsLogs.Add(ctx, int64(len(logData)))
   428  				return nil
   429  			}
   430  		}
   431  
   432  		// If our bundle entry is terminal, we have an additional task of reporting
   433  		// this to the Coordinator.
   434  		if h.be.Terminal {
   435  			taskC <- func() error {
   436  				// Sentinel task: Update the terminal bundle state.
   437  				treq := coordinator.TerminateRequest{
   438  					Project:       state.Project,
   439  					Path:          state.Path,
   440  					ID:            state.ID,
   441  					Secret:        state.Secret,
   442  					TerminalIndex: types.MessageIndex(h.be.TerminalIndex),
   443  				}
   444  
   445  				log.Fields{
   446  					"terminalIndex": state.TerminalIndex,
   447  				}.Infof(ctx, "Received terminal log; updating Coordinator state.")
   448  				if err := c.Coordinator.TerminateStream(ctx, &treq); err != nil {
   449  					log.WithError(err).Errorf(ctx, "Failed to set stream terminal index.")
   450  					return err
   451  				}
   452  				return nil
   453  			}
   454  		}
   455  	})
   456  }
   457  
   458  func streamType(desc *logpb.LogStreamDescriptor) string {
   459  	if desc == nil {
   460  		return "UNKNOWN"
   461  	}
   462  	return desc.StreamType.String()
   463  }