go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/luciexe/host/buildmerge/agent.go (about)

     1  // Copyright 2019 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package buildmerge implements the build.proto tracking and merging logic for
    16  // luciexe host applications.
    17  //
    18  // You probably want to use `go.chromium.org/luci/luciexe/host` instead.
    19  //
     20  // This package is separate from luciexe/host to avoid unnecessary entanglement
     21  // with butler/logdog; all the logic here is implemented to avoid:
    22  //
    23  //   - interacting with the environment
    24  //   - interacting with butler/logdog (except by implementing callbacks for
    25  //     those, but only acting on simple datastructures/proto messages)
    26  //   - handling errors in any 'brutal' ways (all errors in this package are
    27  //     handled by reporting them directly in the data structures that this
    28  //     package manipulates).
    29  //
    30  // This is done to simplify testing (as much as it can be) by concentrating all
    31  // the environment stuff into luciexe/host, and all the 'pure' functional stuff
    32  // here (search "imperative shell, functional core").
    33  package buildmerge
    34  
    35  import (
    36  	"context"
    37  	"fmt"
    38  	"strings"
    39  	"sync"
    40  	"sync/atomic"
    41  
    42  	"github.com/golang/protobuf/ptypes"
    43  	"google.golang.org/protobuf/proto"
    44  	"google.golang.org/protobuf/types/known/timestamppb"
    45  
    46  	bbpb "go.chromium.org/luci/buildbucket/proto"
    47  	"go.chromium.org/luci/common/clock"
    48  	"go.chromium.org/luci/common/data/stringset"
    49  	"go.chromium.org/luci/common/errors"
    50  	"go.chromium.org/luci/common/proto/reflectutil"
    51  	"go.chromium.org/luci/common/sync/dispatcher"
    52  	"go.chromium.org/luci/common/sync/dispatcher/buffer"
    53  	"go.chromium.org/luci/logdog/api/logpb"
    54  	"go.chromium.org/luci/logdog/client/butler"
    55  	"go.chromium.org/luci/logdog/common/types"
    56  	"go.chromium.org/luci/luciexe"
    57  )
    58  
// CalcURLFn is a stateless function which can calculate the absolute url and
// viewUrl from a given logdog namespace (with trailing slash) and streamName.
//
// Within this file it is also invoked with an empty namespace and a full
// stream name in order to derive the keys of Agent.states.
type CalcURLFn func(namespaceSlash, streamName types.StreamName) (url, viewUrl string)
    62  
// Agent holds all the logic around merging build.proto streams.
//
// Create one with New, hook it up to a Butler with Attach, consume
// MergedBuildC, and call Close when done.
type Agent struct {
	// MergedBuildC is the channel of all the merged builds generated by this
	// Agent.
	//
	// The rate at which Agent merges Builds is governed by the consumption of
	// this channel; Consuming it slowly will have Agent merge less frequently,
	// and consuming it rapidly will have Agent merge more frequently.
	//
	// The last build before the channel closes will always be the final state of
	// all builds at the time this Agent was Close()'d.
	MergedBuildC <-chan *bbpb.Build

	// DrainC can be waited on for the Agent to drain. It will only drain after
	// calling Close() at least once.
	//
	// It is the DrainC of the internal mergeCh dispatcher channel.
	DrainC <-chan struct{}

	// ctx is used to cancel in-progress sendMerge calls. It is also handed to
	// every buildStateTracker and used as the clock source in clockNow.
	ctx context.Context

	// mergedBuildC is the send side of MergedBuildC. It is closed by finalize.
	mergedBuildC chan<- *bbpb.Build

	// userNamespace is the logdog namespace (with a trailing slash) which we'll
	// use to determine if a new stream is potentially monitored, or not.
	userNamespace types.StreamName

	// userRootURL is the full url ('logdog://.../stream/build.proto') of the
	// user's "root" build.proto stream (i.e. the one emitted by the top level
	// luciexe implementation).
	//
	// This is used as a key to start the merge process.
	//
	// baseBuild is the Agent's private clone of the `base` Build passed to New;
	// sendMerge starts every merged build from a shallow copy of it.
	userRootURL string
	baseBuild   *bbpb.Build

	// statesMu covers `states`. It must be held when reading or writing to
	// `states`, but doesn't need to be held while interacting with an individual
	// *buildStateTracker obtained from the map.
	statesMu sync.RWMutex

	// states maps a stream URL (i.e. `logdog://.../stream/build.proto`) to the
	// state tracker for that stream.
	states map[string]*buildStateTracker

	// mergeCh is used in production mode to send pings via informNewData. Its
	// send function is sendMerge and its drained function is finalize.
	mergeCh dispatcher.Channel

	// informNewData is used to 'ping' mergeCh; it's overwritten in tests.
	informNewData func()

	// done is an atomically-accessed boolean (0 or 1). Close sets it to 1 and
	// collectingData reads it.
	done int32

	// calculateURLs is a function which can convert a logdog namespace and
	// streamname into both the full 'Url' and 'ViewUrl' values for a Log message.
	// This is used by the Agent itself when deriving keys for the `states`
	// map, as well as for individual buildStateTracker objects to adjust their
	// build's logs' URLs.
	calculateURLs CalcURLFn
}
   123  
   124  // New returns a new Agent.
   125  //
   126  // Args:
   127  //   - ctx - used for logging, clock and cancelation. When canceled, the Agent
   128  //     will cease sending updates on MergedBuildC, but you must still invoke
   129  //     Agent.Close() in order to clean up all resources associated with the
   130  //     Agent.
   131  //   - userNamespace - The logdog namespace (with a trailing slash) under which
   132  //     we should monitor streams.
   133  //   - base - The "model" Build message that all generated builds should start
   134  //     with. All build proto streams will be merged onto a copy of this message.
   135  //     Any Output.Log's which have non-absolute URLs will have their Url and
   136  //     ViewUrl absolutized relative to userNamespace using calculateURLs.
   137  //   - calculateURLs - A function to calculate Log.Url and Log.ViewUrl values.
   138  //     Should be a pure function.
   139  //
   140  // The following fields will be merged into `base` from the user controlled
   141  // build.proto stream(s):
   142  //
   143  //	Steps
   144  //	SummaryMarkdown
   145  //	Status
   146  //	StatusDetails
   147  //	UpdateTime
   148  //	Tags
   149  //	EndTime
   150  //	Output
   151  //
   152  // The frequency of updates from this Agent is governed by how quickly the
   153  // caller consumes from Agent.MergedBuildC.
   154  func New(ctx context.Context, userNamespace types.StreamName, base *bbpb.Build, calculateURLs CalcURLFn) (*Agent, error) {
   155  	userNamespace = userNamespace.AsNamespace()
   156  
   157  	ch := make(chan *bbpb.Build)
   158  	userRootURL, _ := calculateURLs(userNamespace, luciexe.BuildProtoStreamSuffix)
   159  
   160  	ret := &Agent{
   161  		ctx: ctx,
   162  
   163  		MergedBuildC: ch,
   164  
   165  		mergedBuildC:  ch,
   166  		states:        map[string]*buildStateTracker{},
   167  		calculateURLs: calculateURLs,
   168  		userNamespace: userNamespace,
   169  		userRootURL:   userRootURL,
   170  		baseBuild:     proto.Clone(base).(*bbpb.Build),
   171  	}
   172  	for _, log := range ret.baseBuild.GetOutput().GetLogs() {
   173  		var err error
   174  		log.Url, log.ViewUrl, err = absolutizeURLs(log.Url, log.ViewUrl, userNamespace, calculateURLs)
   175  		if err != nil {
   176  			return nil, errors.Annotate(err, "build.output.logs[%q]", log.Name).Err()
   177  		}
   178  	}
   179  
   180  	var err error
   181  	ret.mergeCh, err = dispatcher.NewChannel(ctx, &dispatcher.Options{
   182  		Buffer: buffer.Options{
   183  			MaxLeases:     1,
   184  			BatchItemsMax: 1,
   185  			FullBehavior:  &buffer.DropOldestBatch{},
   186  		},
   187  		DropFn:    dispatcher.DropFnQuiet,
   188  		DrainedFn: ret.finalize,
   189  	}, ret.sendMerge)
   190  	if err != nil {
   191  		return nil, err // creating dispatcher with static config should never fail
   192  	}
   193  	ret.informNewData = func() {
   194  		ret.mergeCh.C <- nil // content doesn't matter
   195  	}
   196  	ret.DrainC = ret.mergeCh.DrainC
   197  
   198  	return ret, nil
   199  }
   200  
   201  // Attach should be called once to attach this to a Butler.
   202  //
   203  // This must be done before the butler receives any build.proto streams.
   204  func (a *Agent) Attach(b *butler.Butler) {
   205  	b.AddStreamRegistrationCallback(a.onNewStream, true)
   206  }
   207  
// validContentTypes is the set of stream content types which onNewStream
// accepts for a build proto stream: the plain build proto content type and its
// zlib variant.
var validContentTypes = stringset.NewFromSlice(
	luciexe.BuildProtoContentType,
	luciexe.BuildProtoZlibContentType,
)
   212  
   213  func (a *Agent) onNewStream(desc *logpb.LogStreamDescriptor) butler.StreamChunkCallback {
   214  	if !a.collectingData() {
   215  		return nil
   216  	}
   217  
   218  	namespace, base := types.StreamName(desc.Name).Split()
   219  
   220  	var err error
   221  	zlib := false
   222  	switch validStreamT, validContentT := desc.StreamType == logpb.StreamType_DATAGRAM, validContentTypes.Has(desc.ContentType); {
   223  	case validStreamT && validContentT:
   224  		zlib = desc.ContentType == luciexe.BuildProtoZlibContentType
   225  	case validStreamT && !validContentT:
   226  		err = errors.Reason("stream %q has content type %q, expected one of %v", desc.Name, desc.ContentType, validContentTypes.ToSortedSlice()).Err()
   227  	case !validStreamT && validContentT:
   228  		err = errors.Reason("build proto stream %q has type %q, expected %q", desc.Name, desc.StreamType, logpb.StreamType_DATAGRAM).Err()
   229  	case strings.HasPrefix(desc.Name, string(a.userNamespace)) && base == luciexe.BuildProtoStreamSuffix:
   230  		err = errors.Reason("build.proto stream %q has stream type %q and content type %q, expected %q and one of %v", desc.Name, desc.StreamType, desc.ContentType, logpb.StreamType_DATAGRAM, validContentTypes.ToSortedSlice()).Err()
   231  	default:
   232  		// neither a ".../build.proto" stream nor a stream with valid stream type
   233  		// or content type.
   234  		return nil
   235  	}
   236  
   237  	url, _ := a.calculateURLs("", types.StreamName(desc.Name))
   238  	bState := newBuildStateTracker(a.ctx, a, namespace, zlib, err)
   239  
   240  	a.statesMu.Lock()
   241  	defer a.statesMu.Unlock()
   242  	a.states[url] = bState
   243  	if err == nil {
   244  		return bState.handleNewData
   245  	}
   246  	return nil // no need to handle invalid stream.
   247  }
   248  
   249  // Close causes the Agent to stop collecting data, emit a final merged build,
   250  // and then shut down all internal routines.
   251  func (a *Agent) Close() {
   252  	// stops accepting new trackers
   253  	if atomic.SwapInt32(&a.done, 1) == 1 {
   254  		return
   255  	}
   256  
   257  	// close all states' and process their final work items. Closure should be
   258  	// very quick and will activate all final processing in parallel. GetFinal
   259  	// ensures that the state is completely settled.
   260  	states := a.snapStates()
   261  	for _, t := range states {
   262  		t.Close()
   263  	}
   264  	for _, t := range states {
   265  		t.Drain()
   266  	}
   267  
   268  	// tells our merge Channel to process all the current (now-final) states one
   269  	// last time.
   270  	a.informNewData()
   271  
   272  	// shut down the mergeCh so it will no longer accept new informNewData calls.
   273  	a.mergeCh.Close()
   274  }
   275  
   276  func (a *Agent) snapStates() map[string]*buildStateTracker {
   277  	a.statesMu.RLock()
   278  	trackers := make(map[string]*buildStateTracker, len(a.states))
   279  	for k, v := range a.states {
   280  		trackers[k] = v
   281  	}
   282  	a.statesMu.RUnlock()
   283  	return trackers
   284  }
   285  
   286  func (a *Agent) sendMerge(_ *buffer.Batch) error {
   287  	trackers := a.snapStates()
   288  
   289  	builds := make(map[string]*bbpb.Build, len(trackers))
   290  	stepCount := 0
   291  	for k, v := range trackers {
   292  		build := v.getLatestBuild()
   293  		stepCount += len(build.GetSteps())
   294  		builds[k] = build
   295  	}
   296  
   297  	base := reflectutil.ShallowCopy(a.baseBuild).(*bbpb.Build)
   298  	base.Steps = nil
   299  	if stepCount > 0 {
   300  		base.Steps = make([]*bbpb.Step, 0, stepCount)
   301  	}
   302  
   303  	var insertSteps func(stepNS []string, streamURL string, fromSubBuild bool) *bbpb.Build
   304  	insertSteps = func(stepNS []string, streamURL string, fromSubBuild bool) *bbpb.Build {
   305  		build, ok := builds[streamURL]
   306  		if !ok {
   307  			return nil
   308  		}
   309  		for _, step := range build.GetSteps() {
   310  			mb := step.GetMergeBuild()
   311  			mergeStream := mb.GetFromLogdogStream()
   312  			if mergeStream != "" || len(stepNS) > 0 || fromSubBuild {
   313  				step = proto.Clone(step).(*bbpb.Step)
   314  			}
   315  			baseName := step.Name
   316  			if len(stepNS) > 0 {
   317  				step.Name = strings.Join(append(stepNS, step.Name), "|")
   318  			}
   319  
   320  			base.Steps = append(base.Steps, step)
   321  
   322  			if mergeStream != "" {
   323  				var subNamespace []string
   324  				if !mb.LegacyGlobalNamespace {
   325  					subNamespace = append(stepNS, baseName)
   326  				}
   327  				subBuild := insertSteps(subNamespace, mergeStream, true)
   328  				if subBuild == nil {
   329  					var sb strings.Builder
   330  					if step.SummaryMarkdown != "" {
   331  						sb.WriteString(step.SummaryMarkdown)
   332  						sb.WriteString("\n\n")
   333  					}
   334  					if _, ok := builds[mergeStream]; ok {
   335  						sb.WriteString(fmt.Sprintf("build.proto stream: %q is empty", mergeStream))
   336  					} else {
   337  						sb.WriteString(fmt.Sprintf("build.proto stream: %q is not registered", mergeStream))
   338  					}
   339  					step.SummaryMarkdown = sb.String()
   340  				} else {
   341  					updateStepFromBuild(step, subBuild)
   342  					if mb.LegacyGlobalNamespace {
   343  						updateBuildFromGlobalSubBuild(build, subBuild)
   344  					}
   345  				}
   346  			}
   347  		}
   348  		return build
   349  	}
   350  	updateBaseFromUserBuild(base, insertSteps(nil, a.userRootURL, false))
   351  
   352  	select {
   353  	case a.mergedBuildC <- base:
   354  	case <-a.ctx.Done():
   355  		a.Close()
   356  	}
   357  
   358  	return nil
   359  }
   360  
   361  func (a *Agent) finalize() {
   362  	close(a.mergedBuildC)
   363  }
   364  
   365  func (a *Agent) collectingData() bool {
   366  	return atomic.LoadInt32(&a.done) == 0
   367  }
   368  
   369  // Used for minting protobuf timestamps for buildStateTrackers
   370  func (a *Agent) clockNow() *timestamppb.Timestamp {
   371  	ret, err := ptypes.TimestampProto(clock.Now(a.ctx))
   372  	if err != nil {
   373  		panic(err)
   374  	}
   375  	return ret
   376  }