github.com/bazelbuild/remote-apis-sdks@v0.0.0-20240425170053-8a36686a6350/go/pkg/client/exec.go (about)

     1  package client
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"io"
     7  	"sort"
     8  	"time"
     9  
    10  	"github.com/bazelbuild/remote-apis-sdks/go/pkg/digest"
    11  	log "github.com/golang/glog"
    12  	"google.golang.org/grpc/codes"
    13  	"google.golang.org/grpc/status"
    14  	"google.golang.org/protobuf/proto"
    15  
    16  	// Redundant imports are required for the google3 mirror. Aliases should not be changed.
    17  	regrpc "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2"
    18  	repb "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2"
    19  	gerrors "github.com/pkg/errors"
    20  	oppb "google.golang.org/genproto/googleapis/longrunning"
    21  	dpb "google.golang.org/protobuf/types/known/durationpb"
    22  )
    23  
    24  const (
    25  	containerImagePropertyName = "container-image"
    26  )
    27  
// Action encodes the full details of an action to be sent to the remote execution service for
// execution. It corresponds roughly, but not exactly, to the Action proto used by the Remote
// Execution API: the fields here are split between the Command proto and the Action proto when the
// action is prepared for execution.
type Action struct {
	// Args are the command-line arguments to start the process. The first argument is the process
	// name, and the rest are its arguments.
	Args []string
	// EnvVars are the variables to add to the process's environment, keyed by variable name.
	EnvVars map[string]string
	// InputRoot and InputFiles contain the details of the input tree, in remote execution format:
	// InputRoot is the digest of the root of the tree, and InputFiles maps the digest of each blob
	// to be uploaded to that blob's contents. They should normally be constructed through the
	// PackageTree function.
	InputRoot  digest.Digest
	InputFiles map[digest.Digest][]byte
	// OutputFiles is a list of output files requested (full paths).
	OutputFiles []string
	// OutputDirs is a list of output directories requested (full paths).
	OutputDirs []string
	// DockerImage is a docker:// URL to the docker image in which execution will take place.
	DockerImage string
	// Timeout is the maximum execution time for the action. Note that it's not an overall timeout on
	// the process, since there may be additional time for transferring files, waiting for a worker to
	// become available, or other overhead.
	//
	// If 0, the server's default timeout is used.
	Timeout time.Duration
	// DoNotCache, if true, indicates that the result of this action should never be cached. It
	// implies SkipCache.
	DoNotCache bool
	// SkipCache, if true, indicates that this action should be executed even if there is a copy of
	// its result in the action cache that could be used instead.
	SkipCache bool
}
    60  
    61  // ExecuteAction performs all of the steps necessary to execute an action, including checking the
    62  // cache if applicable, uploading necessary protos and inputs to the CAS, queueing the action, and
    63  // waiting for the result.
    64  //
    65  // Execute may block for a long time while the action is in progress. Currently, two-phase
    66  // queue-wait is not supported; the token necessary to query the job is not provided to users.
    67  //
    68  // This method MAY return a non-nil ActionResult along with a non-nil error if the action failed.
    69  // The ActionResult may include, for example, the stdout/stderr digest from the attempt.
    70  //
    71  // ExecuteAction is a convenience method which wraps both PrepAction and ExecuteAndWait, along with
    72  // other steps such as uploading extra inputs and parsing Operation protos.
    73  func (c *Client) ExecuteAction(ctx context.Context, ac *Action) (*repb.ActionResult, error) {
    74  	log.V(1).Infof("Executing action: %v", ac.Args)
    75  
    76  	// Construct the action we're trying to run.
    77  	acDg, res, err := c.PrepAction(ctx, ac)
    78  	if err != nil {
    79  		return nil, err
    80  	}
    81  	// If we found a result in the cache, return that.
    82  	if res != nil {
    83  		return res, nil
    84  	}
    85  
    86  	// Upload any remaining inputs.
    87  	if err := c.WriteBlobs(ctx, ac.InputFiles); err != nil {
    88  		return nil, gerrors.WithMessage(err, "uploading input files to the CAS")
    89  	}
    90  
    91  	log.V(1).Info("Executing job")
    92  	res, err = c.executeJob(ctx, ac.SkipCache, acDg)
    93  	if err != nil {
    94  		return res, gerrors.WithMessage(err, "executing an action")
    95  	}
    96  
    97  	return res, nil
    98  }
    99  
   100  // CheckActionCache queries remote action cache, returning an ActionResult or nil if it doesn't exist.
   101  func (c *Client) CheckActionCache(ctx context.Context, acDg *repb.Digest) (*repb.ActionResult, error) {
   102  	res, err := c.GetActionResult(ctx, &repb.GetActionResultRequest{
   103  		InstanceName: c.InstanceName,
   104  		ActionDigest: acDg,
   105  	})
   106  	switch st, _ := status.FromError(err); st.Code() {
   107  	case codes.OK:
   108  		return res, nil
   109  	case codes.NotFound:
   110  		return nil, nil
   111  	default:
   112  		return nil, gerrors.WithMessage(err, "checking the action cache")
   113  	}
   114  }
   115  
   116  func (c *Client) executeJob(ctx context.Context, skipCache bool, acDg *repb.Digest) (*repb.ActionResult, error) {
   117  	execReq := &repb.ExecuteRequest{
   118  		InstanceName:    c.InstanceName,
   119  		SkipCacheLookup: skipCache,
   120  		ActionDigest:    acDg,
   121  	}
   122  	op, err := c.ExecuteAndWait(ctx, execReq)
   123  	if err != nil {
   124  		return nil, gerrors.WithMessage(err, "execution error")
   125  	}
   126  
   127  	switch r := op.Result.(type) {
   128  	case *oppb.Operation_Error:
   129  		return nil, StatusDetailedError(status.FromProto(r.Error))
   130  	case *oppb.Operation_Response:
   131  		res := new(repb.ExecuteResponse)
   132  		if err := r.Response.UnmarshalTo(res); err != nil {
   133  			return nil, gerrors.WithMessage(err, "extracting ExecuteResponse from execution operation")
   134  		}
   135  		if st := status.FromProto(res.Status); st.Code() != codes.OK {
   136  			return res.Result, gerrors.WithMessage(StatusDetailedError(st), "job failed with error")
   137  		}
   138  		return res.Result, nil
   139  	default:
   140  		return nil, errors.New("unexpected operation result type")
   141  	}
   142  }
   143  
   144  // PrepAction constructs the Command and Action protos, checks the action cache if appropriate, and
   145  // uploads the action if the cache was not checked or if there was no cache hit. If successful,
   146  // PrepAction returns the digest of the Action and a (possibly nil) pointer to an ActionResult
   147  // representing the result of the cache check, if any.
   148  func (c *Client) PrepAction(ctx context.Context, ac *Action) (*repb.Digest, *repb.ActionResult, error) {
   149  	comDg, err := c.WriteProto(ctx, buildCommand(ac))
   150  	if err != nil {
   151  		return nil, nil, gerrors.WithMessage(err, "storing Command proto")
   152  	}
   153  
   154  	reAc := &repb.Action{
   155  		CommandDigest:   comDg.ToProto(),
   156  		InputRootDigest: ac.InputRoot.ToProto(),
   157  		DoNotCache:      ac.DoNotCache,
   158  	}
   159  	// Only set timeout if it's non-zero, because Timeout needs to be nil for the server to use a
   160  	// default.
   161  	if ac.Timeout != 0 {
   162  		reAc.Timeout = dpb.New(ac.Timeout)
   163  	}
   164  
   165  	acBlob, err := proto.Marshal(reAc)
   166  	if err != nil {
   167  		return nil, nil, gerrors.WithMessage(err, "marshalling Action proto")
   168  	}
   169  	acDg := digest.NewFromBlob(acBlob).ToProto()
   170  
   171  	// If the result is cacheable, check if it's already in the cache.
   172  	if !ac.DoNotCache || !ac.SkipCache {
   173  		log.V(1).Info("Checking cache")
   174  		res, err := c.CheckActionCache(ctx, acDg)
   175  		if err != nil {
   176  			return nil, nil, err
   177  		}
   178  		if res != nil {
   179  			return acDg, res, nil
   180  		}
   181  	}
   182  
   183  	// No cache hit, or we didn't check. Upload the action instead.
   184  	if _, err := c.WriteBlob(ctx, acBlob); err != nil {
   185  		return nil, nil, gerrors.WithMessage(err, "uploading action to the CAS")
   186  	}
   187  
   188  	return acDg, nil, nil
   189  }
   190  
   191  func buildCommand(ac *Action) *repb.Command {
   192  	cmd := &repb.Command{
   193  		Arguments: ac.Args,
   194  		// Do not use OutputFiles and OutputDirs directly from the Action, as we need to sort them which
   195  		// implies modification.
   196  		OutputFiles:       make([]string, len(ac.OutputFiles)),
   197  		OutputDirectories: make([]string, len(ac.OutputDirs)),
   198  		Platform: &repb.Platform{
   199  			Properties: []*repb.Platform_Property{{Name: containerImagePropertyName, Value: ac.DockerImage}},
   200  		},
   201  	}
   202  	copy(cmd.OutputFiles, ac.OutputFiles)
   203  	copy(cmd.OutputDirectories, ac.OutputDirs)
   204  	sort.Strings(cmd.OutputFiles)
   205  	sort.Strings(cmd.OutputDirectories)
   206  	for name, val := range ac.EnvVars {
   207  		cmd.EnvironmentVariables = append(cmd.EnvironmentVariables, &repb.Command_EnvironmentVariable{Name: name, Value: val})
   208  	}
   209  	sort.Slice(cmd.EnvironmentVariables, func(i, j int) bool { return cmd.EnvironmentVariables[i].Name < cmd.EnvironmentVariables[j].Name })
   210  	return cmd
   211  }
   212  
   213  // ExecuteAndWait calls Execute on the underlying client and WaitExecution if necessary. It returns
   214  // the completed operation or an error.
   215  //
   216  // The retry logic is complicated. Assuming retries are enabled, we want the retry to call
   217  // WaitExecution if there's an Operation "in progress", and to call Execute otherwise. In practice
   218  // that means:
   219  //  1. If an error occurs before the first operation is returned, or after the final operation is
   220  //     returned (i.e. the one with op.Done==true), retry by calling Execute again.
   221  //  2. Otherwise, retry by calling WaitExecution with the last operation name.
   222  //
   223  // In addition, we want the retrier to trigger based on certain operation statuses as well as on
   224  // explicit errors. (The shouldRetry function knows which statuses.) We do this by mapping statuses,
   225  // if present, to errors inside the closure and then throwing away such "fake" errors outside the
   226  // closure (if we ran out of retries or if there was never a retrier enabled). The exception is
   227  // deadline-exceeded statuses, which we never give to the retrier (and hence will always propagate
   228  // directly to the caller).
   229  func (c *Client) ExecuteAndWait(ctx context.Context, req *repb.ExecuteRequest) (op *oppb.Operation, err error) {
   230  	return c.ExecuteAndWaitProgress(ctx, req, nil)
   231  }
   232  
   233  // ExecuteAndWaitProgress calls Execute on the underlying client and WaitExecution if necessary. It returns
   234  // the completed operation or an error.
   235  // The supplied callback function is called for each message received to update the state of
   236  // the remote action.
   237  func (c *Client) ExecuteAndWaitProgress(ctx context.Context, req *repb.ExecuteRequest, progress func(metadata *repb.ExecuteOperationMetadata)) (op *oppb.Operation, err error) {
   238  	wait := false    // Should we retry by calling WaitExecution instead of Execute?
   239  	opError := false // Are we propagating an Operation status as an error for the retrier's benefit?
   240  	lastOp := &oppb.Operation{}
   241  	closure := func(ctx context.Context) (e error) {
   242  		var res regrpc.Execution_ExecuteClient
   243  		if wait {
   244  			res, e = c.WaitExecution(ctx, &repb.WaitExecutionRequest{Name: lastOp.Name})
   245  		} else {
   246  			res, e = c.Execute(ctx, req)
   247  		}
   248  		if e != nil {
   249  			return e
   250  		}
   251  		for {
   252  			op, e := res.Recv()
   253  			if e == io.EOF {
   254  				break
   255  			}
   256  			if e != nil {
   257  				return e
   258  			}
   259  			wait = !op.Done
   260  			lastOp = op
   261  			if progress != nil {
   262  				metadata := &repb.ExecuteOperationMetadata{}
   263  				if err := op.Metadata.UnmarshalTo(metadata); err == nil {
   264  					progress(metadata)
   265  				}
   266  			}
   267  		}
   268  		st := OperationStatus(lastOp)
   269  		if st != nil {
   270  			opError = true
   271  			if st.Code() == codes.DeadlineExceeded {
   272  				return nil
   273  			}
   274  			return st.Err()
   275  		}
   276  		return nil
   277  	}
   278  	err = c.Retrier.Do(ctx, func() error { return c.CallWithTimeout(ctx, "Execute", closure) })
   279  	if err != nil && !opError {
   280  		if st, ok := status.FromError(err); ok {
   281  			err = StatusDetailedError(st)
   282  		}
   283  		return nil, err
   284  	}
   285  
   286  	// In the off chance that the server closes the stream immediately without returning any Operation
   287  	// values and without returning an error, then lastOp will never be modified. Alternatively
   288  	// the server could return an empty operation explicitly prior to closing the stream. Either
   289  	// case is a server error.
   290  	if proto.Equal(lastOp, &oppb.Operation{}) {
   291  		return nil, errors.New("unexpected server behaviour: an empty Operation was returned, or no operation was returned")
   292  	}
   293  
   294  	return lastOp, nil
   295  }
   296  
   297  // OperationStatus returns an operation error status, if it is present, and nil otherwise.
   298  func OperationStatus(op *oppb.Operation) *status.Status {
   299  	var r *oppb.Operation_Response
   300  	var ok bool
   301  	if r, ok = op.Result.(*oppb.Operation_Response); !ok || r == nil {
   302  		return nil
   303  	}
   304  	respv2 := &repb.ExecuteResponse{}
   305  	if err := r.Response.UnmarshalTo(respv2); err != nil {
   306  		return nil
   307  	}
   308  	if s, ok := status.FromError(status.FromProto(respv2.Status).Err()); ok {
   309  		return s
   310  	}
   311  	return nil
   312  }