github.com/bazelbuild/remote-apis-sdks@v0.0.0-20240425170053-8a36686a6350/go/pkg/client/exec.go (about) 1 package client 2 3 import ( 4 "context" 5 "errors" 6 "io" 7 "sort" 8 "time" 9 10 "github.com/bazelbuild/remote-apis-sdks/go/pkg/digest" 11 log "github.com/golang/glog" 12 "google.golang.org/grpc/codes" 13 "google.golang.org/grpc/status" 14 "google.golang.org/protobuf/proto" 15 16 // Redundant imports are required for the google3 mirror. Aliases should not be changed. 17 regrpc "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2" 18 repb "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2" 19 gerrors "github.com/pkg/errors" 20 oppb "google.golang.org/genproto/googleapis/longrunning" 21 dpb "google.golang.org/protobuf/types/known/durationpb" 22 ) 23 24 const ( 25 containerImagePropertyName = "container-image" 26 ) 27 28 // Action encodes the full details of an action to be sent to the remote execution service for 29 // execution. It corresponds roughly, but not exactly, to the Action proto used by the Remote 30 // Execution API. 31 type Action struct { 32 // Args are the command-line arguments to start the process. The first argument is the process 33 // name, and the rest are its arguments. 34 Args []string 35 // EnvVars are the variables to add to the process's environment. 36 EnvVars map[string]string 37 // InputRoot and InputFiles contain the details of the input tree, in remote execution format. 38 // They should normally be constructed through the PackageTree function. 39 InputRoot digest.Digest 40 InputFiles map[digest.Digest][]byte 41 // OutputFiles is a list of output files requested (full paths). 42 OutputFiles []string 43 // OutputDirs is a list of output directories requested (full paths). 44 OutputDirs []string 45 // Docker image is a docker:// URL to the docker image in which execution will take place. 46 DockerImage string 47 // Timeout is the maximum execution time for the action. Note that it's not an overall timeout on 48 // the process, since there may be additional time for transferring files, waiting for a worker to 49 // become available, or other overhead. 50 // 51 // If 0, the server's default timeout is used. 52 Timeout time.Duration 53 // DoNotCache, if true, indicates that the result of this action should never be cached. It 54 // implies SkipCache. 55 DoNotCache bool 56 // SkipCache, if true, indicates that this action should be executed even if there is a copy of 57 // its result in the action cache that could be used instead. 58 SkipCache bool 59 } 60 61 // ExecuteAction performs all of the steps necessary to execute an action, including checking the 62 // cache if applicable, uploading necessary protos and inputs to the CAS, queueing the action, and 63 // waiting for the result. 64 // 65 // Execute may block for a long time while the action is in progress. Currently, two-phase 66 // queue-wait is not supported; the token necessary to query the job is not provided to users. 67 // 68 // This method MAY return a non-nil ActionResult along with a non-nil error if the action failed. 69 // The ActionResult may include, for example, the stdout/stderr digest from the attempt. 70 // 71 // ExecuteAction is a convenience method which wraps both PrepAction and ExecuteAndWait, along with 72 // other steps such as uploading extra inputs and parsing Operation protos. 73 func (c *Client) ExecuteAction(ctx context.Context, ac *Action) (*repb.ActionResult, error) { 74 log.V(1).Infof("Executing action: %v", ac.Args) 75 76 // Construct the action we're trying to run. 77 acDg, res, err := c.PrepAction(ctx, ac) 78 if err != nil { 79 return nil, err 80 } 81 // If we found a result in the cache, return that. 82 if res != nil { 83 return res, nil 84 } 85 86 // Upload any remaining inputs. 87 if err := c.WriteBlobs(ctx, ac.InputFiles); err != nil { 88 return nil, gerrors.WithMessage(err, "uploading input files to the CAS") 89 } 90 91 log.V(1).Info("Executing job") 92 res, err = c.executeJob(ctx, ac.SkipCache, acDg) 93 if err != nil { 94 return res, gerrors.WithMessage(err, "executing an action") 95 } 96 97 return res, nil 98 } 99 100 // CheckActionCache queries remote action cache, returning an ActionResult or nil if it doesn't exist. 101 func (c *Client) CheckActionCache(ctx context.Context, acDg *repb.Digest) (*repb.ActionResult, error) { 102 res, err := c.GetActionResult(ctx, &repb.GetActionResultRequest{ 103 InstanceName: c.InstanceName, 104 ActionDigest: acDg, 105 }) 106 switch st, _ := status.FromError(err); st.Code() { 107 case codes.OK: 108 return res, nil 109 case codes.NotFound: 110 return nil, nil 111 default: 112 return nil, gerrors.WithMessage(err, "checking the action cache") 113 } 114 } 115 116 func (c *Client) executeJob(ctx context.Context, skipCache bool, acDg *repb.Digest) (*repb.ActionResult, error) { 117 execReq := &repb.ExecuteRequest{ 118 InstanceName: c.InstanceName, 119 SkipCacheLookup: skipCache, 120 ActionDigest: acDg, 121 } 122 op, err := c.ExecuteAndWait(ctx, execReq) 123 if err != nil { 124 return nil, gerrors.WithMessage(err, "execution error") 125 } 126 127 switch r := op.Result.(type) { 128 case *oppb.Operation_Error: 129 return nil, StatusDetailedError(status.FromProto(r.Error)) 130 case *oppb.Operation_Response: 131 res := new(repb.ExecuteResponse) 132 if err := r.Response.UnmarshalTo(res); err != nil { 133 return nil, gerrors.WithMessage(err, "extracting ExecuteResponse from execution operation") 134 } 135 if st := status.FromProto(res.Status); st.Code() != codes.OK { 136 return res.Result, gerrors.WithMessage(StatusDetailedError(st), "job failed with error") 137 } 138 return res.Result, nil 139 default: 140 return nil, errors.New("unexpected operation result type") 141 } 142 } 143 144 // PrepAction constructs the Command and Action protos, checks the action cache if appropriate, and 145 // uploads the action if the cache was not checked or if there was no cache hit. If successful, 146 // PrepAction returns the digest of the Action and a (possibly nil) pointer to an ActionResult 147 // representing the result of the cache check, if any. 148 func (c *Client) PrepAction(ctx context.Context, ac *Action) (*repb.Digest, *repb.ActionResult, error) { 149 comDg, err := c.WriteProto(ctx, buildCommand(ac)) 150 if err != nil { 151 return nil, nil, gerrors.WithMessage(err, "storing Command proto") 152 } 153 154 reAc := &repb.Action{ 155 CommandDigest: comDg.ToProto(), 156 InputRootDigest: ac.InputRoot.ToProto(), 157 DoNotCache: ac.DoNotCache, 158 } 159 // Only set timeout if it's non-zero, because Timeout needs to be nil for the server to use a 160 // default. 161 if ac.Timeout != 0 { 162 reAc.Timeout = dpb.New(ac.Timeout) 163 } 164 165 acBlob, err := proto.Marshal(reAc) 166 if err != nil { 167 return nil, nil, gerrors.WithMessage(err, "marshalling Action proto") 168 } 169 acDg := digest.NewFromBlob(acBlob).ToProto() 170 171 // If the result is cacheable, check if it's already in the cache. 172 if !ac.DoNotCache || !ac.SkipCache { 173 log.V(1).Info("Checking cache") 174 res, err := c.CheckActionCache(ctx, acDg) 175 if err != nil { 176 return nil, nil, err 177 } 178 if res != nil { 179 return acDg, res, nil 180 } 181 } 182 183 // No cache hit, or we didn't check. Upload the action instead. 184 if _, err := c.WriteBlob(ctx, acBlob); err != nil { 185 return nil, nil, gerrors.WithMessage(err, "uploading action to the CAS") 186 } 187 188 return acDg, nil, nil 189 } 190 191 func buildCommand(ac *Action) *repb.Command { 192 cmd := &repb.Command{ 193 Arguments: ac.Args, 194 // Do not use OutputFiles and OutputDirs directly from the Action, as we need to sort them which 195 // implies modification. 196 OutputFiles: make([]string, len(ac.OutputFiles)), 197 OutputDirectories: make([]string, len(ac.OutputDirs)), 198 Platform: &repb.Platform{ 199 Properties: []*repb.Platform_Property{{Name: containerImagePropertyName, Value: ac.DockerImage}}, 200 }, 201 } 202 copy(cmd.OutputFiles, ac.OutputFiles) 203 copy(cmd.OutputDirectories, ac.OutputDirs) 204 sort.Strings(cmd.OutputFiles) 205 sort.Strings(cmd.OutputDirectories) 206 for name, val := range ac.EnvVars { 207 cmd.EnvironmentVariables = append(cmd.EnvironmentVariables, &repb.Command_EnvironmentVariable{Name: name, Value: val}) 208 } 209 sort.Slice(cmd.EnvironmentVariables, func(i, j int) bool { return cmd.EnvironmentVariables[i].Name < cmd.EnvironmentVariables[j].Name }) 210 return cmd 211 } 212 213 // ExecuteAndWait calls Execute on the underlying client and WaitExecution if necessary. It returns 214 // the completed operation or an error. 215 // 216 // The retry logic is complicated. Assuming retries are enabled, we want the retry to call 217 // WaitExecution if there's an Operation "in progress", and to call Execute otherwise. In practice 218 // that means: 219 // 1. If an error occurs before the first operation is returned, or after the final operation is 220 // returned (i.e. the one with op.Done==true), retry by calling Execute again. 221 // 2. Otherwise, retry by calling WaitExecution with the last operation name. 222 // 223 // In addition, we want the retrier to trigger based on certain operation statuses as well as on 224 // explicit errors. (The shouldRetry function knows which statuses.) We do this by mapping statuses, 225 // if present, to errors inside the closure and then throwing away such "fake" errors outside the 226 // closure (if we ran out of retries or if there was never a retrier enabled). The exception is 227 // deadline-exceeded statuses, which we never give to the retrier (and hence will always propagate 228 // directly to the caller). 229 func (c *Client) ExecuteAndWait(ctx context.Context, req *repb.ExecuteRequest) (op *oppb.Operation, err error) { 230 return c.ExecuteAndWaitProgress(ctx, req, nil) 231 } 232 233 // ExecuteAndWaitProgress calls Execute on the underlying client and WaitExecution if necessary. It returns 234 // the completed operation or an error. 235 // The supplied callback function is called for each message received to update the state of 236 // the remote action. 237 func (c *Client) ExecuteAndWaitProgress(ctx context.Context, req *repb.ExecuteRequest, progress func(metadata *repb.ExecuteOperationMetadata)) (op *oppb.Operation, err error) { 238 wait := false // Should we retry by calling WaitExecution instead of Execute? 239 opError := false // Are we propagating an Operation status as an error for the retrier's benefit? 240 lastOp := &oppb.Operation{} 241 closure := func(ctx context.Context) (e error) { 242 var res regrpc.Execution_ExecuteClient 243 if wait { 244 res, e = c.WaitExecution(ctx, &repb.WaitExecutionRequest{Name: lastOp.Name}) 245 } else { 246 res, e = c.Execute(ctx, req) 247 } 248 if e != nil { 249 return e 250 } 251 for { 252 op, e := res.Recv() 253 if e == io.EOF { 254 break 255 } 256 if e != nil { 257 return e 258 } 259 wait = !op.Done 260 lastOp = op 261 if progress != nil { 262 metadata := &repb.ExecuteOperationMetadata{} 263 if err := op.Metadata.UnmarshalTo(metadata); err == nil { 264 progress(metadata) 265 } 266 } 267 } 268 st := OperationStatus(lastOp) 269 if st != nil { 270 opError = true 271 if st.Code() == codes.DeadlineExceeded { 272 return nil 273 } 274 return st.Err() 275 } 276 return nil 277 } 278 err = c.Retrier.Do(ctx, func() error { return c.CallWithTimeout(ctx, "Execute", closure) }) 279 if err != nil && !opError { 280 if st, ok := status.FromError(err); ok { 281 err = StatusDetailedError(st) 282 } 283 return nil, err 284 } 285 286 // In the off chance that the server closes the stream immediately without returning any Operation 287 // values and without returning an error, then lastOp will never be modified. Alternatively 288 // the server could return an empty operation explicitly prior to closing the stream. Either 289 // case is a server error. 290 if proto.Equal(lastOp, &oppb.Operation{}) { 291 return nil, errors.New("unexpected server behaviour: an empty Operation was returned, or no operation was returned") 292 } 293 294 return lastOp, nil 295 } 296 297 // OperationStatus returns an operation error status, if it is present, and nil otherwise. 298 func OperationStatus(op *oppb.Operation) *status.Status { 299 var r *oppb.Operation_Response 300 var ok bool 301 if r, ok = op.Result.(*oppb.Operation_Response); !ok || r == nil { 302 return nil 303 } 304 respv2 := &repb.ExecuteResponse{} 305 if err := r.Response.UnmarshalTo(respv2); err != nil { 306 return nil 307 } 308 if s, ok := status.FromError(status.FromProto(respv2.Status).Err()); ok { 309 return s 310 } 311 return nil 312 }