github.com/inspektor-gadget/inspektor-gadget@v0.28.1/pkg/runtime/grpc/grpc-runtime.go (about)

     1  // Copyright 2023-2024 The Inspektor Gadget authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package grpcruntime
    16  
    17  import (
    18  	"context"
    19  	_ "embed"
    20  	"errors"
    21  	"fmt"
    22  	"io"
    23  	"net"
    24  	"net/url"
    25  	"strings"
    26  	"sync"
    27  	"time"
    28  
    29  	log "github.com/sirupsen/logrus"
    30  	"google.golang.org/grpc"
    31  	"google.golang.org/grpc/credentials/insecure"
    32  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    33  	"k8s.io/client-go/kubernetes"
    34  	"k8s.io/client-go/rest"
    35  
    36  	"github.com/inspektor-gadget/inspektor-gadget/internal/deployinfo"
    37  	"github.com/inspektor-gadget/inspektor-gadget/pkg/gadget-service/api"
    38  	"github.com/inspektor-gadget/inspektor-gadget/pkg/gadgets"
    39  	"github.com/inspektor-gadget/inspektor-gadget/pkg/logger"
    40  	"github.com/inspektor-gadget/inspektor-gadget/pkg/operators"
    41  	"github.com/inspektor-gadget/inspektor-gadget/pkg/params"
    42  	"github.com/inspektor-gadget/inspektor-gadget/pkg/runtime"
    43  )
    44  
// ConnectionMode selects how the runtime connects to the remote gadget service. The zero value is
// ConnectionModeDirect.
type ConnectionMode int

const (
	// ConnectionModeDirect will connect directly to the remote using the gRPC protocol; the remote side can either
	// be a tcp or a unix socket endpoint
	ConnectionModeDirect ConnectionMode = iota

	// ConnectionModeKubernetesProxy will connect to a gRPC endpoint through a kubernetes API server by first looking
	// up an appropriate target node using the kubernetes API, then using the port forward
	// endpoint of the Kubernetes API to forward the gRPC connection to the service listener (see gadgettracermgr).
	ConnectionModeKubernetesProxy
)
    57  
const (
	// Keys of the parameters understood by this runtime. ParamNode is a per-run runtime parameter (see
	// ParamDescs); ParamRemoteAddress and ParamConnectionTimeout are global parameters (see GlobalParamDescs).
	// NOTE(review): ParamConnectionMethod is not referenced in the visible part of this file - confirm it is
	// still used elsewhere.
	ParamNode              = "node"
	ParamRemoteAddress     = "remote-address"
	ParamConnectionMethod  = "connection-method"
	ParamConnectionTimeout = "connection-timeout"

	// ParamGadgetServiceTCPPort is only used in combination with KubernetesProxyConnectionMethodTCP
	ParamGadgetServiceTCPPort = "tcp-port"

	// ConnectTimeout is the time in seconds we wait for a connection to the remote to
	// succeed; it is the default value of the ParamConnectionTimeout parameter
	ConnectTimeout = 5

	// ResultTimeout is the time in seconds we wait for a result to return from the gadget
	// after sending a Stop command
	ResultTimeout = 30

	// ParamGadgetNamespace is the key of the global parameter naming the namespace Inspektor Gadget is
	// deployed in; DefaultGadgetNamespace is its default value
	ParamGadgetNamespace   string = "gadget-namespace"
	DefaultGadgetNamespace string = "gadget"
)
    78  
// Runtime is the gRPC runtime: it runs gadgets on remote targets that are reached either directly or through
// the Kubernetes API server, depending on connectionMode.
type Runtime struct {
	// info holds deploy information; when set, GetCatalog returns its Catalog
	info           *deployinfo.DeployInfo
	// defaultValues maps lower-cased value hints to default values (see SetDefaultValue / GetDefaultValue)
	defaultValues  map[string]string
	// globalParams holds the global runtime parameters; set by Init unless already initialized
	globalParams   *params.Params
	// restConfig is the Kubernetes client configuration used in ConnectionModeKubernetesProxy
	restConfig     *rest.Config
	// connectionMode selects how targets are looked up and dialed.
	// NOTE(review): not assigned in the visible part of this file - presumably set through an Option.
	connectionMode ConnectionMode
}
    86  
// RunClient is the receiving side of a stream of gadget events.
// NOTE(review): not referenced in the visible part of this file; presumably satisfied by the generated gRPC
// stream clients - confirm before relying on it.
type RunClient interface {
	// Recv returns the next gadget event or an error.
	Recv() (*api.GadgetEvent, error)
}
    90  
    91  // New instantiates the runtime and loads the locally stored gadget info. If no info is stored locally,
    92  // it will try to fetch one from one of the gadget nodes and store it locally. It will issue warnings on
    93  // failures.
    94  func New(options ...Option) *Runtime {
    95  	r := &Runtime{
    96  		defaultValues: map[string]string{},
    97  	}
    98  	for _, option := range options {
    99  		option(r)
   100  	}
   101  	return r
   102  }
   103  
   104  func (r *Runtime) Init(runtimeGlobalParams *params.Params) error {
   105  	if runtimeGlobalParams == nil {
   106  		runtimeGlobalParams = r.GlobalParamDescs().ToParams()
   107  	}
   108  
   109  	// overwrite only if not yet initialized; for gadgetctl, this initialization happens
   110  	// already in the main.go to specify a target address
   111  	if r.globalParams == nil {
   112  		r.globalParams = runtimeGlobalParams
   113  	}
   114  	return nil
   115  }
   116  
// SetRestConfig sets the Kubernetes client configuration that is used to look up gadget pods and to set up
// the port forwarding when running in ConnectionModeKubernetesProxy.
func (r *Runtime) SetRestConfig(config *rest.Config) {
	r.restConfig = config
}
   120  
// Close is a no-op and always returns nil.
func (r *Runtime) Close() error {
	return nil
}
   124  
   125  func checkForDuplicates(subject string) func(value string) error {
   126  	return func(value string) error {
   127  		values := strings.Split(value, ",")
   128  		valueMap := make(map[string]struct{})
   129  		for _, v := range values {
   130  			if _, ok := valueMap[v]; ok {
   131  				return fmt.Errorf("duplicate %s: %s", subject, v)
   132  			}
   133  			valueMap[v] = struct{}{}
   134  		}
   135  		return nil
   136  	}
   137  }
   138  
   139  func (r *Runtime) ParamDescs() params.ParamDescs {
   140  	p := params.ParamDescs{}
   141  	switch r.connectionMode {
   142  	case ConnectionModeDirect:
   143  		return p
   144  	case ConnectionModeKubernetesProxy:
   145  		p.Add(params.ParamDescs{
   146  			{
   147  				Key:         ParamNode,
   148  				Description: "Comma-separated list of nodes to run the gadget on",
   149  				Validator:   checkForDuplicates("node"),
   150  			},
   151  		}...)
   152  		return p
   153  	}
   154  	panic("invalid connection mode set for grpc-runtime")
   155  }
   156  
   157  func (r *Runtime) GlobalParamDescs() params.ParamDescs {
   158  	p := params.ParamDescs{
   159  		{
   160  			Key:          ParamConnectionTimeout,
   161  			Description:  "Maximum time to establish a connection to remote target in seconds",
   162  			DefaultValue: fmt.Sprintf("%d", ConnectTimeout),
   163  			TypeHint:     params.TypeUint16,
   164  		},
   165  	}
   166  	switch r.connectionMode {
   167  	case ConnectionModeDirect:
   168  		p.Add(params.ParamDescs{
   169  			{
   170  				Key:          ParamRemoteAddress,
   171  				Description:  "Comma-separated list of remote address (gRPC) to connect to",
   172  				DefaultValue: api.DefaultDaemonPath,
   173  				Validator:    checkForDuplicates("address"),
   174  			},
   175  		}...)
   176  		return p
   177  	case ConnectionModeKubernetesProxy:
   178  		p.Add(params.ParamDescs{
   179  			{
   180  				Key:          ParamGadgetServiceTCPPort,
   181  				Description:  "Port used to connect to the gadget service",
   182  				DefaultValue: fmt.Sprintf("%d", api.GadgetServicePort),
   183  				TypeHint:     params.TypeUint16,
   184  			},
   185  			{
   186  				Key:          ParamGadgetNamespace,
   187  				Description:  "Namespace where the Inspektor Gadget is deployed",
   188  				DefaultValue: DefaultGadgetNamespace,
   189  				TypeHint:     params.TypeString,
   190  			},
   191  		}...)
   192  		return p
   193  	}
   194  	panic("invalid connection mode set for grpc-runtime")
   195  }
   196  
// target identifies one remote endpoint a gadget is run against.
type target struct {
	// addressOrPod is the name of the gadget pod (ConnectionModeKubernetesProxy) or the address to dial
	// (ConnectionModeDirect: host[:port] of the remote address, or the whole URL for unix sockets)
	addressOrPod string
	// node is the name results and events of this target are reported under: the Kubernetes node name, the
	// hostname of the remote address, or "local" for unix sockets
	node         string
}
   201  
   202  func getGadgetPods(ctx context.Context, config *rest.Config, nodes []string, gadgetNamespace string) ([]target, error) {
   203  	client, err := kubernetes.NewForConfig(config)
   204  	if err != nil {
   205  		return nil, fmt.Errorf("setting up trace client: %w", err)
   206  	}
   207  
   208  	opts := metav1.ListOptions{LabelSelector: "k8s-app=gadget"}
   209  	pods, err := client.CoreV1().Pods(gadgetNamespace).List(ctx, opts)
   210  	if err != nil {
   211  		return nil, fmt.Errorf("getting pods: %w", err)
   212  	}
   213  
   214  	if len(pods.Items) == 0 {
   215  		return nil, fmt.Errorf("no gadget pods found in namespace %q. Is Inspektor Gadget deployed?", gadgetNamespace)
   216  	}
   217  
   218  	if len(nodes) == 0 {
   219  		res := make([]target, 0, len(pods.Items))
   220  
   221  		for _, pod := range pods.Items {
   222  			res = append(res, target{addressOrPod: pod.Name, node: pod.Spec.NodeName})
   223  		}
   224  
   225  		return res, nil
   226  	}
   227  
   228  	res := make([]target, 0, len(nodes))
   229  nodesLoop:
   230  	for _, node := range nodes {
   231  		for _, pod := range pods.Items {
   232  			if node == pod.Spec.NodeName {
   233  				res = append(res, target{addressOrPod: pod.Name, node: node})
   234  				continue nodesLoop
   235  			}
   236  		}
   237  		return nil, fmt.Errorf("node %q does not have a gadget pod", node)
   238  	}
   239  
   240  	return res, nil
   241  }
   242  
   243  func (r *Runtime) getTargets(ctx context.Context, params *params.Params) ([]target, error) {
   244  	switch r.connectionMode {
   245  	case ConnectionModeKubernetesProxy:
   246  		// Get nodes to run on
   247  		nodes := params.Get(ParamNode).AsStringSlice()
   248  		gadgetNamespace := r.globalParams.Get(ParamGadgetNamespace).AsString()
   249  		pods, err := getGadgetPods(ctx, r.restConfig, nodes, gadgetNamespace)
   250  		if err != nil {
   251  			return nil, fmt.Errorf("get gadget pods: %w", err)
   252  		}
   253  		if len(pods) == 0 {
   254  			return nil, fmt.Errorf("get gadget pods: Inspektor Gadget is not running on the requested node(s): %v", nodes)
   255  		}
   256  		return pods, nil
   257  	case ConnectionModeDirect:
   258  		inTargets := r.globalParams.Get(ParamRemoteAddress).AsStringSlice()
   259  		targets := make([]target, 0)
   260  		for _, t := range inTargets {
   261  			purl, err := url.Parse(t)
   262  			if err != nil {
   263  				return nil, fmt.Errorf("invalid remote address %q: %w", t, err)
   264  			}
   265  			tg := target{
   266  				addressOrPod: purl.Host,
   267  				node:         purl.Hostname(),
   268  			}
   269  			if purl.Scheme == "unix" {
   270  				// use the whole url in case of a unix socket and "local" as node
   271  				tg.addressOrPod = t
   272  				tg.node = "local"
   273  			}
   274  			targets = append(targets, tg)
   275  		}
   276  		return targets, nil
   277  	}
   278  	return nil, fmt.Errorf("unsupported connection mode")
   279  }
   280  
   281  func (r *Runtime) RunBuiltInGadget(gadgetCtx runtime.GadgetContext) (runtime.CombinedGadgetResult, error) {
   282  	paramMap := make(map[string]string)
   283  	gadgets.ParamsToMap(
   284  		paramMap,
   285  		gadgetCtx.GadgetParams(),
   286  		gadgetCtx.RuntimeParams(),
   287  		gadgetCtx.OperatorsParamCollection(),
   288  	)
   289  
   290  	gadgetCtx.Logger().Debugf("Params")
   291  	for k, v := range paramMap {
   292  		gadgetCtx.Logger().Debugf("- %s: %q", k, v)
   293  	}
   294  
   295  	targets, err := r.getTargets(gadgetCtx.Context(), gadgetCtx.RuntimeParams())
   296  	if err != nil {
   297  		return nil, fmt.Errorf("getting target nodes: %w", err)
   298  	}
   299  	return r.runBuiltInGadgetOnTargets(gadgetCtx, paramMap, targets)
   300  }
   301  
   302  func (r *Runtime) getConnToRandomTarget(ctx context.Context, runtimeParams *params.Params) (*grpc.ClientConn, error) {
   303  	targets, err := r.getTargets(ctx, runtimeParams)
   304  	if err != nil {
   305  		return nil, err
   306  	}
   307  	if len(targets) == 0 {
   308  		return nil, fmt.Errorf("no valid targets")
   309  	}
   310  	target := targets[0]
   311  	log.Debugf("using target %q (%q)", target.addressOrPod, target.node)
   312  
   313  	timeout := time.Second * time.Duration(r.globalParams.Get(ParamConnectionTimeout).AsUint16())
   314  	conn, err := r.dialContext(ctx, target, timeout)
   315  	if err != nil {
   316  		return nil, fmt.Errorf("dialing %q (%q): %w", target.addressOrPod, target.node, err)
   317  	}
   318  	return conn, nil
   319  }
   320  
   321  func (r *Runtime) runBuiltInGadgetOnTargets(
   322  	gadgetCtx runtime.GadgetContext,
   323  	paramMap map[string]string,
   324  	targets []target,
   325  ) (runtime.CombinedGadgetResult, error) {
   326  	gType := gadgetCtx.GadgetDesc().Type()
   327  
   328  	if gType == gadgets.TypeTraceIntervals {
   329  		gadgetCtx.Parser().EnableSnapshots(
   330  			gadgetCtx.Context(),
   331  			time.Duration(gadgetCtx.GadgetParams().Get(gadgets.ParamInterval).AsInt32())*time.Second,
   332  			2,
   333  		)
   334  		defer gadgetCtx.Parser().Flush()
   335  	}
   336  
   337  	if gType == gadgets.TypeOneShot {
   338  		gadgetCtx.Parser().EnableCombiner()
   339  		defer gadgetCtx.Parser().Flush()
   340  	}
   341  
   342  	results := make(runtime.CombinedGadgetResult)
   343  	var resultsLock sync.Mutex
   344  
   345  	wg := sync.WaitGroup{}
   346  	for _, t := range targets {
   347  		wg.Add(1)
   348  		go func(target target) {
   349  			gadgetCtx.Logger().Debugf("running gadget on node %q", target.node)
   350  			res, err := r.runBuiltInGadget(gadgetCtx, target, paramMap)
   351  			resultsLock.Lock()
   352  			results[target.node] = &runtime.GadgetResult{
   353  				Payload: res,
   354  				Error:   err,
   355  			}
   356  			resultsLock.Unlock()
   357  			wg.Done()
   358  		}(t)
   359  	}
   360  
   361  	wg.Wait()
   362  	return results, results.Err()
   363  }
   364  
   365  func (r *Runtime) dialContext(dialCtx context.Context, target target, timeout time.Duration) (*grpc.ClientConn, error) {
   366  	opts := []grpc.DialOption{
   367  		grpc.WithTransportCredentials(insecure.NewCredentials()),
   368  		grpc.WithBlock(),
   369  	}
   370  
   371  	// If we're in Kubernetes connection mode, we need a custom dialer
   372  	if r.connectionMode == ConnectionModeKubernetesProxy {
   373  		opts = append(opts, grpc.WithContextDialer(func(ctx context.Context, s string) (net.Conn, error) {
   374  			port := r.globalParams.Get(ParamGadgetServiceTCPPort).AsUint16()
   375  			gadgetNamespace := r.globalParams.Get(ParamGadgetNamespace).AsString()
   376  			return NewK8SPortFwdConn(ctx, r.restConfig, gadgetNamespace, target, port, timeout)
   377  		}))
   378  	} else {
   379  		newCtx, cancel := context.WithTimeout(dialCtx, timeout)
   380  		defer cancel()
   381  		dialCtx = newCtx
   382  	}
   383  
   384  	conn, err := grpc.DialContext(dialCtx, "passthrough:///"+target.addressOrPod, opts...)
   385  	if err != nil {
   386  		return nil, fmt.Errorf("dialing %q (%q): %w", target.addressOrPod, target.node, err)
   387  	}
   388  	return conn, nil
   389  }
   390  
   391  func (r *Runtime) runBuiltInGadget(gadgetCtx runtime.GadgetContext, target target, allParams map[string]string) ([]byte, error) {
   392  	// Notice that we cannot use gadgetCtx.Context() here, as that would - when cancelled by the user - also cancel the
   393  	// underlying gRPC connection. That would then lead to results not being received anymore (mostly for profile
   394  	// gadgets.)
   395  	connCtx, cancel := context.WithCancel(context.Background())
   396  	defer cancel()
   397  
   398  	timeout := time.Second * time.Duration(r.globalParams.Get(ParamConnectionTimeout).AsUint16())
   399  	dialCtx, cancelDial := context.WithTimeout(gadgetCtx.Context(), timeout)
   400  	defer cancelDial()
   401  
   402  	conn, err := r.dialContext(dialCtx, target, timeout)
   403  	if err != nil {
   404  		return nil, fmt.Errorf("dialing target on node %q: %w", target.node, err)
   405  	}
   406  	defer conn.Close()
   407  	client := api.NewBuiltInGadgetManagerClient(conn)
   408  
   409  	runRequest := &api.BuiltInGadgetRunRequest{
   410  		GadgetName:     gadgetCtx.GadgetDesc().Name(),
   411  		GadgetCategory: gadgetCtx.GadgetDesc().Category(),
   412  		Params:         allParams,
   413  		Args:           gadgetCtx.Args(),
   414  		Nodes:          nil,
   415  		FanOut:         false,
   416  		LogLevel:       uint32(gadgetCtx.Logger().GetLevel()),
   417  		Timeout:        int64(gadgetCtx.Timeout()),
   418  	}
   419  
   420  	runClient, err := client.RunBuiltInGadget(connCtx)
   421  	if err != nil && !errors.Is(err, context.Canceled) {
   422  		return nil, err
   423  	}
   424  
   425  	controlRequest := &api.BuiltInGadgetControlRequest{Event: &api.BuiltInGadgetControlRequest_RunRequest{RunRequest: runRequest}}
   426  	err = runClient.Send(controlRequest)
   427  	if err != nil {
   428  		return nil, err
   429  	}
   430  
   431  	parser := gadgetCtx.Parser()
   432  
   433  	jsonHandler := func([]byte) {}
   434  	jsonArrayHandler := func([]byte) {}
   435  
   436  	if parser != nil {
   437  		var enrichers []func(any) error
   438  		ev := gadgetCtx.GadgetDesc().EventPrototype()
   439  		if _, ok := ev.(operators.NodeSetter); ok {
   440  			enrichers = append(enrichers, func(ev any) error {
   441  				ev.(operators.NodeSetter).SetNode(target.node)
   442  				return nil
   443  			})
   444  		}
   445  
   446  		jsonHandler = parser.JSONHandlerFunc(enrichers...)
   447  		jsonArrayHandler = parser.JSONHandlerFuncArray(target.node, enrichers...)
   448  	}
   449  
   450  	doneChan := make(chan error)
   451  
   452  	var result []byte
   453  	expectedSeq := uint32(1)
   454  
   455  	go func() {
   456  		for {
   457  			ev, err := runClient.Recv()
   458  			if err != nil {
   459  				gadgetCtx.Logger().Debugf("%-20s | runClient returned with %v", target.node, err)
   460  				if !errors.Is(err, io.EOF) {
   461  					doneChan <- err
   462  					return
   463  				}
   464  				doneChan <- nil
   465  				return
   466  			}
   467  			switch ev.Type {
   468  			case api.EventTypeGadgetPayload:
   469  				if expectedSeq != ev.Seq {
   470  					gadgetCtx.Logger().Warnf("%-20s | expected seq %d, got %d, %d messages dropped", target.node, expectedSeq, ev.Seq, ev.Seq-expectedSeq)
   471  				}
   472  				expectedSeq = ev.Seq + 1
   473  				if len(ev.Payload) > 0 && ev.Payload[0] == '[' {
   474  					jsonArrayHandler(ev.Payload)
   475  					continue
   476  				}
   477  				jsonHandler(ev.Payload)
   478  			case api.EventTypeGadgetResult:
   479  				gadgetCtx.Logger().Debugf("%-20s | got result from server", target.node)
   480  				result = ev.Payload
   481  			case api.EventTypeGadgetJobID: // not needed right now
   482  			default:
   483  				if ev.Type >= 1<<api.EventLogShift {
   484  					gadgetCtx.Logger().Log(logger.Level(ev.Type>>api.EventLogShift), fmt.Sprintf("%-20s | %s", target.node, string(ev.Payload)))
   485  					continue
   486  				}
   487  				gadgetCtx.Logger().Warnf("unknown payload type %d: %s", ev.Type, ev.Payload)
   488  			}
   489  		}
   490  	}()
   491  
   492  	var runErr error
   493  	select {
   494  	case doneErr := <-doneChan:
   495  		gadgetCtx.Logger().Debugf("%-20s | done from server side (%v)", target.node, doneErr)
   496  		runErr = doneErr
   497  	case <-gadgetCtx.Context().Done():
   498  		// Send stop request
   499  		gadgetCtx.Logger().Debugf("%-20s | sending stop request", target.node)
   500  		controlRequest := &api.BuiltInGadgetControlRequest{Event: &api.BuiltInGadgetControlRequest_StopRequest{StopRequest: &api.BuiltInGadgetStopRequest{}}}
   501  		runClient.Send(controlRequest)
   502  
   503  		// Wait for done or timeout
   504  		select {
   505  		case doneErr := <-doneChan:
   506  			gadgetCtx.Logger().Debugf("%-20s | done after cancel request (%v)", target.node, doneErr)
   507  			runErr = doneErr
   508  		case <-time.After(ResultTimeout * time.Second):
   509  			return nil, fmt.Errorf("timed out while getting result")
   510  		}
   511  	}
   512  	return result, runErr
   513  }
   514  
   515  func (r *Runtime) GetCatalog() (*runtime.Catalog, error) {
   516  	if r.info == nil {
   517  		return nil, nil
   518  	}
   519  	return r.info.Catalog, nil
   520  }
   521  
   522  func (r *Runtime) SetDefaultValue(key params.ValueHint, value string) {
   523  	r.defaultValues[strings.ToLower(string(key))] = value
   524  }
   525  
   526  func (r *Runtime) GetDefaultValue(key params.ValueHint) (string, bool) {
   527  	val, ok := r.defaultValues[strings.ToLower(string(key))]
   528  	return val, ok
   529  }