github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/resourcemanager/fetcher/plugin/endpoint.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package plugin
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	"google.golang.org/grpc"
    26  	"google.golang.org/grpc/status"
    27  	"k8s.io/klog/v2"
    28  
    29  	"github.com/kubewharf/katalyst-api/pkg/protocol/reporterplugin/v1alpha1"
    30  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    31  	"github.com/kubewharf/katalyst-core/pkg/util/process"
    32  )
    33  
    34  const (
    35  	dialRemoteEndpointTimeout = 10 * time.Second
    36  	getReportContentTimeout   = 10 * time.Second
    37  )
    38  
    39  // ListAndWatchCallback should be called when plugins report info update.
    40  type ListAndWatchCallback func(string, *v1alpha1.GetReportContentResponse)
    41  
    42  // Endpoint represents a single registered plugin. It is responsible
    43  // for managing gRPC communications with the reporter plugin and caching reporter states.
    44  type Endpoint interface {
    45  	// Run initializes a ListAndWatch steam which will send a signal to the success channel
    46  	// when it initializes successfully
    47  	Run(success chan<- bool)
    48  	// Stop will be call when this Endpoint was de-registered or some error happened in ListAndWatch
    49  	Stop()
    50  	// GetReportContent will call rpc GetReportContent to plugin directly
    51  	GetReportContent(c context.Context) (*v1alpha1.GetReportContentResponse, error)
    52  	// ListAndWatchReportContentCallback will be call when this Endpoint receives plugin ListAndWatch send
    53  	ListAndWatchReportContentCallback(string, *v1alpha1.GetReportContentResponse)
    54  	// GetCache get response cache of this Endpoint
    55  	GetCache() *v1alpha1.GetReportContentResponse
    56  	// IsStopped check this Endpoint whether be called stop function before
    57  	IsStopped() bool
    58  	// StopGracePeriodExpired check if this Endpoint has been stopped and exceeded the
    59  	// grace period since the stop timestamp
    60  	StopGracePeriodExpired() bool
    61  }
    62  
    63  // NewRemoteEndpoint creates a new Endpoint for the given reporter' plugin name.
    64  // This is to be used during normal reporter' plugin registration.
    65  func NewRemoteEndpoint(socketPath, pluginName string, cache *v1alpha1.GetReportContentResponse,
    66  	emitter metrics.MetricEmitter, callback ListAndWatchCallback,
    67  ) (Endpoint, error) {
    68  	c, err := process.Dial(socketPath, dialRemoteEndpointTimeout)
    69  	if err != nil {
    70  		klog.Errorf("Can't create new Endpoint with path %s err %v", socketPath, err)
    71  		return nil, err
    72  	}
    73  
    74  	return &remoteEndpointImpl{
    75  		client:     v1alpha1.NewReporterPluginClient(c),
    76  		clientConn: c,
    77  
    78  		socketPath: socketPath,
    79  		pluginName: pluginName,
    80  		cache:      cache,
    81  		emitter:    emitter,
    82  
    83  		cb:          callback,
    84  		StopControl: process.NewStopControl(time.Time{}),
    85  	}, nil
    86  }
    87  
    88  // NewStoppedRemoteEndpoint creates a new Endpoint for the given pluginName with stopTime set.
    89  // This is to be used during Agent restart, before the actual reporter plugin re-registers.
    90  func NewStoppedRemoteEndpoint(pluginName string, cache *v1alpha1.GetReportContentResponse) Endpoint {
    91  	return &remoteEndpointImpl{
    92  		pluginName:  pluginName,
    93  		cache:       cache,
    94  		StopControl: process.NewStopControl(time.Now()),
    95  	}
    96  }
    97  
    98  type remoteEndpointImpl struct {
    99  	client     v1alpha1.ReporterPluginClient
   100  	clientConn *grpc.ClientConn
   101  
   102  	cache      *v1alpha1.GetReportContentResponse
   103  	socketPath string
   104  	pluginName string
   105  	emitter    metrics.MetricEmitter
   106  
   107  	cb ListAndWatchCallback
   108  
   109  	mutex sync.Mutex
   110  	*process.StopControl
   111  }
   112  
   113  // Run initializes ListAndWatch gRPC call for the plugin and blocks
   114  // on receiving ListAndWatch gRPC stream updates. Each stream-item
   115  // for ListAndWatch contains a new list of report content.
   116  // It then triggers the callback function to pass this item to the manager.
   117  func (e *remoteEndpointImpl) Run(success chan<- bool) {
   118  	stream, err := e.client.ListAndWatchReportContent(context.Background(), &v1alpha1.Empty{})
   119  	if err != nil {
   120  		s, _ := status.FromError(err)
   121  		_ = e.emitter.StoreInt64("reporter_plugin_lw_content_failed", 1, metrics.MetricTypeNameCount, []metrics.MetricTag{
   122  			{Key: "code", Val: s.Code().String()},
   123  			{Key: "plugin", Val: e.pluginName},
   124  		}...)
   125  		klog.Errorf("ListAndWatch ended unexpectedly for reporter plugin %s with error %v", e.pluginName, err)
   126  		success <- false
   127  		return
   128  	}
   129  
   130  	success <- true
   131  
   132  	for {
   133  		response, err := stream.Recv()
   134  		if err != nil {
   135  			s, _ := status.FromError(err)
   136  			_ = e.emitter.StoreInt64("reporter_plugin_lw_recv_failed", 1, metrics.MetricTypeNameCount, []metrics.MetricTag{
   137  				{Key: "code", Val: s.Code().String()},
   138  				{Key: "plugin", Val: e.pluginName},
   139  			}...)
   140  			klog.Errorf("ListAndWatch recv failed for reporter plugin %s with error %v", e.pluginName, err)
   141  			err := stream.CloseSend()
   142  			if err != nil {
   143  				s, _ := status.FromError(err)
   144  				_ = e.emitter.StoreInt64("reporter_plugin_lw_close_failed", 1, metrics.MetricTypeNameCount, []metrics.MetricTag{
   145  					{Key: "code", Val: s.Code().String()},
   146  					{Key: "plugin", Val: e.pluginName},
   147  				}...)
   148  				klog.Errorf("ListAndWatch close send failed for reporter plugin %s with error %v", e.pluginName, err)
   149  			}
   150  			return
   151  		}
   152  
   153  		klog.V(2).Infof("content list pushed for reporter plugin %s", e.pluginName)
   154  
   155  		e.ListAndWatchReportContentCallback(e.pluginName, response)
   156  	}
   157  }
   158  
   159  // Stop close client connection and set stop timestamp
   160  func (e *remoteEndpointImpl) Stop() {
   161  	e.mutex.Lock()
   162  	defer e.mutex.Unlock()
   163  
   164  	if e.clientConn != nil {
   165  		_ = e.clientConn.Close()
   166  	}
   167  
   168  	e.StopControl.Stop()
   169  }
   170  
   171  // GetReportContent get report content by rpc call directly and store to cache if it is successful
   172  func (e *remoteEndpointImpl) GetReportContent(c context.Context) (*v1alpha1.GetReportContentResponse, error) {
   173  	if e.IsStopped() {
   174  		return nil, fmt.Errorf("endpoint %v has been stopped", e.pluginName)
   175  	}
   176  
   177  	ctx, cancel := context.WithTimeout(c, getReportContentTimeout)
   178  	defer cancel()
   179  	resp, err := e.client.GetReportContent(ctx, &v1alpha1.Empty{})
   180  	if err == nil {
   181  		e.setCache(resp)
   182  	}
   183  
   184  	return resp, err
   185  }
   186  
   187  // ListAndWatchReportContentCallback store to cache first and then call callback function
   188  func (e *remoteEndpointImpl) ListAndWatchReportContentCallback(pluginName string, response *v1alpha1.GetReportContentResponse) {
   189  	e.setCache(response)
   190  
   191  	e.cb(pluginName, response)
   192  }
   193  
   194  func (e *remoteEndpointImpl) GetCache() *v1alpha1.GetReportContentResponse {
   195  	e.mutex.Lock()
   196  	defer e.mutex.Unlock()
   197  
   198  	return e.cache
   199  }
   200  
   201  func (e *remoteEndpointImpl) setCache(cache *v1alpha1.GetReportContentResponse) {
   202  	e.mutex.Lock()
   203  	defer e.mutex.Unlock()
   204  
   205  	e.cache = cache
   206  }