github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/resourcemanager/fetcher/plugin/endpoint.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package plugin 18 19 import ( 20 "context" 21 "fmt" 22 "sync" 23 "time" 24 25 "google.golang.org/grpc" 26 "google.golang.org/grpc/status" 27 "k8s.io/klog/v2" 28 29 "github.com/kubewharf/katalyst-api/pkg/protocol/reporterplugin/v1alpha1" 30 "github.com/kubewharf/katalyst-core/pkg/metrics" 31 "github.com/kubewharf/katalyst-core/pkg/util/process" 32 ) 33 34 const ( 35 dialRemoteEndpointTimeout = 10 * time.Second 36 getReportContentTimeout = 10 * time.Second 37 ) 38 39 // ListAndWatchCallback should be called when plugins report info update. 40 type ListAndWatchCallback func(string, *v1alpha1.GetReportContentResponse) 41 42 // Endpoint represents a single registered plugin. It is responsible 43 // for managing gRPC communications with the reporter plugin and caching reporter states. 44 type Endpoint interface { 45 // Run initializes a ListAndWatch steam which will send a signal to the success channel 46 // when it initializes successfully 47 Run(success chan<- bool) 48 // Stop will be call when this Endpoint was de-registered or some error happened in ListAndWatch 49 Stop() 50 // GetReportContent will call rpc GetReportContent to plugin directly 51 GetReportContent(c context.Context) (*v1alpha1.GetReportContentResponse, error) 52 // ListAndWatchReportContentCallback will be call when this Endpoint receives plugin ListAndWatch send 53 ListAndWatchReportContentCallback(string, *v1alpha1.GetReportContentResponse) 54 // GetCache get response cache of this Endpoint 55 GetCache() *v1alpha1.GetReportContentResponse 56 // IsStopped check this Endpoint whether be called stop function before 57 IsStopped() bool 58 // StopGracePeriodExpired check if this Endpoint has been stopped and exceeded the 59 // grace period since the stop timestamp 60 StopGracePeriodExpired() bool 61 } 62 63 // NewRemoteEndpoint creates a new Endpoint for the given reporter' plugin name. 64 // This is to be used during normal reporter' plugin registration. 65 func NewRemoteEndpoint(socketPath, pluginName string, cache *v1alpha1.GetReportContentResponse, 66 emitter metrics.MetricEmitter, callback ListAndWatchCallback, 67 ) (Endpoint, error) { 68 c, err := process.Dial(socketPath, dialRemoteEndpointTimeout) 69 if err != nil { 70 klog.Errorf("Can't create new Endpoint with path %s err %v", socketPath, err) 71 return nil, err 72 } 73 74 return &remoteEndpointImpl{ 75 client: v1alpha1.NewReporterPluginClient(c), 76 clientConn: c, 77 78 socketPath: socketPath, 79 pluginName: pluginName, 80 cache: cache, 81 emitter: emitter, 82 83 cb: callback, 84 StopControl: process.NewStopControl(time.Time{}), 85 }, nil 86 } 87 88 // NewStoppedRemoteEndpoint creates a new Endpoint for the given pluginName with stopTime set. 89 // This is to be used during Agent restart, before the actual reporter plugin re-registers. 90 func NewStoppedRemoteEndpoint(pluginName string, cache *v1alpha1.GetReportContentResponse) Endpoint { 91 return &remoteEndpointImpl{ 92 pluginName: pluginName, 93 cache: cache, 94 StopControl: process.NewStopControl(time.Now()), 95 } 96 } 97 98 type remoteEndpointImpl struct { 99 client v1alpha1.ReporterPluginClient 100 clientConn *grpc.ClientConn 101 102 cache *v1alpha1.GetReportContentResponse 103 socketPath string 104 pluginName string 105 emitter metrics.MetricEmitter 106 107 cb ListAndWatchCallback 108 109 mutex sync.Mutex 110 *process.StopControl 111 } 112 113 // Run initializes ListAndWatch gRPC call for the plugin and blocks 114 // on receiving ListAndWatch gRPC stream updates. Each stream-item 115 // for ListAndWatch contains a new list of report content. 116 // It then triggers the callback function to pass this item to the manager. 117 func (e *remoteEndpointImpl) Run(success chan<- bool) { 118 stream, err := e.client.ListAndWatchReportContent(context.Background(), &v1alpha1.Empty{}) 119 if err != nil { 120 s, _ := status.FromError(err) 121 _ = e.emitter.StoreInt64("reporter_plugin_lw_content_failed", 1, metrics.MetricTypeNameCount, []metrics.MetricTag{ 122 {Key: "code", Val: s.Code().String()}, 123 {Key: "plugin", Val: e.pluginName}, 124 }...) 125 klog.Errorf("ListAndWatch ended unexpectedly for reporter plugin %s with error %v", e.pluginName, err) 126 success <- false 127 return 128 } 129 130 success <- true 131 132 for { 133 response, err := stream.Recv() 134 if err != nil { 135 s, _ := status.FromError(err) 136 _ = e.emitter.StoreInt64("reporter_plugin_lw_recv_failed", 1, metrics.MetricTypeNameCount, []metrics.MetricTag{ 137 {Key: "code", Val: s.Code().String()}, 138 {Key: "plugin", Val: e.pluginName}, 139 }...) 140 klog.Errorf("ListAndWatch recv failed for reporter plugin %s with error %v", e.pluginName, err) 141 err := stream.CloseSend() 142 if err != nil { 143 s, _ := status.FromError(err) 144 _ = e.emitter.StoreInt64("reporter_plugin_lw_close_failed", 1, metrics.MetricTypeNameCount, []metrics.MetricTag{ 145 {Key: "code", Val: s.Code().String()}, 146 {Key: "plugin", Val: e.pluginName}, 147 }...) 148 klog.Errorf("ListAndWatch close send failed for reporter plugin %s with error %v", e.pluginName, err) 149 } 150 return 151 } 152 153 klog.V(2).Infof("content list pushed for reporter plugin %s", e.pluginName) 154 155 e.ListAndWatchReportContentCallback(e.pluginName, response) 156 } 157 } 158 159 // Stop close client connection and set stop timestamp 160 func (e *remoteEndpointImpl) Stop() { 161 e.mutex.Lock() 162 defer e.mutex.Unlock() 163 164 if e.clientConn != nil { 165 _ = e.clientConn.Close() 166 } 167 168 e.StopControl.Stop() 169 } 170 171 // GetReportContent get report content by rpc call directly and store to cache if it is successful 172 func (e *remoteEndpointImpl) GetReportContent(c context.Context) (*v1alpha1.GetReportContentResponse, error) { 173 if e.IsStopped() { 174 return nil, fmt.Errorf("endpoint %v has been stopped", e.pluginName) 175 } 176 177 ctx, cancel := context.WithTimeout(c, getReportContentTimeout) 178 defer cancel() 179 resp, err := e.client.GetReportContent(ctx, &v1alpha1.Empty{}) 180 if err == nil { 181 e.setCache(resp) 182 } 183 184 return resp, err 185 } 186 187 // ListAndWatchReportContentCallback store to cache first and then call callback function 188 func (e *remoteEndpointImpl) ListAndWatchReportContentCallback(pluginName string, response *v1alpha1.GetReportContentResponse) { 189 e.setCache(response) 190 191 e.cb(pluginName, response) 192 } 193 194 func (e *remoteEndpointImpl) GetCache() *v1alpha1.GetReportContentResponse { 195 e.mutex.Lock() 196 defer e.mutex.Unlock() 197 198 return e.cache 199 } 200 201 func (e *remoteEndpointImpl) setCache(cache *v1alpha1.GetReportContentResponse) { 202 e.mutex.Lock() 203 defer e.mutex.Unlock() 204 205 e.cache = cache 206 }