github.com/kubewharf/katalyst-core@v0.5.3/pkg/custom-metric/store/remote/remote_store.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package remote 18 19 import ( 20 "bytes" 21 "context" 22 "encoding/json" 23 "fmt" 24 "io" 25 "net/http" 26 "sync" 27 "time" 28 29 "k8s.io/apimachinery/pkg/labels" 30 "k8s.io/apimachinery/pkg/runtime/schema" 31 "k8s.io/klog/v2" 32 33 katalystbase "github.com/kubewharf/katalyst-core/cmd/base" 34 metricconf "github.com/kubewharf/katalyst-core/pkg/config/metric" 35 "github.com/kubewharf/katalyst-core/pkg/custom-metric/store" 36 "github.com/kubewharf/katalyst-core/pkg/custom-metric/store/data" 37 "github.com/kubewharf/katalyst-core/pkg/custom-metric/store/data/types" 38 "github.com/kubewharf/katalyst-core/pkg/custom-metric/store/local" 39 "github.com/kubewharf/katalyst-core/pkg/metrics" 40 "github.com/kubewharf/katalyst-core/pkg/util/process" 41 ) 42 43 const MetricStoreNameRemoteMemory = "remote-memory-store" 44 45 const ( 46 metricsNameStoreRemoteGetCostFinish = "kcmas_get_finish" 47 metricsNameStoreRemoteGetCostSendRequests = "kcmas_get_requests" 48 metricsNameStoreRemoteGetMetricCount = "kcmas_get_metric_count" 49 metricsNameStoreRemoteGetItemCount = "kcmas_get_item_count" 50 51 metricsNameStoreRemoteSendRequest = "kcmas_send_request" 52 ) 53 54 // RemoteMemoryMetricStore implements MetricStore with multiple-nodes versioned 55 // in-memory storage, and each shard will be responsible for some splits of the 56 // total metrics. it will be used when the cluster becomes too large. 57 // 58 // RemoteMemoryMetricStore itself will be responsible for shard-splitting logic, 59 // and it should be a wrapper of LocalMemoryMetricStore to reuse its internalMetric structure. 60 type RemoteMemoryMetricStore struct { 61 ctx context.Context 62 tags []metrics.MetricTag 63 storeConf *metricconf.StoreConfiguration 64 genericConf *metricconf.GenericMetricConfiguration 65 66 client *http.Client 67 emitter metrics.MetricEmitter 68 69 sharding *ShardingController 70 } 71 72 var _ store.MetricStore = &RemoteMemoryMetricStore{} 73 74 func NewRemoteMemoryMetricStore(ctx context.Context, baseCtx *katalystbase.GenericContext, 75 genericConf *metricconf.GenericMetricConfiguration, storeConf *metricconf.StoreConfiguration, 76 ) (*RemoteMemoryMetricStore, error) { 77 client := process.NewDefaultHTTPClient() 78 79 if storeConf.StoreServerReplicaTotal <= 0 { 80 return nil, fmt.Errorf("total store server replica must be positive") 81 } 82 sharding, err := NewShardingController(ctx, baseCtx, storeConf) 83 if err != nil { 84 return nil, err 85 } 86 87 tags := []metrics.MetricTag{ 88 {Key: "name", Val: MetricStoreNameRemoteMemory}, 89 } 90 return &RemoteMemoryMetricStore{ 91 ctx: ctx, 92 tags: tags, 93 genericConf: genericConf, 94 storeConf: storeConf, 95 client: client, 96 emitter: baseCtx.EmitterPool.GetDefaultMetricsEmitter().WithTags("remote_store"), 97 sharding: sharding, 98 }, nil 99 } 100 101 func (r *RemoteMemoryMetricStore) Name() string { return MetricStoreNameRemoteMemory } 102 103 func (r *RemoteMemoryMetricStore) Start() error { 104 return r.sharding.Start() 105 } 106 107 func (r *RemoteMemoryMetricStore) Stop() error { 108 return r.sharding.Stop() 109 } 110 111 func (r *RemoteMemoryMetricStore) InsertMetric(seriesList []*data.MetricSeries) error { 112 start := time.Now() 113 114 contents, err := json.Marshal(seriesList) 115 if err != nil { 116 return err 117 } 118 119 newCtx, cancel := context.WithCancel(context.Background()) 120 defer func() { 121 cancel() 122 }() 123 requests, err := r.sharding.GetRequests(newCtx, local.ServingSetPath) 124 if err != nil { 125 return err 126 } 127 128 _, wCnt := r.sharding.GetRWCount() 129 klog.V(4).Infof("insert need to write %v among %v", wCnt, len(requests)) 130 131 success := 0 132 var responseLock sync.Mutex 133 // insert will always try to write into all store instances instead of write-counts 134 err = r.sendRequests(cancel, requests, len(requests), r.tags, 135 func(req *http.Request) { 136 req.Body = io.NopCloser(bytes.NewReader(contents)) 137 }, 138 func(_ io.ReadCloser) error { 139 responseLock.Lock() 140 success++ 141 responseLock.Unlock() 142 return nil 143 }, 144 ) 145 if err != nil { 146 return err 147 } 148 149 defer func() { 150 finished := time.Now() 151 klog.V(6).Infof("insert cost %v", finished.Sub(start)) 152 }() 153 154 if success < wCnt { 155 return fmt.Errorf("failed to perform quorum write actual %v expect %v", success, wCnt) 156 } 157 158 klog.V(4).Infof("successfully set with len %v", len(seriesList)) 159 return nil 160 } 161 162 func (r *RemoteMemoryMetricStore) GetMetric(_ context.Context, namespace, metricName, objName string, gr *schema.GroupResource, 163 objSelector, metricSelector labels.Selector, latest bool, 164 ) ([]types.Metric, error) { 165 start := time.Now() 166 tags := r.generateMetricsTags(metricName, objName) 167 168 newCtx, cancel := context.WithCancel(context.Background()) 169 defer func() { 170 cancel() 171 }() 172 requests, err := r.sharding.GetRequests(newCtx, local.ServingGetPath) 173 if err != nil { 174 return nil, err 175 } 176 177 rCnt, _ := r.sharding.GetRWCount() 178 klog.Infof("[remote-store] metric %v, obj %v, get need to read %v among %v", metricName, objName, rCnt, len(requests)) 179 180 var responseLock sync.Mutex 181 var metricLists [][]types.Metric 182 err = r.sendRequests(cancel, requests, rCnt, tags, 183 func(req *http.Request) { 184 values := req.URL.Query() 185 if len(namespace) > 0 { 186 values.Set(local.StoreGETParamNamespace, namespace) 187 } 188 if len(metricName) > 0 { 189 values.Set(local.StoreGETParamMetricName, metricName) 190 } 191 if metricSelector != nil && metricSelector.String() != "" { 192 values.Set(local.StoreGETParamMetricSelector, metricSelector.String()) 193 } 194 if gr != nil { 195 values.Set(local.StoreGETParamObjectGR, gr.String()) 196 } 197 if len(objName) > 0 { 198 values.Set(local.StoreGETParamObjectName, objName) 199 } 200 if objSelector != nil && objSelector.String() != "" { 201 values.Set(local.StoreGETParamMObjectSelector, objSelector.String()) 202 } 203 if latest { 204 values.Set(local.StoreGETParamLatest, fmt.Sprintf("%v", latest)) 205 } 206 207 req.URL.RawQuery = values.Encode() 208 }, 209 func(body io.ReadCloser) error { 210 metricList, err := types.DecodeMetricList(body, metricName) 211 if err != nil { 212 return fmt.Errorf("decode err: %v", err) 213 } 214 responseLock.Lock() 215 metricLists = append(metricLists, metricList) 216 responseLock.Unlock() 217 return nil 218 }, 219 ) 220 if err != nil { 221 return nil, err 222 } 223 224 defer func() { 225 finishCosts := time.Now().Sub(start).Microseconds() 226 klog.Infof("[remote-store] get-finish: metric %v, obj %v, costs %v(ms), resultCount %v", metricName, objName, finishCosts, len(metricLists)) 227 _ = r.emitter.StoreInt64(metricsNameStoreRemoteGetCostFinish, finishCosts, metrics.MetricTypeNameRaw, append(tags, 228 metrics.MetricTag{Key: "count", Val: fmt.Sprintf("%v", len(metricLists))})...) 229 }() 230 231 finishCosts := time.Now().Sub(start).Microseconds() 232 klog.Infof("[remote-store] get-requests: metric %v, obj %v, costs %v(ms)", metricName, objName, finishCosts) 233 _ = r.emitter.StoreInt64(metricsNameStoreRemoteGetCostSendRequests, finishCosts, metrics.MetricTypeNameRaw, tags...) 234 235 if len(metricLists) < rCnt { 236 return nil, fmt.Errorf("failed to perform quorum read actual %v expect %v", len(metricLists), rCnt) 237 } 238 239 res := data.MergeInternalMetricList(metricName, metricLists...) 240 itemLen := int64(0) 241 for _, r := range res { 242 itemLen += int64(r.Len()) 243 } 244 klog.Infof("[remote-store] metric %v, obj %v, successfully get with len %v", metricName, objName, len(res)) 245 _ = r.emitter.StoreInt64(metricsNameStoreRemoteGetMetricCount, int64(len(res)), metrics.MetricTypeNameRaw, tags...) 246 _ = r.emitter.StoreInt64(metricsNameStoreRemoteGetItemCount, itemLen, metrics.MetricTypeNameRaw, tags...) 247 return res, nil 248 } 249 250 func (r *RemoteMemoryMetricStore) ListMetricMeta(ctx context.Context, withObject bool) ([]types.MetricMeta, error) { 251 start := time.Now() 252 253 newCtx, cancel := context.WithCancel(context.Background()) 254 defer func() { 255 cancel() 256 }() 257 requests, err := r.sharding.GetRequests(newCtx, local.ServingListPath) 258 if err != nil { 259 return nil, err 260 } 261 262 rCnt, _ := r.sharding.GetRWCount() 263 klog.V(6).Infof("list with objects need to read %v among %v", rCnt, len(requests)) 264 265 var responseLock sync.Mutex 266 var metricMetaLists [][]types.MetricMeta 267 err = r.sendRequests(cancel, requests, rCnt, r.tags, 268 func(req *http.Request) { 269 values := req.URL.Query() 270 if withObject { 271 values.Set(local.StoreListParamObjected, "true") 272 } 273 req.URL.RawQuery = values.Encode() 274 }, 275 func(body io.ReadCloser) error { 276 metricMetaList, err := types.DecodeMetricMetaList(body) 277 if err != nil { 278 return fmt.Errorf("decode response err: %v", err) 279 } 280 responseLock.Lock() 281 metricMetaLists = append(metricMetaLists, metricMetaList) 282 responseLock.Unlock() 283 return nil 284 }, 285 ) 286 if err != nil { 287 return nil, err 288 } 289 290 defer func() { 291 finished := time.Now() 292 klog.V(6).Infof("list with objects cost %v", finished.Sub(start)) 293 }() 294 295 if len(metricMetaLists) < rCnt { 296 return nil, fmt.Errorf("failed to perform quorum read actual %v expect %v", len(metricMetaLists), rCnt) 297 } 298 299 res := types.PackMetricMetaList(metricMetaLists...) 300 klog.V(4).Infof("successfully list with len %v", len(res)) 301 return res, nil 302 } 303 304 // todo, currently we will not support any timeout configurations for http-requests 305 func (r *RemoteMemoryMetricStore) sendRequests(cancel func(), 306 reqs []*http.Request, readyCnt int, tags []metrics.MetricTag, 307 requestWrapF func(req *http.Request), responseWrapF func(body io.ReadCloser) error, 308 ) error { 309 if len(reqs) == 0 { 310 return nil 311 } 312 313 failChan := make(chan error, len(reqs)) 314 successChan := make(chan struct{}, len(reqs)) 315 316 wg := sync.WaitGroup{} 317 for i := range reqs { 318 wg.Add(1) 319 req := reqs[i] 320 321 go func() { 322 err := r.sendRequest(req, tags, requestWrapF, responseWrapF) 323 if err != nil { 324 failChan <- fmt.Errorf("%v send request err: %v", req.URL.String(), err) 325 } else { 326 successChan <- struct{}{} 327 } 328 wg.Done() 329 }() 330 } 331 332 fail, success := 0, 0 333 for { 334 select { 335 case err := <-failChan: 336 fail++ 337 klog.Errorf("failed to send request %v", err) 338 case <-successChan: 339 success++ 340 } 341 342 if success+fail >= len(reqs) { 343 // always try to cancel all requests before quiting 344 cancel() 345 klog.Infof("break sending requests, success %v, fail %v, total %v", success, fail, len(reqs)) 346 break 347 } 348 } 349 // wait for all goroutines to quit, and then close all channels to avoid memory leak 350 wg.Wait() 351 close(failChan) 352 close(successChan) 353 354 if success < readyCnt { 355 return fmt.Errorf("failed to get more than %v valid responses", readyCnt) 356 } 357 return nil 358 } 359 360 // sendRequest works as a uniformed function to construct http requests, as 361 // well as send this requests to the server side. 362 func (r *RemoteMemoryMetricStore) sendRequest(req *http.Request, tags []metrics.MetricTag, 363 requestWrapFunc func(req *http.Request), responseWrapF func(body io.ReadCloser) error, 364 ) error { 365 start := time.Now() 366 defer func() { 367 finishCosts := time.Now().Sub(start).Microseconds() 368 klog.Infof("[remote-store] send-request: url %+v, costs %v(ms)", req.URL, finishCosts) 369 _ = r.emitter.StoreInt64(metricsNameStoreRemoteSendRequest, finishCosts, metrics.MetricTypeNameRaw, tags...) 370 }() 371 372 requestWrapFunc(req) 373 374 klog.V(6).Infof("sendRequest %v", req.URL) 375 resp, err := r.client.Do(req) 376 defer func() { 377 if resp != nil && resp.Body != nil { 378 _ = resp.Body.Close() 379 } 380 }() 381 382 if err != nil { 383 return fmt.Errorf("send http requests err: %v", err) 384 } else if resp == nil { 385 return fmt.Errorf("response err: %v", "respnsonse nil") 386 } else if resp.Body == nil { 387 return fmt.Errorf("response err: %v", "body is nil") 388 } else if resp.StatusCode != http.StatusOK { 389 buf := bytes.NewBuffer([]byte{}) 390 _, _ = io.Copy(buf, resp.Body) 391 return fmt.Errorf("response err: status code %v, body: %v", resp.StatusCode, buf.String()) 392 } 393 394 if err := responseWrapF(resp.Body); err != nil { 395 return fmt.Errorf("failed to handle response %v", err) 396 } 397 return nil 398 } 399 400 // generateMetricsTags returns tags for the corresponding requests 401 func (r *RemoteMemoryMetricStore) generateMetricsTags(metricName, objName string) []metrics.MetricTag { 402 if metricName == "" { 403 metricName = "empty" 404 } 405 if objName == "" { 406 objName = "empty" 407 } 408 return append(r.tags, 409 metrics.MetricTag{Key: "metric_name", Val: metricName}, 410 metrics.MetricTag{Key: "object_name", Val: objName}, 411 ) 412 }