github.com/kubewharf/katalyst-core@v0.5.3/pkg/custom-metric/store/remote/remote_store.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package remote
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"encoding/json"
    23  	"fmt"
    24  	"io"
    25  	"net/http"
    26  	"sync"
    27  	"time"
    28  
    29  	"k8s.io/apimachinery/pkg/labels"
    30  	"k8s.io/apimachinery/pkg/runtime/schema"
    31  	"k8s.io/klog/v2"
    32  
    33  	katalystbase "github.com/kubewharf/katalyst-core/cmd/base"
    34  	metricconf "github.com/kubewharf/katalyst-core/pkg/config/metric"
    35  	"github.com/kubewharf/katalyst-core/pkg/custom-metric/store"
    36  	"github.com/kubewharf/katalyst-core/pkg/custom-metric/store/data"
    37  	"github.com/kubewharf/katalyst-core/pkg/custom-metric/store/data/types"
    38  	"github.com/kubewharf/katalyst-core/pkg/custom-metric/store/local"
    39  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    40  	"github.com/kubewharf/katalyst-core/pkg/util/process"
    41  )
    42  
    43  const MetricStoreNameRemoteMemory = "remote-memory-store"
    44  
    45  const (
    46  	metricsNameStoreRemoteGetCostFinish       = "kcmas_get_finish"
    47  	metricsNameStoreRemoteGetCostSendRequests = "kcmas_get_requests"
    48  	metricsNameStoreRemoteGetMetricCount      = "kcmas_get_metric_count"
    49  	metricsNameStoreRemoteGetItemCount        = "kcmas_get_item_count"
    50  
    51  	metricsNameStoreRemoteSendRequest = "kcmas_send_request"
    52  )
    53  
    54  // RemoteMemoryMetricStore implements MetricStore with multiple-nodes versioned
    55  // in-memory storage, and each shard will be responsible for some splits of the
    56  // total metrics. it will be used when the cluster becomes too large.
    57  //
    58  // RemoteMemoryMetricStore itself will be responsible for shard-splitting logic,
    59  // and it should be a wrapper of LocalMemoryMetricStore to reuse its internalMetric structure.
    60  type RemoteMemoryMetricStore struct {
    61  	ctx         context.Context
    62  	tags        []metrics.MetricTag
    63  	storeConf   *metricconf.StoreConfiguration
    64  	genericConf *metricconf.GenericMetricConfiguration
    65  
    66  	client  *http.Client
    67  	emitter metrics.MetricEmitter
    68  
    69  	sharding *ShardingController
    70  }
    71  
    72  var _ store.MetricStore = &RemoteMemoryMetricStore{}
    73  
    74  func NewRemoteMemoryMetricStore(ctx context.Context, baseCtx *katalystbase.GenericContext,
    75  	genericConf *metricconf.GenericMetricConfiguration, storeConf *metricconf.StoreConfiguration,
    76  ) (*RemoteMemoryMetricStore, error) {
    77  	client := process.NewDefaultHTTPClient()
    78  
    79  	if storeConf.StoreServerReplicaTotal <= 0 {
    80  		return nil, fmt.Errorf("total store server replica must be positive")
    81  	}
    82  	sharding, err := NewShardingController(ctx, baseCtx, storeConf)
    83  	if err != nil {
    84  		return nil, err
    85  	}
    86  
    87  	tags := []metrics.MetricTag{
    88  		{Key: "name", Val: MetricStoreNameRemoteMemory},
    89  	}
    90  	return &RemoteMemoryMetricStore{
    91  		ctx:         ctx,
    92  		tags:        tags,
    93  		genericConf: genericConf,
    94  		storeConf:   storeConf,
    95  		client:      client,
    96  		emitter:     baseCtx.EmitterPool.GetDefaultMetricsEmitter().WithTags("remote_store"),
    97  		sharding:    sharding,
    98  	}, nil
    99  }
   100  
   101  func (r *RemoteMemoryMetricStore) Name() string { return MetricStoreNameRemoteMemory }
   102  
// Start starts the underlying sharding controller and returns its error, if any.
func (r *RemoteMemoryMetricStore) Start() error {
	return r.sharding.Start()
}
   106  
// Stop stops the underlying sharding controller and returns its error, if any.
func (r *RemoteMemoryMetricStore) Stop() error {
	return r.sharding.Stop()
}
   110  
   111  func (r *RemoteMemoryMetricStore) InsertMetric(seriesList []*data.MetricSeries) error {
   112  	start := time.Now()
   113  
   114  	contents, err := json.Marshal(seriesList)
   115  	if err != nil {
   116  		return err
   117  	}
   118  
   119  	newCtx, cancel := context.WithCancel(context.Background())
   120  	defer func() {
   121  		cancel()
   122  	}()
   123  	requests, err := r.sharding.GetRequests(newCtx, local.ServingSetPath)
   124  	if err != nil {
   125  		return err
   126  	}
   127  
   128  	_, wCnt := r.sharding.GetRWCount()
   129  	klog.V(4).Infof("insert need to write %v among %v", wCnt, len(requests))
   130  
   131  	success := 0
   132  	var responseLock sync.Mutex
   133  	// insert will always try to write into all store instances instead of write-counts
   134  	err = r.sendRequests(cancel, requests, len(requests), r.tags,
   135  		func(req *http.Request) {
   136  			req.Body = io.NopCloser(bytes.NewReader(contents))
   137  		},
   138  		func(_ io.ReadCloser) error {
   139  			responseLock.Lock()
   140  			success++
   141  			responseLock.Unlock()
   142  			return nil
   143  		},
   144  	)
   145  	if err != nil {
   146  		return err
   147  	}
   148  
   149  	defer func() {
   150  		finished := time.Now()
   151  		klog.V(6).Infof("insert cost %v", finished.Sub(start))
   152  	}()
   153  
   154  	if success < wCnt {
   155  		return fmt.Errorf("failed to perform quorum write actual %v expect %v", success, wCnt)
   156  	}
   157  
   158  	klog.V(4).Infof("successfully set with len %v", len(seriesList))
   159  	return nil
   160  }
   161  
   162  func (r *RemoteMemoryMetricStore) GetMetric(_ context.Context, namespace, metricName, objName string, gr *schema.GroupResource,
   163  	objSelector, metricSelector labels.Selector, latest bool,
   164  ) ([]types.Metric, error) {
   165  	start := time.Now()
   166  	tags := r.generateMetricsTags(metricName, objName)
   167  
   168  	newCtx, cancel := context.WithCancel(context.Background())
   169  	defer func() {
   170  		cancel()
   171  	}()
   172  	requests, err := r.sharding.GetRequests(newCtx, local.ServingGetPath)
   173  	if err != nil {
   174  		return nil, err
   175  	}
   176  
   177  	rCnt, _ := r.sharding.GetRWCount()
   178  	klog.Infof("[remote-store] metric %v, obj %v, get need to read %v among %v", metricName, objName, rCnt, len(requests))
   179  
   180  	var responseLock sync.Mutex
   181  	var metricLists [][]types.Metric
   182  	err = r.sendRequests(cancel, requests, rCnt, tags,
   183  		func(req *http.Request) {
   184  			values := req.URL.Query()
   185  			if len(namespace) > 0 {
   186  				values.Set(local.StoreGETParamNamespace, namespace)
   187  			}
   188  			if len(metricName) > 0 {
   189  				values.Set(local.StoreGETParamMetricName, metricName)
   190  			}
   191  			if metricSelector != nil && metricSelector.String() != "" {
   192  				values.Set(local.StoreGETParamMetricSelector, metricSelector.String())
   193  			}
   194  			if gr != nil {
   195  				values.Set(local.StoreGETParamObjectGR, gr.String())
   196  			}
   197  			if len(objName) > 0 {
   198  				values.Set(local.StoreGETParamObjectName, objName)
   199  			}
   200  			if objSelector != nil && objSelector.String() != "" {
   201  				values.Set(local.StoreGETParamMObjectSelector, objSelector.String())
   202  			}
   203  			if latest {
   204  				values.Set(local.StoreGETParamLatest, fmt.Sprintf("%v", latest))
   205  			}
   206  
   207  			req.URL.RawQuery = values.Encode()
   208  		},
   209  		func(body io.ReadCloser) error {
   210  			metricList, err := types.DecodeMetricList(body, metricName)
   211  			if err != nil {
   212  				return fmt.Errorf("decode err: %v", err)
   213  			}
   214  			responseLock.Lock()
   215  			metricLists = append(metricLists, metricList)
   216  			responseLock.Unlock()
   217  			return nil
   218  		},
   219  	)
   220  	if err != nil {
   221  		return nil, err
   222  	}
   223  
   224  	defer func() {
   225  		finishCosts := time.Now().Sub(start).Microseconds()
   226  		klog.Infof("[remote-store] get-finish: metric %v, obj %v, costs %v(ms), resultCount %v", metricName, objName, finishCosts, len(metricLists))
   227  		_ = r.emitter.StoreInt64(metricsNameStoreRemoteGetCostFinish, finishCosts, metrics.MetricTypeNameRaw, append(tags,
   228  			metrics.MetricTag{Key: "count", Val: fmt.Sprintf("%v", len(metricLists))})...)
   229  	}()
   230  
   231  	finishCosts := time.Now().Sub(start).Microseconds()
   232  	klog.Infof("[remote-store] get-requests: metric %v, obj %v, costs %v(ms)", metricName, objName, finishCosts)
   233  	_ = r.emitter.StoreInt64(metricsNameStoreRemoteGetCostSendRequests, finishCosts, metrics.MetricTypeNameRaw, tags...)
   234  
   235  	if len(metricLists) < rCnt {
   236  		return nil, fmt.Errorf("failed to perform quorum read actual %v expect %v", len(metricLists), rCnt)
   237  	}
   238  
   239  	res := data.MergeInternalMetricList(metricName, metricLists...)
   240  	itemLen := int64(0)
   241  	for _, r := range res {
   242  		itemLen += int64(r.Len())
   243  	}
   244  	klog.Infof("[remote-store] metric %v, obj %v, successfully get with len %v", metricName, objName, len(res))
   245  	_ = r.emitter.StoreInt64(metricsNameStoreRemoteGetMetricCount, int64(len(res)), metrics.MetricTypeNameRaw, tags...)
   246  	_ = r.emitter.StoreInt64(metricsNameStoreRemoteGetItemCount, itemLen, metrics.MetricTypeNameRaw, tags...)
   247  	return res, nil
   248  }
   249  
   250  func (r *RemoteMemoryMetricStore) ListMetricMeta(ctx context.Context, withObject bool) ([]types.MetricMeta, error) {
   251  	start := time.Now()
   252  
   253  	newCtx, cancel := context.WithCancel(context.Background())
   254  	defer func() {
   255  		cancel()
   256  	}()
   257  	requests, err := r.sharding.GetRequests(newCtx, local.ServingListPath)
   258  	if err != nil {
   259  		return nil, err
   260  	}
   261  
   262  	rCnt, _ := r.sharding.GetRWCount()
   263  	klog.V(6).Infof("list with objects need to read %v among %v", rCnt, len(requests))
   264  
   265  	var responseLock sync.Mutex
   266  	var metricMetaLists [][]types.MetricMeta
   267  	err = r.sendRequests(cancel, requests, rCnt, r.tags,
   268  		func(req *http.Request) {
   269  			values := req.URL.Query()
   270  			if withObject {
   271  				values.Set(local.StoreListParamObjected, "true")
   272  			}
   273  			req.URL.RawQuery = values.Encode()
   274  		},
   275  		func(body io.ReadCloser) error {
   276  			metricMetaList, err := types.DecodeMetricMetaList(body)
   277  			if err != nil {
   278  				return fmt.Errorf("decode response err: %v", err)
   279  			}
   280  			responseLock.Lock()
   281  			metricMetaLists = append(metricMetaLists, metricMetaList)
   282  			responseLock.Unlock()
   283  			return nil
   284  		},
   285  	)
   286  	if err != nil {
   287  		return nil, err
   288  	}
   289  
   290  	defer func() {
   291  		finished := time.Now()
   292  		klog.V(6).Infof("list with objects cost %v", finished.Sub(start))
   293  	}()
   294  
   295  	if len(metricMetaLists) < rCnt {
   296  		return nil, fmt.Errorf("failed to perform quorum read actual %v expect %v", len(metricMetaLists), rCnt)
   297  	}
   298  
   299  	res := types.PackMetricMetaList(metricMetaLists...)
   300  	klog.V(4).Infof("successfully list with len %v", len(res))
   301  	return res, nil
   302  }
   303  
   304  // todo, currently we will not support any timeout configurations for http-requests
   305  func (r *RemoteMemoryMetricStore) sendRequests(cancel func(),
   306  	reqs []*http.Request, readyCnt int, tags []metrics.MetricTag,
   307  	requestWrapF func(req *http.Request), responseWrapF func(body io.ReadCloser) error,
   308  ) error {
   309  	if len(reqs) == 0 {
   310  		return nil
   311  	}
   312  
   313  	failChan := make(chan error, len(reqs))
   314  	successChan := make(chan struct{}, len(reqs))
   315  
   316  	wg := sync.WaitGroup{}
   317  	for i := range reqs {
   318  		wg.Add(1)
   319  		req := reqs[i]
   320  
   321  		go func() {
   322  			err := r.sendRequest(req, tags, requestWrapF, responseWrapF)
   323  			if err != nil {
   324  				failChan <- fmt.Errorf("%v send request err: %v", req.URL.String(), err)
   325  			} else {
   326  				successChan <- struct{}{}
   327  			}
   328  			wg.Done()
   329  		}()
   330  	}
   331  
   332  	fail, success := 0, 0
   333  	for {
   334  		select {
   335  		case err := <-failChan:
   336  			fail++
   337  			klog.Errorf("failed to send request %v", err)
   338  		case <-successChan:
   339  			success++
   340  		}
   341  
   342  		if success+fail >= len(reqs) {
   343  			// always try to cancel all requests before quiting
   344  			cancel()
   345  			klog.Infof("break sending requests, success %v, fail %v, total %v", success, fail, len(reqs))
   346  			break
   347  		}
   348  	}
   349  	// wait for all goroutines to quit, and then close all channels to avoid memory leak
   350  	wg.Wait()
   351  	close(failChan)
   352  	close(successChan)
   353  
   354  	if success < readyCnt {
   355  		return fmt.Errorf("failed to get more than %v valid responses", readyCnt)
   356  	}
   357  	return nil
   358  }
   359  
   360  // sendRequest works as a uniformed function to construct http requests, as
   361  // well as send this requests to the server side.
   362  func (r *RemoteMemoryMetricStore) sendRequest(req *http.Request, tags []metrics.MetricTag,
   363  	requestWrapFunc func(req *http.Request), responseWrapF func(body io.ReadCloser) error,
   364  ) error {
   365  	start := time.Now()
   366  	defer func() {
   367  		finishCosts := time.Now().Sub(start).Microseconds()
   368  		klog.Infof("[remote-store] send-request: url %+v, costs %v(ms)", req.URL, finishCosts)
   369  		_ = r.emitter.StoreInt64(metricsNameStoreRemoteSendRequest, finishCosts, metrics.MetricTypeNameRaw, tags...)
   370  	}()
   371  
   372  	requestWrapFunc(req)
   373  
   374  	klog.V(6).Infof("sendRequest %v", req.URL)
   375  	resp, err := r.client.Do(req)
   376  	defer func() {
   377  		if resp != nil && resp.Body != nil {
   378  			_ = resp.Body.Close()
   379  		}
   380  	}()
   381  
   382  	if err != nil {
   383  		return fmt.Errorf("send http requests err: %v", err)
   384  	} else if resp == nil {
   385  		return fmt.Errorf("response err: %v", "respnsonse nil")
   386  	} else if resp.Body == nil {
   387  		return fmt.Errorf("response err: %v", "body is nil")
   388  	} else if resp.StatusCode != http.StatusOK {
   389  		buf := bytes.NewBuffer([]byte{})
   390  		_, _ = io.Copy(buf, resp.Body)
   391  		return fmt.Errorf("response err: status code %v, body: %v", resp.StatusCode, buf.String())
   392  	}
   393  
   394  	if err := responseWrapF(resp.Body); err != nil {
   395  		return fmt.Errorf("failed to handle response %v", err)
   396  	}
   397  	return nil
   398  }
   399  
   400  // generateMetricsTags returns tags for the corresponding requests
   401  func (r *RemoteMemoryMetricStore) generateMetricsTags(metricName, objName string) []metrics.MetricTag {
   402  	if metricName == "" {
   403  		metricName = "empty"
   404  	}
   405  	if objName == "" {
   406  		objName = "empty"
   407  	}
   408  	return append(r.tags,
   409  		metrics.MetricTag{Key: "metric_name", Val: metricName},
   410  		metrics.MetricTag{Key: "object_name", Val: objName},
   411  	)
   412  }