github.com/kubewharf/katalyst-core@v0.5.3/cmd/katalyst-agent/app/agent.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package app
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	"k8s.io/klog/v2"
    26  
    27  	katalystbase "github.com/kubewharf/katalyst-core/cmd/base"
    28  	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
    29  	"github.com/kubewharf/katalyst-core/pkg/client"
    30  	"github.com/kubewharf/katalyst-core/pkg/config"
    31  	"github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic/crd"
    32  	"github.com/kubewharf/katalyst-core/pkg/consts"
    33  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    34  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    35  	"github.com/kubewharf/katalyst-core/pkg/util/process"
    36  )
    37  
    38  const healthzNameLockingFileAcquired = "LockingFileReady"
    39  
    40  const (
    41  	metricsNameLockingFailed = "get_lock_failed"
    42  	metricsNameAgentStarted  = "agent_started"
    43  )
    44  
    45  // Run starts common and uniformed agent components here, and starts other
    46  // specific components in other separate repos (with common components as
    47  // dependencies)
    48  func Run(conf *config.Configuration, clientSet *client.GenericClientSet, genericOptions ...katalystbase.GenericOptions) error {
    49  	// Set up signals so that we handle the first shutdown signal gracefully.
    50  	ctx := process.SetupSignalHandler()
    51  
    52  	baseCtx, err := katalystbase.NewGenericContext(clientSet, "", nil, AgentsDisabledByDefault,
    53  		conf.GenericConfiguration, consts.KatalystComponentAgent, conf.DynamicAgentConfiguration)
    54  	if err != nil {
    55  		return err
    56  	}
    57  
    58  	genericCtx, err := agent.NewGenericContext(baseCtx, conf)
    59  	if err != nil {
    60  		return err
    61  	}
    62  
    63  	for _, genericOption := range genericOptions {
    64  		genericOption(genericCtx)
    65  	}
    66  
    67  	lock := acquireLock(genericCtx, conf)
    68  	defer func() {
    69  		// if the process panic in other place and the defer function isn't executed,
    70  		// OS will help to unlock. So the next process till get the lock successfully.
    71  		general.ReleaseUniqueLock(lock)
    72  
    73  		// wait async log sync to disk
    74  		time.Sleep(1 * time.Second)
    75  	}()
    76  
    77  	return startAgent(ctx, genericCtx, conf, GetAgentInitializers())
    78  }
    79  
    80  // startAgent is used to initialize and start each component in katalyst-agent
    81  func startAgent(ctx context.Context, genericCtx *agent.GenericContext,
    82  	conf *config.Configuration, agents map[string]AgentStarter,
    83  ) error {
    84  	componentMap := make(map[string]agent.Component)
    85  	monitorAgentStart(genericCtx)
    86  	for agentName, starter := range agents {
    87  		if !genericCtx.IsEnabled(agentName, conf.Agents) {
    88  			klog.Warningf("%q is disabled", agentName)
    89  			continue
    90  		}
    91  
    92  		klog.Infof("initializing %q", agentName)
    93  		needToRun, component, err := starter.Init(genericCtx, conf, starter.ExtraConf, agentName)
    94  		if err != nil {
    95  			klog.Errorf("Error initializing %q", agentName)
    96  			return err
    97  		} else if !needToRun {
    98  			klog.Warningf("skip to call running functions %q", agentName)
    99  			continue
   100  		}
   101  
   102  		componentMap[agentName] = component
   103  		klog.Infof("needToRun %q", agentName)
   104  	}
   105  
   106  	// initialize dynamic config first before components run.
   107  	err := genericCtx.InitializeConfig(ctx)
   108  	if err != nil {
   109  		return fmt.Errorf("initialize dynamic config failed: %v", err)
   110  	}
   111  
   112  	wg := sync.WaitGroup{}
   113  
   114  	// start generic ctx first
   115  	wg.Add(1)
   116  	go func() {
   117  		defer wg.Done()
   118  		genericCtx.Run(ctx)
   119  	}()
   120  
   121  	// watch auth configuration
   122  	err = genericCtx.MetaServer.AddConfigWatcher(crd.AuthConfigurationGVR)
   123  	if err != nil {
   124  		return fmt.Errorf("add authconfiguration watcher failed")
   125  	}
   126  
   127  	// start all component and make sure them can be stopped completely
   128  	for agentName, component := range componentMap {
   129  		wg.Add(1)
   130  		runnable := component
   131  		go func() {
   132  			defer wg.Done()
   133  			runnable.Run(ctx)
   134  			klog.Infof("component %q stopped", agentName)
   135  		}()
   136  
   137  		klog.Infof("started %q", agentName)
   138  	}
   139  
   140  	wg.Wait()
   141  	return nil
   142  }
   143  
   144  func monitorAgentStart(genericCtx *agent.GenericContext) {
   145  	_ = genericCtx.EmitterPool.GetDefaultMetricsEmitter().StoreInt64(metricsNameAgentStarted, 1, metrics.MetricTypeNameCount)
   146  }
   147  
   148  // acquireLock makes sure only one process can handle socket files;
   149  // any process that wants to enter main loop, should acquire file lock firstly.
   150  func acquireLock(genericCtx *agent.GenericContext, conf *config.Configuration) *general.Flock {
   151  	// register a not-ready state for lock-acquiring when we starts
   152  	general.RegisterHeartbeatCheck(healthzNameLockingFileAcquired, 0, general.HealthzCheckStateNotReady, 0)
   153  
   154  	// set a ready state for lock-acquiring when we acquire locking successfully
   155  	defer func() {
   156  		_ = general.UpdateHealthzState(healthzNameLockingFileAcquired, general.HealthzCheckStateReady, "")
   157  	}()
   158  
   159  	for {
   160  		lock, err := general.GetUniqueLock(conf.LockFileName)
   161  		if err != nil {
   162  			_ = genericCtx.EmitterPool.GetDefaultMetricsEmitter().StoreInt64(metricsNameLockingFailed, 1, metrics.MetricTypeNameRaw)
   163  			// if waiting is enabled, we will always wait until lock has been obtained successfully;
   164  			if conf.LockWaitingEnabled {
   165  				continue
   166  			}
   167  			panic(err)
   168  		} else {
   169  			return lock
   170  		}
   171  	}
   172  }