agones.dev/agones@v1.54.0/cmd/extensions/main.go (about)

     1  // Copyright 2022 Google LLC All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Extensions for the Agones System
    16  package main
    17  
    18  import (
    19  	"context"
    20  	"fmt"
    21  	"io"
    22  	"os"
    23  	"path/filepath"
    24  	"strings"
    25  	"time"
    26  
    27  	"agones.dev/agones/pkg"
    28  	"agones.dev/agones/pkg/client/clientset/versioned"
    29  	"agones.dev/agones/pkg/client/informers/externalversions"
    30  	"agones.dev/agones/pkg/cloudproduct"
    31  	"agones.dev/agones/pkg/fleetautoscalers"
    32  	"agones.dev/agones/pkg/fleets"
    33  	"agones.dev/agones/pkg/gameserverallocations"
    34  	"agones.dev/agones/pkg/gameservers"
    35  	"agones.dev/agones/pkg/gameserversets"
    36  	"agones.dev/agones/pkg/metrics"
    37  	"agones.dev/agones/pkg/processor"
    38  	"agones.dev/agones/pkg/util/apiserver"
    39  	"agones.dev/agones/pkg/util/https"
    40  	"agones.dev/agones/pkg/util/httpserver"
    41  	"agones.dev/agones/pkg/util/runtime"
    42  	"agones.dev/agones/pkg/util/signals"
    43  	"agones.dev/agones/pkg/util/webhooks"
    44  	"github.com/heptiolabs/healthcheck"
    45  	"github.com/pkg/errors"
    46  	"github.com/sirupsen/logrus"
    47  	"github.com/spf13/pflag"
    48  	"github.com/spf13/viper"
    49  	"gopkg.in/natefinch/lumberjack.v2"
    50  	"k8s.io/client-go/informers"
    51  	"k8s.io/client-go/kubernetes"
    52  )
    53  
    54  const (
    55  	enableStackdriverMetricsFlag = "stackdriver-exporter"
    56  	stackdriverLabels            = "stackdriver-labels"
    57  	enablePrometheusMetricsFlag  = "prometheus-exporter"
    58  	projectIDFlag                = "gcp-project-id"
    59  	certFileFlag                 = "cert-file"
    60  	keyFileFlag                  = "key-file"
    61  	numWorkersFlag               = "num-workers"
    62  	logDirFlag                   = "log-dir"
    63  	logLevelFlag                 = "log-level"
    64  	logSizeLimitMBFlag           = "log-size-limit-mb"
    65  	allocationBatchWaitTime      = "allocation-batch-wait-time"
    66  	kubeconfigFlag               = "kubeconfig"
    67  	defaultResync                = 30 * time.Second
    68  	apiServerSustainedQPSFlag    = "api-server-qps"
    69  	apiServerBurstQPSFlag        = "api-server-qps-burst"
    70  	readinessShutdownDuration    = "readiness-shutdown-duration"
    71  	httpPort                     = "http-port"
    72  	webhookPort                  = "webhook-port"
    73  	processorGRPCAddress         = "processor-grpc-address"
    74  	processorGRPCPort            = "processor-grpc-port"
    75  	processorMaxBatchSize        = "processor-max-batch-size"
    76  )
    77  
    78  var (
    79  	podReady bool
    80  	logger   = runtime.NewLoggerWithSource("main")
    81  )
    82  
    83  func setupLogging(logDir string, logSizeLimitMB int) {
    84  	logFileName := filepath.Join(logDir, "agones-extensions-"+time.Now().Format("20060102_150405")+".log")
    85  
    86  	const maxLogSizeMB = 100
    87  	maxBackups := (logSizeLimitMB - maxLogSizeMB) / maxLogSizeMB
    88  	logger.WithField("filename", logFileName).WithField("numbackups", maxBackups).Info("logging to file")
    89  	logrus.SetOutput(
    90  		io.MultiWriter(
    91  			logrus.StandardLogger().Out,
    92  			&lumberjack.Logger{
    93  				Filename:   logFileName,
    94  				MaxSize:    maxLogSizeMB,
    95  				MaxBackups: maxBackups,
    96  			},
    97  		),
    98  	)
    99  }
   100  
   101  // main initializes the extensions service for Agones
   102  func main() {
   103  	ctx, cancelCtx := context.WithCancel(context.Background())
   104  	ctlConf := parseEnvFlags()
   105  
   106  	if ctlConf.LogDir != "" {
   107  		setupLogging(ctlConf.LogDir, ctlConf.LogSizeLimitMB)
   108  	}
   109  
   110  	logger.WithField("logLevel", ctlConf.LogLevel).Info("Setting LogLevel configuration")
   111  	level, err := logrus.ParseLevel(strings.ToLower(ctlConf.LogLevel))
   112  	if err == nil {
   113  		runtime.SetLevel(level)
   114  	} else {
   115  		logger.WithError(err).Info("Unable to parse loglevel, using the default loglevel - Info")
   116  		runtime.SetLevel(logrus.InfoLevel)
   117  	}
   118  
   119  	logger.WithField("version", pkg.Version).WithField("featureGates", runtime.EncodeFeatures()).
   120  		WithField("ctlConf", ctlConf).Info("starting extensions operator...")
   121  
   122  	// if the kubeconfig fails InClusterBuildConfig will try in cluster config
   123  	clientConf, err := runtime.InClusterBuildConfig(logger, ctlConf.KubeConfig)
   124  	if err != nil {
   125  		logger.WithError(err).Fatal("Could not create in cluster config")
   126  	}
   127  
   128  	clientConf.QPS = float32(ctlConf.APIServerSustainedQPS)
   129  	clientConf.Burst = ctlConf.APIServerBurstQPS
   130  
   131  	kubeClient, err := kubernetes.NewForConfig(clientConf)
   132  	if err != nil {
   133  		logger.WithError(err).Fatal("Could not create the kubernetes clientset")
   134  	}
   135  
   136  	agonesClient, err := versioned.NewForConfig(clientConf)
   137  	if err != nil {
   138  		logger.WithError(err).Fatal("Could not create the agones api clientset")
   139  	}
   140  
   141  	controllerHooks, err := cloudproduct.NewFromFlag(ctx, kubeClient)
   142  	if err != nil {
   143  		logger.WithError(err).Fatal("Could not initialize cloud product")
   144  	}
   145  	// https server and the items that share the Mux for routing
   146  	httpsServer := https.NewServer(ctlConf.CertFile, ctlConf.KeyFile, ctlConf.WebhookPort)
   147  	cancelTLS, err := httpsServer.WatchForCertificateChanges()
   148  	if err != nil {
   149  		logger.WithError(err).Fatal("Got an error while watching certificate changes")
   150  	}
   151  	defer cancelTLS()
   152  	wh := webhooks.NewWebHook(httpsServer.Mux)
   153  	api := apiserver.NewAPIServer(httpsServer.Mux)
   154  
   155  	agonesInformerFactory := externalversions.NewSharedInformerFactory(agonesClient, defaultResync)
   156  	kubeInformerFactory := informers.NewSharedInformerFactory(kubeClient, defaultResync)
   157  
   158  	server := &httpserver.Server{
   159  		Port:   ctlConf.HTTPPort,
   160  		Logger: logger,
   161  	}
   162  	var health healthcheck.Handler
   163  
   164  	metricsConf := metrics.Config{
   165  		Stackdriver:       ctlConf.Stackdriver,
   166  		PrometheusMetrics: ctlConf.PrometheusMetrics,
   167  		GCPProjectID:      ctlConf.GCPProjectID,
   168  		StackdriverLabels: ctlConf.StackdriverLabels,
   169  	}
   170  
   171  	health, closer := metrics.SetupMetrics(metricsConf, server)
   172  	defer closer()
   173  
   174  	podReady = true
   175  	health.AddReadinessCheck("agones-extensions", func() error {
   176  		if !podReady {
   177  			return errors.New("asked to shut down, failed readiness check")
   178  		}
   179  		return nil
   180  	})
   181  
   182  	signals.NewSigTermHandler(func() {
   183  		logger.Info("Pod shutdown has been requested, failing readiness check")
   184  		podReady = false
   185  		time.Sleep(ctlConf.ReadinessShutdownDuration)
   186  		cancelCtx()
   187  		logger.Infof("Readiness shutdown duration has passed, context cancelled")
   188  		time.Sleep(1 * time.Second) // allow a brief time for cleanup, but force exit if main doesn't
   189  		os.Exit(0)
   190  	})
   191  
   192  	// If we are using Prometheus only exporter we can make reporting more often,
   193  	// every 1 seconds, if we are using Stackdriver we would use 60 seconds reporting period,
   194  	// which is a requirements of Stackdriver, otherwise most of time series would be invalid for Stackdriver
   195  	metrics.SetReportingPeriod(ctlConf.PrometheusMetrics, ctlConf.Stackdriver)
   196  
   197  	server.Handle("/", health)
   198  
   199  	var gasExtensions *gameserverallocations.Extensions
   200  	if runtime.FeatureEnabled(runtime.FeatureProcessorAllocator) {
   201  		processorConfig := processor.Config{
   202  			ClientID:          os.Getenv("POD_NAME"),
   203  			ProcessorAddress:  fmt.Sprintf("%s:%d", ctlConf.processorGRPCAddress, ctlConf.processorGRPCPort),
   204  			MaxBatchSize:      ctlConf.processorMaxBatchSize,
   205  			AllocationTimeout: 30 * time.Second,
   206  			ReconnectInterval: 5 * time.Second,
   207  		}
   208  		processorClient := processor.NewClient(processorConfig, logger.WithField("component", "processor-client"))
   209  
   210  		go func() {
   211  			if err := processorClient.Run(ctx); err != nil {
   212  				if ctx.Err() != nil {
   213  					logger.WithError(err).Error("Processor client stopped due to context error")
   214  					return
   215  				}
   216  				logger.WithError(err).Error("Processor client failed, initiating graceful shutdown")
   217  			}
   218  		}()
   219  
   220  		gasExtensions = gameserverallocations.NewProcessorExtensions(api, kubeClient, processorClient)
   221  	} else {
   222  		gsCounter := gameservers.NewPerNodeCounter(kubeInformerFactory, agonesInformerFactory)
   223  
   224  		gasExtensions = gameserverallocations.NewExtensions(api, health, gsCounter, kubeClient, kubeInformerFactory,
   225  			agonesClient, agonesInformerFactory, 10*time.Second, 30*time.Second, ctlConf.AllocationBatchWaitTime)
   226  
   227  		kubeInformerFactory.Start(ctx.Done())
   228  		agonesInformerFactory.Start(ctx.Done())
   229  
   230  	}
   231  
   232  	gameservers.NewExtensions(controllerHooks, wh)
   233  	gameserversets.NewExtensions(controllerHooks, wh)
   234  	fleets.NewExtensions(controllerHooks, wh)
   235  	fleetautoscalers.NewExtensions(wh)
   236  
   237  	for _, r := range []runner{httpsServer, gasExtensions, server} {
   238  		go func(rr runner) {
   239  			if runErr := rr.Run(ctx, ctlConf.NumWorkers); runErr != nil {
   240  				logger.WithError(runErr).Fatalf("could not start runner: %T", rr)
   241  			}
   242  		}(r)
   243  	}
   244  
   245  	<-ctx.Done()
   246  	logger.Info("Shut down agones extensions")
   247  }
   248  
   249  func parseEnvFlags() config {
   250  	exec, err := os.Executable()
   251  	if err != nil {
   252  		logger.WithError(err).Fatal("Could not get executable path")
   253  	}
   254  
   255  	base := filepath.Dir(exec)
   256  	viper.SetDefault(certFileFlag, filepath.Join(base, "certs", "server.crt"))
   257  	viper.SetDefault(keyFileFlag, filepath.Join(base, "certs", "server.key"))
   258  	viper.SetDefault(allocationBatchWaitTime, 500*time.Millisecond)
   259  
   260  	viper.SetDefault(enablePrometheusMetricsFlag, true)
   261  	viper.SetDefault(enableStackdriverMetricsFlag, false)
   262  	viper.SetDefault(stackdriverLabels, "")
   263  
   264  	viper.SetDefault(projectIDFlag, "")
   265  	viper.SetDefault(numWorkersFlag, 64)
   266  	viper.SetDefault(apiServerSustainedQPSFlag, 100)
   267  	viper.SetDefault(apiServerBurstQPSFlag, 200)
   268  	viper.SetDefault(logDirFlag, "")
   269  	viper.SetDefault(logLevelFlag, "Info")
   270  	viper.SetDefault(logSizeLimitMBFlag, 10000) // 10 GB, will be split into 100 MB chunks
   271  	viper.SetDefault(httpPort, "8080")
   272  	viper.SetDefault(webhookPort, "8081")
   273  
   274  	viper.SetDefault(processorGRPCAddress, "agones-processor.agones-system.svc.cluster.local")
   275  	viper.SetDefault(processorGRPCPort, 9090)
   276  	viper.SetDefault(processorMaxBatchSize, 100)
   277  
   278  	pflag.String(keyFileFlag, viper.GetString(keyFileFlag), "Optional. Path to the key file")
   279  	pflag.String(certFileFlag, viper.GetString(certFileFlag), "Optional. Path to the crt file")
   280  	pflag.String(kubeconfigFlag, viper.GetString(kubeconfigFlag), "Optional. kubeconfig to run the controller out of the cluster. Only use it for debugging as webhook won't works.")
   281  
   282  	pflag.Bool(enablePrometheusMetricsFlag, viper.GetBool(enablePrometheusMetricsFlag), "Flag to activate metrics of Agones. Can also use PROMETHEUS_EXPORTER env variable.")
   283  	pflag.Bool(enableStackdriverMetricsFlag, viper.GetBool(enableStackdriverMetricsFlag), "Flag to activate stackdriver monitoring metrics for Agones. Can also use STACKDRIVER_EXPORTER env variable.")
   284  	pflag.String(stackdriverLabels, viper.GetString(stackdriverLabels), "A set of default labels to add to all stackdriver metrics generated. By default metadata are automatically added using Kubernetes API and GCP metadata enpoint.")
   285  
   286  	pflag.String(projectIDFlag, viper.GetString(projectIDFlag), "GCP ProjectID used for Stackdriver, if not specified ProjectID from Application Default Credentials would be used. Can also use GCP_PROJECT_ID env variable.")
   287  	pflag.Int32(numWorkersFlag, 64, "Number of controller workers per resource type")
   288  	pflag.Int32(apiServerSustainedQPSFlag, 100, "Maximum sustained queries per second to send to the API server")
   289  	pflag.Int32(apiServerBurstQPSFlag, 200, "Maximum burst queries per second to send to the API server")
   290  	pflag.String(httpPort, viper.GetString(httpPort), "Port for the HTTP server. Defaults to 8080, can also use HTTP_PORT env variable")
   291  	pflag.String(webhookPort, viper.GetString(webhookPort), "Port for the Webhook. Defaults to 8081, can also use WEBHOOK_PORT env variable")
   292  	pflag.String(logDirFlag, viper.GetString(logDirFlag), "If set, store logs in a given directory.")
   293  	pflag.Int32(logSizeLimitMBFlag, 1000, "Log file size limit in MB")
   294  	pflag.String(logLevelFlag, viper.GetString(logLevelFlag), "Agones Log level")
   295  	pflag.Duration(allocationBatchWaitTime, viper.GetDuration(allocationBatchWaitTime), "Flag to configure the waiting period between allocations batches")
   296  	pflag.Duration(readinessShutdownDuration, viper.GetDuration(readinessShutdownDuration), "Time in seconds for SIGTERM handler to sleep for.")
   297  
   298  	pflag.String(processorGRPCAddress, viper.GetString(processorGRPCAddress), "The gRPC address of the Agones Processor service")
   299  	pflag.Int32(processorGRPCPort, viper.GetInt32(processorGRPCPort), "The gRPC port of the Agones Processor service")
   300  	pflag.Int32(processorMaxBatchSize, viper.GetInt32(processorMaxBatchSize), "The maximum batch size to send to the Agones Processor service")
   301  
   302  	cloudproduct.BindFlags()
   303  	runtime.FeaturesBindFlags()
   304  	pflag.Parse()
   305  
   306  	viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_"))
   307  
   308  	runtime.Must(viper.BindEnv(keyFileFlag))
   309  	runtime.Must(viper.BindEnv(certFileFlag))
   310  	runtime.Must(viper.BindEnv(kubeconfigFlag))
   311  
   312  	runtime.Must(viper.BindEnv(enablePrometheusMetricsFlag))
   313  	runtime.Must(viper.BindEnv(enableStackdriverMetricsFlag))
   314  	runtime.Must(viper.BindEnv(stackdriverLabels))
   315  
   316  	runtime.Must(viper.BindEnv(projectIDFlag))
   317  	runtime.Must(viper.BindEnv(numWorkersFlag))
   318  	runtime.Must(viper.BindEnv(apiServerSustainedQPSFlag))
   319  	runtime.Must(viper.BindEnv(apiServerBurstQPSFlag))
   320  	runtime.Must(viper.BindEnv(logLevelFlag))
   321  	runtime.Must(viper.BindEnv(logDirFlag))
   322  	runtime.Must(viper.BindEnv(logSizeLimitMBFlag))
   323  	runtime.Must(viper.BindEnv(httpPort))
   324  	runtime.Must(viper.BindEnv(webhookPort))
   325  	runtime.Must(viper.BindEnv(allocationBatchWaitTime))
   326  	runtime.Must(viper.BindPFlags(pflag.CommandLine))
   327  	runtime.Must(viper.BindEnv(readinessShutdownDuration))
   328  	runtime.Must(cloudproduct.BindEnv())
   329  	runtime.Must(runtime.FeaturesBindEnv())
   330  	runtime.Must(runtime.ParseFeaturesFromEnv())
   331  
   332  	return config{
   333  		KeyFile:      viper.GetString(keyFileFlag),
   334  		CertFile:     viper.GetString(certFileFlag),
   335  		KubeConfig:   viper.GetString(kubeconfigFlag),
   336  		GCPProjectID: viper.GetString(projectIDFlag),
   337  
   338  		PrometheusMetrics: viper.GetBool(enablePrometheusMetricsFlag),
   339  		Stackdriver:       viper.GetBool(enableStackdriverMetricsFlag),
   340  		StackdriverLabels: viper.GetString(stackdriverLabels),
   341  
   342  		NumWorkers:                int(viper.GetInt32(numWorkersFlag)),
   343  		APIServerSustainedQPS:     int(viper.GetInt32(apiServerSustainedQPSFlag)),
   344  		APIServerBurstQPS:         int(viper.GetInt32(apiServerBurstQPSFlag)),
   345  		LogDir:                    viper.GetString(logDirFlag),
   346  		LogLevel:                  viper.GetString(logLevelFlag),
   347  		LogSizeLimitMB:            int(viper.GetInt32(logSizeLimitMBFlag)),
   348  		HTTPPort:                  viper.GetString(httpPort),
   349  		WebhookPort:               viper.GetString(webhookPort),
   350  		AllocationBatchWaitTime:   viper.GetDuration(allocationBatchWaitTime),
   351  		ReadinessShutdownDuration: viper.GetDuration(readinessShutdownDuration),
   352  
   353  		processorGRPCAddress:  viper.GetString(processorGRPCAddress),
   354  		processorGRPCPort:     int(viper.GetInt32(processorGRPCPort)),
   355  		processorMaxBatchSize: int(viper.GetInt32(processorMaxBatchSize)),
   356  	}
   357  }
   358  
   359  // config stores all required configuration to create a game server extensions.
   360  type config struct {
   361  	KeyFile    string
   362  	CertFile   string
   363  	KubeConfig string
   364  
   365  	PrometheusMetrics bool
   366  	Stackdriver       bool
   367  	StackdriverLabels string
   368  
   369  	GCPProjectID              string
   370  	NumWorkers                int
   371  	APIServerSustainedQPS     int
   372  	APIServerBurstQPS         int
   373  	LogDir                    string
   374  	LogLevel                  string
   375  	LogSizeLimitMB            int
   376  	HTTPPort                  string
   377  	WebhookPort               string
   378  	AllocationBatchWaitTime   time.Duration
   379  	ReadinessShutdownDuration time.Duration
   380  
   381  	processorGRPCAddress  string
   382  	processorGRPCPort     int
   383  	processorMaxBatchSize int
   384  }
   385  
   386  type runner interface {
   387  	Run(ctx context.Context, workers int) error
   388  }