agones.dev/agones@v1.53.0/cmd/extensions/main.go (about) 1 // Copyright 2022 Google LLC All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Extensions for the Agones System 16 package main 17 18 import ( 19 "context" 20 "fmt" 21 "io" 22 "os" 23 "path/filepath" 24 "strings" 25 "time" 26 27 "agones.dev/agones/pkg" 28 "agones.dev/agones/pkg/client/clientset/versioned" 29 "agones.dev/agones/pkg/client/informers/externalversions" 30 "agones.dev/agones/pkg/cloudproduct" 31 "agones.dev/agones/pkg/fleetautoscalers" 32 "agones.dev/agones/pkg/fleets" 33 "agones.dev/agones/pkg/gameserverallocations" 34 "agones.dev/agones/pkg/gameservers" 35 "agones.dev/agones/pkg/gameserversets" 36 "agones.dev/agones/pkg/metrics" 37 "agones.dev/agones/pkg/processor" 38 "agones.dev/agones/pkg/util/apiserver" 39 "agones.dev/agones/pkg/util/https" 40 "agones.dev/agones/pkg/util/httpserver" 41 "agones.dev/agones/pkg/util/runtime" 42 "agones.dev/agones/pkg/util/signals" 43 "agones.dev/agones/pkg/util/webhooks" 44 "github.com/heptiolabs/healthcheck" 45 "github.com/pkg/errors" 46 "github.com/sirupsen/logrus" 47 "github.com/spf13/pflag" 48 "github.com/spf13/viper" 49 "gopkg.in/natefinch/lumberjack.v2" 50 "k8s.io/client-go/informers" 51 "k8s.io/client-go/kubernetes" 52 ) 53 54 const ( 55 enableStackdriverMetricsFlag = "stackdriver-exporter" 56 stackdriverLabels = "stackdriver-labels" 57 enablePrometheusMetricsFlag = "prometheus-exporter" 58 
projectIDFlag = "gcp-project-id" 59 certFileFlag = "cert-file" 60 keyFileFlag = "key-file" 61 numWorkersFlag = "num-workers" 62 logDirFlag = "log-dir" 63 logLevelFlag = "log-level" 64 logSizeLimitMBFlag = "log-size-limit-mb" 65 allocationBatchWaitTime = "allocation-batch-wait-time" 66 kubeconfigFlag = "kubeconfig" 67 defaultResync = 30 * time.Second 68 apiServerSustainedQPSFlag = "api-server-qps" 69 apiServerBurstQPSFlag = "api-server-qps-burst" 70 readinessShutdownDuration = "readiness-shutdown-duration" 71 httpPort = "http-port" 72 webhookPort = "webhook-port" 73 processorGRPCAddress = "processor-grpc-address" 74 processorGRPCPort = "processor-grpc-port" 75 processorMaxBatchSize = "processor-max-batch-size" 76 ) 77 78 var ( 79 podReady bool 80 logger = runtime.NewLoggerWithSource("main") 81 ) 82 83 func setupLogging(logDir string, logSizeLimitMB int) { 84 logFileName := filepath.Join(logDir, "agones-extensions-"+time.Now().Format("20060102_150405")+".log") 85 86 const maxLogSizeMB = 100 87 maxBackups := (logSizeLimitMB - maxLogSizeMB) / maxLogSizeMB 88 logger.WithField("filename", logFileName).WithField("numbackups", maxBackups).Info("logging to file") 89 logrus.SetOutput( 90 io.MultiWriter( 91 logrus.StandardLogger().Out, 92 &lumberjack.Logger{ 93 Filename: logFileName, 94 MaxSize: maxLogSizeMB, 95 MaxBackups: maxBackups, 96 }, 97 ), 98 ) 99 } 100 101 // main initializes the extensions service for Agones 102 func main() { 103 ctx, cancelCtx := context.WithCancel(context.Background()) 104 ctlConf := parseEnvFlags() 105 106 if ctlConf.LogDir != "" { 107 setupLogging(ctlConf.LogDir, ctlConf.LogSizeLimitMB) 108 } 109 110 logger.WithField("logLevel", ctlConf.LogLevel).Info("Setting LogLevel configuration") 111 level, err := logrus.ParseLevel(strings.ToLower(ctlConf.LogLevel)) 112 if err == nil { 113 runtime.SetLevel(level) 114 } else { 115 logger.WithError(err).Info("Unable to parse loglevel, using the default loglevel - Info") 116 runtime.SetLevel(logrus.InfoLevel) 
117 } 118 119 logger.WithField("version", pkg.Version).WithField("featureGates", runtime.EncodeFeatures()). 120 WithField("ctlConf", ctlConf).Info("starting extensions operator...") 121 122 // if the kubeconfig fails InClusterBuildConfig will try in cluster config 123 clientConf, err := runtime.InClusterBuildConfig(logger, ctlConf.KubeConfig) 124 if err != nil { 125 logger.WithError(err).Fatal("Could not create in cluster config") 126 } 127 128 clientConf.QPS = float32(ctlConf.APIServerSustainedQPS) 129 clientConf.Burst = ctlConf.APIServerBurstQPS 130 131 kubeClient, err := kubernetes.NewForConfig(clientConf) 132 if err != nil { 133 logger.WithError(err).Fatal("Could not create the kubernetes clientset") 134 } 135 136 agonesClient, err := versioned.NewForConfig(clientConf) 137 if err != nil { 138 logger.WithError(err).Fatal("Could not create the agones api clientset") 139 } 140 141 controllerHooks, err := cloudproduct.NewFromFlag(ctx, kubeClient) 142 if err != nil { 143 logger.WithError(err).Fatal("Could not initialize cloud product") 144 } 145 // https server and the items that share the Mux for routing 146 httpsServer := https.NewServer(ctlConf.CertFile, ctlConf.KeyFile, ctlConf.WebhookPort) 147 cancelTLS, err := httpsServer.WatchForCertificateChanges() 148 if err != nil { 149 logger.WithError(err).Fatal("Got an error while watching certificate changes") 150 } 151 defer cancelTLS() 152 wh := webhooks.NewWebHook(httpsServer.Mux) 153 api := apiserver.NewAPIServer(httpsServer.Mux) 154 155 agonesInformerFactory := externalversions.NewSharedInformerFactory(agonesClient, defaultResync) 156 kubeInformerFactory := informers.NewSharedInformerFactory(kubeClient, defaultResync) 157 158 server := &httpserver.Server{ 159 Port: ctlConf.HTTPPort, 160 Logger: logger, 161 } 162 var health healthcheck.Handler 163 164 metricsConf := metrics.Config{ 165 Stackdriver: ctlConf.Stackdriver, 166 PrometheusMetrics: ctlConf.PrometheusMetrics, 167 GCPProjectID: ctlConf.GCPProjectID, 168 
StackdriverLabels: ctlConf.StackdriverLabels, 169 } 170 171 health, closer := metrics.SetupMetrics(metricsConf, server) 172 defer closer() 173 174 podReady = true 175 health.AddReadinessCheck("agones-extensions", func() error { 176 if !podReady { 177 return errors.New("asked to shut down, failed readiness check") 178 } 179 return nil 180 }) 181 182 signals.NewSigTermHandler(func() { 183 logger.Info("Pod shutdown has been requested, failing readiness check") 184 podReady = false 185 time.Sleep(ctlConf.ReadinessShutdownDuration) 186 cancelCtx() 187 logger.Infof("Readiness shutdown duration has passed, context cancelled") 188 time.Sleep(1 * time.Second) // allow a brief time for cleanup, but force exit if main doesn't 189 os.Exit(0) 190 }) 191 192 // If we are using Prometheus only exporter we can make reporting more often, 193 // every 1 seconds, if we are using Stackdriver we would use 60 seconds reporting period, 194 // which is a requirements of Stackdriver, otherwise most of time series would be invalid for Stackdriver 195 metrics.SetReportingPeriod(ctlConf.PrometheusMetrics, ctlConf.Stackdriver) 196 197 server.Handle("/", health) 198 199 var gasExtensions *gameserverallocations.Extensions 200 if runtime.FeatureEnabled(runtime.FeatureProcessorAllocator) { 201 processorConfig := processor.Config{ 202 ClientID: os.Getenv("POD_NAME"), 203 ProcessorAddress: fmt.Sprintf("%s:%d", ctlConf.processorGRPCAddress, ctlConf.processorGRPCPort), 204 MaxBatchSize: ctlConf.processorMaxBatchSize, 205 AllocationTimeout: 30 * time.Second, 206 ReconnectInterval: 5 * time.Second, 207 } 208 processorClient := processor.NewClient(processorConfig, logger.WithField("component", "processor-client")) 209 210 go func() { 211 if err := processorClient.Run(ctx); err != nil { 212 if ctx.Err() != nil { 213 logger.WithError(err).Error("Processor client stopped due to context error") 214 return 215 } 216 logger.WithError(err).Error("Processor client failed, initiating graceful shutdown") 217 } 218 
}() 219 220 gasExtensions = gameserverallocations.NewProcessorExtensions(api, kubeClient, processorClient) 221 } else { 222 gsCounter := gameservers.NewPerNodeCounter(kubeInformerFactory, agonesInformerFactory) 223 224 gasExtensions = gameserverallocations.NewExtensions(api, health, gsCounter, kubeClient, kubeInformerFactory, 225 agonesClient, agonesInformerFactory, 10*time.Second, 30*time.Second, ctlConf.AllocationBatchWaitTime) 226 227 kubeInformerFactory.Start(ctx.Done()) 228 agonesInformerFactory.Start(ctx.Done()) 229 230 } 231 232 gameservers.NewExtensions(controllerHooks, wh) 233 gameserversets.NewExtensions(controllerHooks, wh) 234 fleets.NewExtensions(controllerHooks, wh) 235 fleetautoscalers.NewExtensions(wh) 236 237 for _, r := range []runner{httpsServer, gasExtensions, server} { 238 go func(rr runner) { 239 if runErr := rr.Run(ctx, ctlConf.NumWorkers); runErr != nil { 240 logger.WithError(runErr).Fatalf("could not start runner: %T", rr) 241 } 242 }(r) 243 } 244 245 <-ctx.Done() 246 logger.Info("Shut down agones extensions") 247 } 248 249 func parseEnvFlags() config { 250 exec, err := os.Executable() 251 if err != nil { 252 logger.WithError(err).Fatal("Could not get executable path") 253 } 254 255 base := filepath.Dir(exec) 256 viper.SetDefault(certFileFlag, filepath.Join(base, "certs", "server.crt")) 257 viper.SetDefault(keyFileFlag, filepath.Join(base, "certs", "server.key")) 258 viper.SetDefault(allocationBatchWaitTime, 500*time.Millisecond) 259 260 viper.SetDefault(enablePrometheusMetricsFlag, true) 261 viper.SetDefault(enableStackdriverMetricsFlag, false) 262 viper.SetDefault(stackdriverLabels, "") 263 264 viper.SetDefault(projectIDFlag, "") 265 viper.SetDefault(numWorkersFlag, 64) 266 viper.SetDefault(apiServerSustainedQPSFlag, 100) 267 viper.SetDefault(apiServerBurstQPSFlag, 200) 268 viper.SetDefault(logDirFlag, "") 269 viper.SetDefault(logLevelFlag, "Info") 270 viper.SetDefault(logSizeLimitMBFlag, 10000) // 10 GB, will be split into 100 MB chunks 
271 viper.SetDefault(httpPort, "8080") 272 viper.SetDefault(webhookPort, "8081") 273 274 viper.SetDefault(processorGRPCAddress, "agones-processor.agones-system.svc.cluster.local") 275 viper.SetDefault(processorGRPCPort, 9090) 276 viper.SetDefault(processorMaxBatchSize, 100) 277 278 pflag.String(keyFileFlag, viper.GetString(keyFileFlag), "Optional. Path to the key file") 279 pflag.String(certFileFlag, viper.GetString(certFileFlag), "Optional. Path to the crt file") 280 pflag.String(kubeconfigFlag, viper.GetString(kubeconfigFlag), "Optional. kubeconfig to run the controller out of the cluster. Only use it for debugging as webhook won't works.") 281 282 pflag.Bool(enablePrometheusMetricsFlag, viper.GetBool(enablePrometheusMetricsFlag), "Flag to activate metrics of Agones. Can also use PROMETHEUS_EXPORTER env variable.") 283 pflag.Bool(enableStackdriverMetricsFlag, viper.GetBool(enableStackdriverMetricsFlag), "Flag to activate stackdriver monitoring metrics for Agones. Can also use STACKDRIVER_EXPORTER env variable.") 284 pflag.String(stackdriverLabels, viper.GetString(stackdriverLabels), "A set of default labels to add to all stackdriver metrics generated. By default metadata are automatically added using Kubernetes API and GCP metadata enpoint.") 285 286 pflag.String(projectIDFlag, viper.GetString(projectIDFlag), "GCP ProjectID used for Stackdriver, if not specified ProjectID from Application Default Credentials would be used. Can also use GCP_PROJECT_ID env variable.") 287 pflag.Int32(numWorkersFlag, 64, "Number of controller workers per resource type") 288 pflag.Int32(apiServerSustainedQPSFlag, 100, "Maximum sustained queries per second to send to the API server") 289 pflag.Int32(apiServerBurstQPSFlag, 200, "Maximum burst queries per second to send to the API server") 290 pflag.String(httpPort, viper.GetString(httpPort), "Port for the HTTP server. 
Defaults to 8080, can also use HTTP_PORT env variable") 291 pflag.String(webhookPort, viper.GetString(webhookPort), "Port for the Webhook. Defaults to 8081, can also use WEBHOOK_PORT env variable") 292 pflag.String(logDirFlag, viper.GetString(logDirFlag), "If set, store logs in a given directory.") 293 pflag.Int32(logSizeLimitMBFlag, 1000, "Log file size limit in MB") 294 pflag.String(logLevelFlag, viper.GetString(logLevelFlag), "Agones Log level") 295 pflag.Duration(allocationBatchWaitTime, viper.GetDuration(allocationBatchWaitTime), "Flag to configure the waiting period between allocations batches") 296 pflag.Duration(readinessShutdownDuration, viper.GetDuration(readinessShutdownDuration), "Time in seconds for SIGTERM handler to sleep for.") 297 298 pflag.String(processorGRPCAddress, viper.GetString(processorGRPCAddress), "The gRPC address of the Agones Processor service") 299 pflag.Int32(processorGRPCPort, viper.GetInt32(processorGRPCPort), "The gRPC port of the Agones Processor service") 300 pflag.Int32(processorMaxBatchSize, viper.GetInt32(processorMaxBatchSize), "The maximum batch size to send to the Agones Processor service") 301 302 cloudproduct.BindFlags() 303 runtime.FeaturesBindFlags() 304 pflag.Parse() 305 306 viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_")) 307 308 runtime.Must(viper.BindEnv(keyFileFlag)) 309 runtime.Must(viper.BindEnv(certFileFlag)) 310 runtime.Must(viper.BindEnv(kubeconfigFlag)) 311 312 runtime.Must(viper.BindEnv(enablePrometheusMetricsFlag)) 313 runtime.Must(viper.BindEnv(enableStackdriverMetricsFlag)) 314 runtime.Must(viper.BindEnv(stackdriverLabels)) 315 316 runtime.Must(viper.BindEnv(projectIDFlag)) 317 runtime.Must(viper.BindEnv(numWorkersFlag)) 318 runtime.Must(viper.BindEnv(apiServerSustainedQPSFlag)) 319 runtime.Must(viper.BindEnv(apiServerBurstQPSFlag)) 320 runtime.Must(viper.BindEnv(logLevelFlag)) 321 runtime.Must(viper.BindEnv(logDirFlag)) 322 runtime.Must(viper.BindEnv(logSizeLimitMBFlag)) 323 
runtime.Must(viper.BindEnv(httpPort)) 324 runtime.Must(viper.BindEnv(webhookPort)) 325 runtime.Must(viper.BindEnv(allocationBatchWaitTime)) 326 runtime.Must(viper.BindPFlags(pflag.CommandLine)) 327 runtime.Must(viper.BindEnv(readinessShutdownDuration)) 328 runtime.Must(cloudproduct.BindEnv()) 329 runtime.Must(runtime.FeaturesBindEnv()) 330 runtime.Must(runtime.ParseFeaturesFromEnv()) 331 332 return config{ 333 KeyFile: viper.GetString(keyFileFlag), 334 CertFile: viper.GetString(certFileFlag), 335 KubeConfig: viper.GetString(kubeconfigFlag), 336 GCPProjectID: viper.GetString(projectIDFlag), 337 338 PrometheusMetrics: viper.GetBool(enablePrometheusMetricsFlag), 339 Stackdriver: viper.GetBool(enableStackdriverMetricsFlag), 340 StackdriverLabels: viper.GetString(stackdriverLabels), 341 342 NumWorkers: int(viper.GetInt32(numWorkersFlag)), 343 APIServerSustainedQPS: int(viper.GetInt32(apiServerSustainedQPSFlag)), 344 APIServerBurstQPS: int(viper.GetInt32(apiServerBurstQPSFlag)), 345 LogDir: viper.GetString(logDirFlag), 346 LogLevel: viper.GetString(logLevelFlag), 347 LogSizeLimitMB: int(viper.GetInt32(logSizeLimitMBFlag)), 348 HTTPPort: viper.GetString(httpPort), 349 WebhookPort: viper.GetString(webhookPort), 350 AllocationBatchWaitTime: viper.GetDuration(allocationBatchWaitTime), 351 ReadinessShutdownDuration: viper.GetDuration(readinessShutdownDuration), 352 353 processorGRPCAddress: viper.GetString(processorGRPCAddress), 354 processorGRPCPort: int(viper.GetInt32(processorGRPCPort)), 355 processorMaxBatchSize: int(viper.GetInt32(processorMaxBatchSize)), 356 } 357 } 358 359 // config stores all required configuration to create a game server extensions. 
type config struct {
	// TLS material for the HTTPS (webhook) server and an optional kubeconfig
	// path used to build the Kubernetes client configuration.
	KeyFile    string
	CertFile   string
	KubeConfig string

	// Metrics exporter selection and the default Stackdriver labels.
	PrometheusMetrics bool
	Stackdriver       bool
	StackdriverLabels string

	// General runtime settings: API server client rate limits, worker count,
	// logging, listening ports and timing knobs.
	GCPProjectID              string
	NumWorkers                int
	APIServerSustainedQPS     int
	APIServerBurstQPS         int
	LogDir                    string
	LogLevel                  string
	LogSizeLimitMB            int
	HTTPPort                  string
	WebhookPort               string
	AllocationBatchWaitTime   time.Duration
	ReadinessShutdownDuration time.Duration

	// Connection settings for the Agones Processor gRPC service; read only
	// when the ProcessorAllocator feature gate is enabled.
	processorGRPCAddress  string
	processorGRPCPort     int
	processorMaxBatchSize int
}

// runner is the contract shared by the long-running components that main
// starts in their own goroutines. Run receives the root context and the
// configured number of workers, and returns an error if the component
// could not run.
type runner interface {
	Run(ctx context.Context, workers int) error
}