github.com/kubevela/workflow@v0.6.0/cmd/main.go (about) 1 /* 2 Copyright 2022 The KubeVela Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package main 18 19 import ( 20 "context" 21 "errors" 22 goflag "flag" 23 "fmt" 24 "io" 25 "net/http" 26 "net/http/pprof" 27 "os" 28 "path/filepath" 29 "strconv" 30 "strings" 31 "time" 32 33 "github.com/crossplane/crossplane-runtime/pkg/event" 34 "github.com/kubevela/pkg/controller/sharding" 35 flag "github.com/spf13/pflag" 36 corev1 "k8s.io/api/core/v1" 37 kerrors "k8s.io/apimachinery/pkg/api/errors" 38 "k8s.io/apimachinery/pkg/runtime" 39 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 40 "k8s.io/apiserver/pkg/util/feature" 41 clientgoscheme "k8s.io/client-go/kubernetes/scheme" 42 "k8s.io/klog/v2" 43 "k8s.io/klog/v2/klogr" 44 ctrl "sigs.k8s.io/controller-runtime" 45 "sigs.k8s.io/controller-runtime/pkg/client" 46 "sigs.k8s.io/controller-runtime/pkg/healthz" 47 48 triggerv1alpha1 "github.com/kubevela/kube-trigger/api/v1alpha1" 49 velaclient "github.com/kubevela/pkg/controller/client" 50 "github.com/kubevela/pkg/multicluster" 51 52 "github.com/kubevela/workflow/api/v1alpha1" 53 "github.com/kubevela/workflow/controllers" 54 "github.com/kubevela/workflow/pkg/backup" 55 "github.com/kubevela/workflow/pkg/common" 56 "github.com/kubevela/workflow/pkg/cue/packages" 57 "github.com/kubevela/workflow/pkg/features" 58 "github.com/kubevela/workflow/pkg/monitor/watcher" 59 "github.com/kubevela/workflow/pkg/types" 60 "github.com/kubevela/workflow/pkg/utils" 61 "github.com/kubevela/workflow/pkg/webhook" 62 "github.com/kubevela/workflow/version" 63 //+kubebuilder:scaffold:imports 64 ) 65 66 var ( 67 scheme = runtime.NewScheme() 68 waitSecretTimeout = 90 * time.Second 69 waitSecretInterval = 2 * time.Second 70 ) 71 72 func init() { 73 utilruntime.Must(clientgoscheme.AddToScheme(scheme)) 74 75 utilruntime.Must(v1alpha1.AddToScheme(scheme)) 76 //+kubebuilder:scaffold:scheme 77 } 78 79 func main() { 80 var metricsAddr, logFilePath, probeAddr, pprofAddr, leaderElectionResourceLock, userAgent, certDir string 81 var backupStrategy, backupIgnoreStrategy, backupPersistType, groupByLabel, backupConfigSecretName, backupConfigSecretNamespace string 82 var enableLeaderElection, useWebhook, logDebug, backupCleanOnBackup bool 83 var qps float64 84 var logFileMaxSize uint64 85 var burst, webhookPort int 86 var leaseDuration, renewDeadline, retryPeriod, recycleDuration time.Duration 87 var controllerArgs controllers.Args 88 89 flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") 90 flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") 91 flag.StringVar(&logFilePath, "log-file-path", "", "The file to write logs to.") 92 flag.Uint64Var(&logFileMaxSize, "log-file-max-size", 1024, "Defines the maximum size a log file can grow to, Unit is megabytes.") 93 flag.BoolVar(&logDebug, "log-debug", false, "Enable debug logs for development purpose") 94 flag.BoolVar(&enableLeaderElection, "leader-elect", false, 95 "Enable leader election for controller manager. "+ 96 "Enabling this will ensure there is only one active controller manager.") 97 flag.StringVar(&leaderElectionResourceLock, "leader-election-resource-lock", "configmapsleases", "The resource lock to use for leader election") 98 flag.DurationVar(&leaseDuration, "leader-election-lease-duration", 15*time.Second, 99 "The duration that non-leader candidates will wait to force acquire leadership") 100 flag.DurationVar(&renewDeadline, "leader-election-renew-deadline", 10*time.Second, 101 "The duration that the acting controlplane will retry refreshing leadership before giving up") 102 flag.DurationVar(&retryPeriod, "leader-election-retry-period", 2*time.Second, 103 "The duration the LeaderElector clients should wait between tries of actions") 104 flag.DurationVar(&recycleDuration, "recycle-duration", 30*24*time.Hour, 105 "The recycle duration of a completed and is not the latest record in a set of workflowruns") 106 107 flag.BoolVar(&useWebhook, "use-webhook", false, "Enable Admission Webhook") 108 flag.StringVar(&certDir, "webhook-cert-dir", "/k8s-webhook-server/serving-certs", "Admission webhook cert/key dir.") 109 flag.IntVar(&webhookPort, "webhook-port", 9443, "admission webhook listen address") 110 flag.IntVar(&controllerArgs.ConcurrentReconciles, "concurrent-reconciles", 4, "concurrent-reconciles is the concurrent reconcile number of the controller. The default value is 4") 111 flag.BoolVar(&controllerArgs.IgnoreWorkflowWithoutControllerRequirement, "ignore-workflow-without-controller-requirement", false, "If true, workflow controller will not process the workflowrun without 'workflowrun.oam.dev/controller-version-require' annotation") 112 flag.Float64Var(&qps, "kube-api-qps", 50, "the qps for reconcile clients. Low qps may lead to low throughput. High qps may give stress to api-server. Raise this value if concurrent-reconciles is set to be high.") 113 flag.IntVar(&burst, "kube-api-burst", 100, "the burst for reconcile clients. Recommend setting it qps*2.") 114 flag.StringVar(&userAgent, "user-agent", "vela-workflow", "the user agent of the client.") 115 flag.StringVar(&pprofAddr, "pprof-addr", "", "The address for pprof to use while exporting profiling results. The default value is empty which means do not expose it. Set it to address like :6666 to expose it.") 116 flag.IntVar(&types.MaxWorkflowWaitBackoffTime, "max-workflow-wait-backoff-time", 60, "Set the max workflow wait backoff time, default is 60") 117 flag.IntVar(&types.MaxWorkflowFailedBackoffTime, "max-workflow-failed-backoff-time", 300, "Set the max workflow wait backoff time, default is 300") 118 flag.IntVar(&types.MaxWorkflowStepErrorRetryTimes, "max-workflow-step-error-retry-times", 10, "Set the max workflow step error retry times, default is 10") 119 flag.StringVar(&backupStrategy, "backup-strategy", "BackupFinishedRecord", "Set the strategy for backup workflow records, default is RemainLatestFailedRecord") 120 flag.StringVar(&backupIgnoreStrategy, "backup-ignore-strategy", "", "Set the strategy for ignore backup workflow records, default is IgnoreLatestFailedRecord") 121 flag.StringVar(&backupPersistType, "backup-persist-type", "", "Set the persist type for backup workflow records, default is empty") 122 flag.StringVar(&groupByLabel, "group-by-label", "pipeline.oam.dev/name", "Set the label for group by, default is pipeline.oam.dev/name") 123 flag.BoolVar(&backupCleanOnBackup, "backup-clean-on-backup", false, "Set the auto clean for backup workflow records, default is false") 124 flag.StringVar(&backupConfigSecretName, "backup-config-secret-name", "backup-config", "Set the secret name for backup workflow configs, default is backup-config") 125 flag.StringVar(&backupConfigSecretNamespace, "backup-config-secret-namespace", "vela-system", "Set the secret namespace for backup workflow configs, default is backup-config") 126 multicluster.AddClusterGatewayClientFlags(flag.CommandLine) 127 feature.DefaultMutableFeatureGate.AddFlag(flag.CommandLine) 128 sharding.AddControllerFlags(flag.CommandLine) 129 130 // setup logging 131 klog.InitFlags(nil) 132 flag.CommandLine.AddGoFlagSet(goflag.CommandLine) 133 flag.Parse() 134 if logDebug { 135 _ = flag.Set("v", strconv.Itoa(int(common.LogDebug))) 136 } 137 138 if pprofAddr != "" { 139 // Start pprof server if enabled 140 mux := http.NewServeMux() 141 mux.HandleFunc("/debug/pprof/", pprof.Index) 142 mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) 143 mux.HandleFunc("/debug/pprof/profile", pprof.Profile) 144 mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol) 145 mux.HandleFunc("/debug/pprof/trace", pprof.Trace) 146 pprofServer := http.Server{ 147 Addr: pprofAddr, 148 Handler: mux, 149 } 150 klog.InfoS("Starting debug HTTP server", "addr", pprofServer.Addr) 151 152 go func() { 153 go func() { 154 ctx := context.Background() 155 <-ctx.Done() 156 157 ctx, cancelFunc := context.WithTimeout(context.Background(), 60*time.Minute) 158 defer cancelFunc() 159 160 if err := pprofServer.Shutdown(ctx); err != nil { 161 klog.Error(err, "Failed to shutdown debug HTTP server") 162 } 163 }() 164 165 if err := pprofServer.ListenAndServe(); !errors.Is(http.ErrServerClosed, err) { 166 klog.Error(err, "Failed to start debug HTTP server") 167 panic(err) 168 } 169 }() 170 } 171 172 if logFilePath != "" { 173 _ = flag.Set("logtostderr", "false") 174 _ = flag.Set("log_file", logFilePath) 175 _ = flag.Set("log_file_max_size", strconv.FormatUint(logFileMaxSize, 10)) 176 } 177 178 ctrl.SetLogger(klogr.New()) 179 180 klog.InfoS("KubeVela Workflow information", "version", version.VelaVersion, "revision", version.GitRevision) 181 182 restConfig := ctrl.GetConfigOrDie() 183 restConfig.QPS = float32(qps) 184 restConfig.Burst = burst 185 klog.InfoS("Kubernetes Config Loaded", 186 "QPS", restConfig.QPS, 187 "Burst", restConfig.Burst, 188 ) 189 restConfig.UserAgent = userAgent 190 191 if feature.DefaultMutableFeatureGate.Enabled(features.EnableWatchEventListener) { 192 utilruntime.Must(triggerv1alpha1.AddToScheme(scheme)) 193 } 194 195 leaderElectionID := fmt.Sprintf("workflow-%s", strings.ToLower(strings.ReplaceAll(version.VelaVersion, ".", "-"))) 196 leaderElectionID += sharding.GetShardIDSuffix() 197 mgr, err := ctrl.NewManager(restConfig, ctrl.Options{ 198 Scheme: scheme, 199 MetricsBindAddress: metricsAddr, 200 Port: webhookPort, 201 HealthProbeBindAddress: probeAddr, 202 LeaderElection: enableLeaderElection, 203 LeaderElectionID: leaderElectionID, 204 LeaderElectionResourceLock: leaderElectionResourceLock, 205 LeaseDuration: &leaseDuration, 206 RenewDeadline: &renewDeadline, 207 RetryPeriod: &retryPeriod, 208 NewClient: velaclient.DefaultNewControllerClient, 209 NewCache: sharding.BuildCache(scheme, &v1alpha1.WorkflowRun{}), 210 CertDir: certDir, 211 }) 212 if err != nil { 213 klog.Error(err, "unable to start manager") 214 os.Exit(1) 215 } 216 217 kubeClient := mgr.GetClient() 218 if groupByLabel != "" { 219 if err := mgr.Add(utils.NewRecycleCronJob(kubeClient, recycleDuration, "0 0 * * *", groupByLabel)); err != nil { 220 klog.Error(err, "unable to start recycle cronjob") 221 os.Exit(1) 222 } 223 } 224 225 pd, err := packages.NewPackageDiscover(mgr.GetConfig()) 226 if err != nil { 227 klog.Error(err, "Failed to create CRD discovery for CUE package client") 228 if !packages.IsCUEParseErr(err) { 229 os.Exit(1) 230 } 231 } 232 controllerArgs.PackageDiscover = pd 233 234 if useWebhook { 235 klog.InfoS("Enable webhook", "server port", strconv.Itoa(webhookPort)) 236 webhook.Register(mgr, controllerArgs) 237 if err := waitWebhookSecretVolume(certDir, waitSecretTimeout, waitSecretInterval); err != nil { 238 klog.ErrorS(err, "Unable to get webhook secret") 239 os.Exit(1) 240 } 241 } 242 243 if err = (&controllers.WorkflowRunReconciler{ 244 Client: kubeClient, 245 Scheme: mgr.GetScheme(), 246 Recorder: event.NewAPIRecorder(mgr.GetEventRecorderFor("WorkflowRun")), 247 ControllerVersion: version.VelaVersion, 248 Args: controllerArgs, 249 }).SetupWithManager(mgr); err != nil { 250 klog.Error(err, "unable to create controller", "controller", "WorkflowRun") 251 os.Exit(1) 252 } 253 254 if feature.DefaultMutableFeatureGate.Enabled(features.EnableBackupWorkflowRecord) { 255 if backupPersistType == "" { 256 klog.Warning("Backup persist type is empty, workflow record won't be persisted") 257 } 258 configSecret := &corev1.Secret{} 259 reader := mgr.GetAPIReader() 260 if err := reader.Get(context.Background(), client.ObjectKey{ 261 Name: backupConfigSecretName, 262 Namespace: backupConfigSecretNamespace, 263 }, configSecret); err != nil && !kerrors.IsNotFound(err) { 264 klog.Error(err, "unable to find secret") 265 os.Exit(1) 266 } 267 persister, err := backup.NewPersister(configSecret.Data, backupPersistType) 268 if err != nil { 269 klog.Error(err, "unable to create persister") 270 os.Exit(1) 271 } 272 if err = (&controllers.BackupReconciler{ 273 Client: kubeClient, 274 Scheme: mgr.GetScheme(), 275 ControllerVersion: version.VelaVersion, 276 BackupArgs: controllers.BackupArgs{ 277 BackupStrategy: backupStrategy, 278 IgnoreStrategy: backupIgnoreStrategy, 279 CleanOnBackup: backupCleanOnBackup, 280 GroupByLabel: groupByLabel, 281 Persister: persister, 282 }, 283 Args: controllerArgs, 284 }).SetupWithManager(mgr); err != nil { 285 klog.Error(err, "unable to create controller", "controller", "backup") 286 os.Exit(1) 287 } 288 } 289 //+kubebuilder:scaffold:builder 290 if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { 291 klog.Error(err, "unable to set up health check") 292 os.Exit(1) 293 } 294 if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { 295 klog.Error(err, "unable to set up ready check") 296 os.Exit(1) 297 } 298 299 klog.Info("Start the vela workflow monitor") 300 informer, err := mgr.GetCache().GetInformer(context.Background(), &v1alpha1.WorkflowRun{}) 301 if err != nil { 302 klog.ErrorS(err, "Unable to get informer for application") 303 } 304 watcher.StartWorkflowRunMetricsWatcher(informer) 305 306 klog.Info("starting manager") 307 if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { 308 klog.Error(err, "problem running manager") 309 os.Exit(1) 310 } 311 312 if logFilePath != "" { 313 klog.Flush() 314 } 315 klog.Info("Safely stops Program...") 316 } 317 318 // waitWebhookSecretVolume waits for webhook secret ready to avoid mgr running crash 319 func waitWebhookSecretVolume(certDir string, timeout, interval time.Duration) error { 320 start := time.Now() 321 for { 322 time.Sleep(interval) 323 if time.Since(start) > timeout { 324 return fmt.Errorf("getting webhook secret timeout after %s", timeout.String()) 325 } 326 klog.InfoS("Wait webhook secret", "time consumed(second)", int64(time.Since(start).Seconds()), 327 "timeout(second)", int64(timeout.Seconds())) 328 if _, err := os.Stat(certDir); !os.IsNotExist(err) { 329 ready := func() bool { 330 f, err := os.Open(filepath.Clean(certDir)) 331 if err != nil { 332 return false 333 } 334 defer func() { 335 if err := f.Close(); err != nil { 336 klog.Error(err, "Failed to close file") 337 } 338 }() 339 // check if dir is empty 340 if _, err := f.Readdir(1); errors.Is(err, io.EOF) { 341 return false 342 } 343 // check if secret files are empty 344 err = filepath.Walk(certDir, func(path string, info os.FileInfo, err error) error { 345 // even Cert dir is created, cert files are still empty for a while 346 if info.Size() == 0 { 347 return errors.New("secret is not ready") 348 } 349 return nil 350 }) 351 if err == nil { 352 klog.InfoS("Webhook secret is ready", "time consumed(second)", 353 int64(time.Since(start).Seconds())) 354 return true 355 } 356 return false 357 }() 358 if ready { 359 return nil 360 } 361 } 362 } 363 }