github.com/kubevela/workflow@v0.6.0/cmd/main.go (about)

     1  /*
     2  Copyright 2022 The KubeVela Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package main
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	goflag "flag"
    23  	"fmt"
    24  	"io"
    25  	"net/http"
    26  	"net/http/pprof"
    27  	"os"
    28  	"path/filepath"
    29  	"strconv"
    30  	"strings"
    31  	"time"
    32  
    33  	"github.com/crossplane/crossplane-runtime/pkg/event"
    34  	"github.com/kubevela/pkg/controller/sharding"
    35  	flag "github.com/spf13/pflag"
    36  	corev1 "k8s.io/api/core/v1"
    37  	kerrors "k8s.io/apimachinery/pkg/api/errors"
    38  	"k8s.io/apimachinery/pkg/runtime"
    39  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    40  	"k8s.io/apiserver/pkg/util/feature"
    41  	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
    42  	"k8s.io/klog/v2"
    43  	"k8s.io/klog/v2/klogr"
    44  	ctrl "sigs.k8s.io/controller-runtime"
    45  	"sigs.k8s.io/controller-runtime/pkg/client"
    46  	"sigs.k8s.io/controller-runtime/pkg/healthz"
    47  
    48  	triggerv1alpha1 "github.com/kubevela/kube-trigger/api/v1alpha1"
    49  	velaclient "github.com/kubevela/pkg/controller/client"
    50  	"github.com/kubevela/pkg/multicluster"
    51  
    52  	"github.com/kubevela/workflow/api/v1alpha1"
    53  	"github.com/kubevela/workflow/controllers"
    54  	"github.com/kubevela/workflow/pkg/backup"
    55  	"github.com/kubevela/workflow/pkg/common"
    56  	"github.com/kubevela/workflow/pkg/cue/packages"
    57  	"github.com/kubevela/workflow/pkg/features"
    58  	"github.com/kubevela/workflow/pkg/monitor/watcher"
    59  	"github.com/kubevela/workflow/pkg/types"
    60  	"github.com/kubevela/workflow/pkg/utils"
    61  	"github.com/kubevela/workflow/pkg/webhook"
    62  	"github.com/kubevela/workflow/version"
    63  	//+kubebuilder:scaffold:imports
    64  )
    65  
    66  var (
    67  	scheme             = runtime.NewScheme()
    68  	waitSecretTimeout  = 90 * time.Second
    69  	waitSecretInterval = 2 * time.Second
    70  )
    71  
    72  func init() {
    73  	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
    74  
    75  	utilruntime.Must(v1alpha1.AddToScheme(scheme))
    76  	//+kubebuilder:scaffold:scheme
    77  }
    78  
    79  func main() {
    80  	var metricsAddr, logFilePath, probeAddr, pprofAddr, leaderElectionResourceLock, userAgent, certDir string
    81  	var backupStrategy, backupIgnoreStrategy, backupPersistType, groupByLabel, backupConfigSecretName, backupConfigSecretNamespace string
    82  	var enableLeaderElection, useWebhook, logDebug, backupCleanOnBackup bool
    83  	var qps float64
    84  	var logFileMaxSize uint64
    85  	var burst, webhookPort int
    86  	var leaseDuration, renewDeadline, retryPeriod, recycleDuration time.Duration
    87  	var controllerArgs controllers.Args
    88  
    89  	flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
    90  	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
    91  	flag.StringVar(&logFilePath, "log-file-path", "", "The file to write logs to.")
    92  	flag.Uint64Var(&logFileMaxSize, "log-file-max-size", 1024, "Defines the maximum size a log file can grow to, Unit is megabytes.")
    93  	flag.BoolVar(&logDebug, "log-debug", false, "Enable debug logs for development purpose")
    94  	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
    95  		"Enable leader election for controller manager. "+
    96  			"Enabling this will ensure there is only one active controller manager.")
    97  	flag.StringVar(&leaderElectionResourceLock, "leader-election-resource-lock", "configmapsleases", "The resource lock to use for leader election")
    98  	flag.DurationVar(&leaseDuration, "leader-election-lease-duration", 15*time.Second,
    99  		"The duration that non-leader candidates will wait to force acquire leadership")
   100  	flag.DurationVar(&renewDeadline, "leader-election-renew-deadline", 10*time.Second,
   101  		"The duration that the acting controlplane will retry refreshing leadership before giving up")
   102  	flag.DurationVar(&retryPeriod, "leader-election-retry-period", 2*time.Second,
   103  		"The duration the LeaderElector clients should wait between tries of actions")
   104  	flag.DurationVar(&recycleDuration, "recycle-duration", 30*24*time.Hour,
   105  		"The recycle duration of a completed and is not the latest record in a set of workflowruns")
   106  
   107  	flag.BoolVar(&useWebhook, "use-webhook", false, "Enable Admission Webhook")
   108  	flag.StringVar(&certDir, "webhook-cert-dir", "/k8s-webhook-server/serving-certs", "Admission webhook cert/key dir.")
   109  	flag.IntVar(&webhookPort, "webhook-port", 9443, "admission webhook listen address")
   110  	flag.IntVar(&controllerArgs.ConcurrentReconciles, "concurrent-reconciles", 4, "concurrent-reconciles is the concurrent reconcile number of the controller. The default value is 4")
   111  	flag.BoolVar(&controllerArgs.IgnoreWorkflowWithoutControllerRequirement, "ignore-workflow-without-controller-requirement", false, "If true, workflow controller will not process the workflowrun without 'workflowrun.oam.dev/controller-version-require' annotation")
   112  	flag.Float64Var(&qps, "kube-api-qps", 50, "the qps for reconcile clients. Low qps may lead to low throughput. High qps may give stress to api-server. Raise this value if concurrent-reconciles is set to be high.")
   113  	flag.IntVar(&burst, "kube-api-burst", 100, "the burst for reconcile clients. Recommend setting it qps*2.")
   114  	flag.StringVar(&userAgent, "user-agent", "vela-workflow", "the user agent of the client.")
   115  	flag.StringVar(&pprofAddr, "pprof-addr", "", "The address for pprof to use while exporting profiling results. The default value is empty which means do not expose it. Set it to address like :6666 to expose it.")
   116  	flag.IntVar(&types.MaxWorkflowWaitBackoffTime, "max-workflow-wait-backoff-time", 60, "Set the max workflow wait backoff time, default is 60")
   117  	flag.IntVar(&types.MaxWorkflowFailedBackoffTime, "max-workflow-failed-backoff-time", 300, "Set the max workflow wait backoff time, default is 300")
   118  	flag.IntVar(&types.MaxWorkflowStepErrorRetryTimes, "max-workflow-step-error-retry-times", 10, "Set the max workflow step error retry times, default is 10")
   119  	flag.StringVar(&backupStrategy, "backup-strategy", "BackupFinishedRecord", "Set the strategy for backup workflow records, default is RemainLatestFailedRecord")
   120  	flag.StringVar(&backupIgnoreStrategy, "backup-ignore-strategy", "", "Set the strategy for ignore backup workflow records, default is IgnoreLatestFailedRecord")
   121  	flag.StringVar(&backupPersistType, "backup-persist-type", "", "Set the persist type for backup workflow records, default is empty")
   122  	flag.StringVar(&groupByLabel, "group-by-label", "pipeline.oam.dev/name", "Set the label for group by, default is pipeline.oam.dev/name")
   123  	flag.BoolVar(&backupCleanOnBackup, "backup-clean-on-backup", false, "Set the auto clean for backup workflow records, default is false")
   124  	flag.StringVar(&backupConfigSecretName, "backup-config-secret-name", "backup-config", "Set the secret name for backup workflow configs, default is backup-config")
   125  	flag.StringVar(&backupConfigSecretNamespace, "backup-config-secret-namespace", "vela-system", "Set the secret namespace for backup workflow configs, default is backup-config")
   126  	multicluster.AddClusterGatewayClientFlags(flag.CommandLine)
   127  	feature.DefaultMutableFeatureGate.AddFlag(flag.CommandLine)
   128  	sharding.AddControllerFlags(flag.CommandLine)
   129  
   130  	// setup logging
   131  	klog.InitFlags(nil)
   132  	flag.CommandLine.AddGoFlagSet(goflag.CommandLine)
   133  	flag.Parse()
   134  	if logDebug {
   135  		_ = flag.Set("v", strconv.Itoa(int(common.LogDebug)))
   136  	}
   137  
   138  	if pprofAddr != "" {
   139  		// Start pprof server if enabled
   140  		mux := http.NewServeMux()
   141  		mux.HandleFunc("/debug/pprof/", pprof.Index)
   142  		mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
   143  		mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
   144  		mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
   145  		mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
   146  		pprofServer := http.Server{
   147  			Addr:    pprofAddr,
   148  			Handler: mux,
   149  		}
   150  		klog.InfoS("Starting debug HTTP server", "addr", pprofServer.Addr)
   151  
   152  		go func() {
   153  			go func() {
   154  				ctx := context.Background()
   155  				<-ctx.Done()
   156  
   157  				ctx, cancelFunc := context.WithTimeout(context.Background(), 60*time.Minute)
   158  				defer cancelFunc()
   159  
   160  				if err := pprofServer.Shutdown(ctx); err != nil {
   161  					klog.Error(err, "Failed to shutdown debug HTTP server")
   162  				}
   163  			}()
   164  
   165  			if err := pprofServer.ListenAndServe(); !errors.Is(http.ErrServerClosed, err) {
   166  				klog.Error(err, "Failed to start debug HTTP server")
   167  				panic(err)
   168  			}
   169  		}()
   170  	}
   171  
   172  	if logFilePath != "" {
   173  		_ = flag.Set("logtostderr", "false")
   174  		_ = flag.Set("log_file", logFilePath)
   175  		_ = flag.Set("log_file_max_size", strconv.FormatUint(logFileMaxSize, 10))
   176  	}
   177  
   178  	ctrl.SetLogger(klogr.New())
   179  
   180  	klog.InfoS("KubeVela Workflow information", "version", version.VelaVersion, "revision", version.GitRevision)
   181  
   182  	restConfig := ctrl.GetConfigOrDie()
   183  	restConfig.QPS = float32(qps)
   184  	restConfig.Burst = burst
   185  	klog.InfoS("Kubernetes Config Loaded",
   186  		"QPS", restConfig.QPS,
   187  		"Burst", restConfig.Burst,
   188  	)
   189  	restConfig.UserAgent = userAgent
   190  
   191  	if feature.DefaultMutableFeatureGate.Enabled(features.EnableWatchEventListener) {
   192  		utilruntime.Must(triggerv1alpha1.AddToScheme(scheme))
   193  	}
   194  
   195  	leaderElectionID := fmt.Sprintf("workflow-%s", strings.ToLower(strings.ReplaceAll(version.VelaVersion, ".", "-")))
   196  	leaderElectionID += sharding.GetShardIDSuffix()
   197  	mgr, err := ctrl.NewManager(restConfig, ctrl.Options{
   198  		Scheme:                     scheme,
   199  		MetricsBindAddress:         metricsAddr,
   200  		Port:                       webhookPort,
   201  		HealthProbeBindAddress:     probeAddr,
   202  		LeaderElection:             enableLeaderElection,
   203  		LeaderElectionID:           leaderElectionID,
   204  		LeaderElectionResourceLock: leaderElectionResourceLock,
   205  		LeaseDuration:              &leaseDuration,
   206  		RenewDeadline:              &renewDeadline,
   207  		RetryPeriod:                &retryPeriod,
   208  		NewClient:                  velaclient.DefaultNewControllerClient,
   209  		NewCache:                   sharding.BuildCache(scheme, &v1alpha1.WorkflowRun{}),
   210  		CertDir:                    certDir,
   211  	})
   212  	if err != nil {
   213  		klog.Error(err, "unable to start manager")
   214  		os.Exit(1)
   215  	}
   216  
   217  	kubeClient := mgr.GetClient()
   218  	if groupByLabel != "" {
   219  		if err := mgr.Add(utils.NewRecycleCronJob(kubeClient, recycleDuration, "0 0 * * *", groupByLabel)); err != nil {
   220  			klog.Error(err, "unable to start recycle cronjob")
   221  			os.Exit(1)
   222  		}
   223  	}
   224  
   225  	pd, err := packages.NewPackageDiscover(mgr.GetConfig())
   226  	if err != nil {
   227  		klog.Error(err, "Failed to create CRD discovery for CUE package client")
   228  		if !packages.IsCUEParseErr(err) {
   229  			os.Exit(1)
   230  		}
   231  	}
   232  	controllerArgs.PackageDiscover = pd
   233  
   234  	if useWebhook {
   235  		klog.InfoS("Enable webhook", "server port", strconv.Itoa(webhookPort))
   236  		webhook.Register(mgr, controllerArgs)
   237  		if err := waitWebhookSecretVolume(certDir, waitSecretTimeout, waitSecretInterval); err != nil {
   238  			klog.ErrorS(err, "Unable to get webhook secret")
   239  			os.Exit(1)
   240  		}
   241  	}
   242  
   243  	if err = (&controllers.WorkflowRunReconciler{
   244  		Client:            kubeClient,
   245  		Scheme:            mgr.GetScheme(),
   246  		Recorder:          event.NewAPIRecorder(mgr.GetEventRecorderFor("WorkflowRun")),
   247  		ControllerVersion: version.VelaVersion,
   248  		Args:              controllerArgs,
   249  	}).SetupWithManager(mgr); err != nil {
   250  		klog.Error(err, "unable to create controller", "controller", "WorkflowRun")
   251  		os.Exit(1)
   252  	}
   253  
   254  	if feature.DefaultMutableFeatureGate.Enabled(features.EnableBackupWorkflowRecord) {
   255  		if backupPersistType == "" {
   256  			klog.Warning("Backup persist type is empty, workflow record won't be persisted")
   257  		}
   258  		configSecret := &corev1.Secret{}
   259  		reader := mgr.GetAPIReader()
   260  		if err := reader.Get(context.Background(), client.ObjectKey{
   261  			Name:      backupConfigSecretName,
   262  			Namespace: backupConfigSecretNamespace,
   263  		}, configSecret); err != nil && !kerrors.IsNotFound(err) {
   264  			klog.Error(err, "unable to find secret")
   265  			os.Exit(1)
   266  		}
   267  		persister, err := backup.NewPersister(configSecret.Data, backupPersistType)
   268  		if err != nil {
   269  			klog.Error(err, "unable to create persister")
   270  			os.Exit(1)
   271  		}
   272  		if err = (&controllers.BackupReconciler{
   273  			Client:            kubeClient,
   274  			Scheme:            mgr.GetScheme(),
   275  			ControllerVersion: version.VelaVersion,
   276  			BackupArgs: controllers.BackupArgs{
   277  				BackupStrategy: backupStrategy,
   278  				IgnoreStrategy: backupIgnoreStrategy,
   279  				CleanOnBackup:  backupCleanOnBackup,
   280  				GroupByLabel:   groupByLabel,
   281  				Persister:      persister,
   282  			},
   283  			Args: controllerArgs,
   284  		}).SetupWithManager(mgr); err != nil {
   285  			klog.Error(err, "unable to create controller", "controller", "backup")
   286  			os.Exit(1)
   287  		}
   288  	}
   289  	//+kubebuilder:scaffold:builder
   290  	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
   291  		klog.Error(err, "unable to set up health check")
   292  		os.Exit(1)
   293  	}
   294  	if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
   295  		klog.Error(err, "unable to set up ready check")
   296  		os.Exit(1)
   297  	}
   298  
   299  	klog.Info("Start the vela workflow monitor")
   300  	informer, err := mgr.GetCache().GetInformer(context.Background(), &v1alpha1.WorkflowRun{})
   301  	if err != nil {
   302  		klog.ErrorS(err, "Unable to get informer for application")
   303  	}
   304  	watcher.StartWorkflowRunMetricsWatcher(informer)
   305  
   306  	klog.Info("starting manager")
   307  	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
   308  		klog.Error(err, "problem running manager")
   309  		os.Exit(1)
   310  	}
   311  
   312  	if logFilePath != "" {
   313  		klog.Flush()
   314  	}
   315  	klog.Info("Safely stops Program...")
   316  }
   317  
   318  // waitWebhookSecretVolume waits for webhook secret ready to avoid mgr running crash
   319  func waitWebhookSecretVolume(certDir string, timeout, interval time.Duration) error {
   320  	start := time.Now()
   321  	for {
   322  		time.Sleep(interval)
   323  		if time.Since(start) > timeout {
   324  			return fmt.Errorf("getting webhook secret timeout after %s", timeout.String())
   325  		}
   326  		klog.InfoS("Wait webhook secret", "time consumed(second)", int64(time.Since(start).Seconds()),
   327  			"timeout(second)", int64(timeout.Seconds()))
   328  		if _, err := os.Stat(certDir); !os.IsNotExist(err) {
   329  			ready := func() bool {
   330  				f, err := os.Open(filepath.Clean(certDir))
   331  				if err != nil {
   332  					return false
   333  				}
   334  				defer func() {
   335  					if err := f.Close(); err != nil {
   336  						klog.Error(err, "Failed to close file")
   337  					}
   338  				}()
   339  				// check if dir is empty
   340  				if _, err := f.Readdir(1); errors.Is(err, io.EOF) {
   341  					return false
   342  				}
   343  				// check if secret files are empty
   344  				err = filepath.Walk(certDir, func(path string, info os.FileInfo, err error) error {
   345  					// even Cert dir is created, cert files are still empty for a while
   346  					if info.Size() == 0 {
   347  						return errors.New("secret is not ready")
   348  					}
   349  					return nil
   350  				})
   351  				if err == nil {
   352  					klog.InfoS("Webhook secret is ready", "time consumed(second)",
   353  						int64(time.Since(start).Seconds()))
   354  					return true
   355  				}
   356  				return false
   357  			}()
   358  			if ready {
   359  				return nil
   360  			}
   361  		}
   362  	}
   363  }