github.com/webmeshproj/webmesh-cni@v0.0.27/internal/cmd/node/node.go

/*
Copyright 2023 Avi Zimmerman <avi.zimmerman@gmail.com>.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package node contains the entrypoint for the webmesh-cni node component.
package node

import (
	"context"
	"errors"
	"flag"
	"fmt"
	"net/netip"
	"os"
	"strings"
	"time"

	"github.com/knadh/koanf/parsers/json"
	"github.com/knadh/koanf/providers/posflag"
	"github.com/knadh/koanf/v2"
	"github.com/spf13/pflag"
	storagev1 "github.com/webmeshproj/storage-provider-k8s/api/storage/v1"
	storageprovider "github.com/webmeshproj/storage-provider-k8s/provider"
	"github.com/webmeshproj/webmesh/pkg/cmd/cmdutil"
	meshconfig "github.com/webmeshproj/webmesh/pkg/config"
	"github.com/webmeshproj/webmesh/pkg/plugins/builtins"
	meshservices "github.com/webmeshproj/webmesh/pkg/services"
	"github.com/webmeshproj/webmesh/pkg/services/meshdns"
	"github.com/webmeshproj/webmesh/pkg/version"
	"k8s.io/apimachinery/pkg/runtime"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
	_ "k8s.io/client-go/plugin/pkg/client/auth"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	ctrlconfig "sigs.k8s.io/controller-runtime/pkg/config"
	"sigs.k8s.io/controller-runtime/pkg/healthz"
	ctrllog "sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"

	cniv1 "github.com/webmeshproj/webmesh-cni/api/v1"
	"github.com/webmeshproj/webmesh-cni/internal/config"
	"github.com/webmeshproj/webmesh-cni/internal/controllers"
	"github.com/webmeshproj/webmesh-cni/internal/host"
	"github.com/webmeshproj/webmesh-cni/internal/metadata"
)

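// Shared state for Main: the Kubernetes API scheme, the root logger, the CNI
// configuration, and the zap logging options bound to the command line.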
var (
	scheme  = runtime.NewScheme()
	log     = ctrl.Log.WithName("webmesh-cni")
	cniopts = config.NewDefaultConfig()
	zapopts = zap.Options{Development: true}
)

func init() {
	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
	utilruntime.Must(cniv1.AddToScheme(scheme))
	utilruntime.Must(storagev1.AddToScheme(scheme))
}

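// pluginInArgs reports whether any command-line argument already configures the
// given built-in plugin.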
func pluginInArgs(pluginName string) bool {
	for _, arg := range os.Args {
		if strings.HasPrefix(arg, fmt.Sprintf("--host.plugins.%s", pluginName)) {
			return true
		}
	}
	return false
}

// Main runs the webmesh-cni daemon.
func Main(build version.BuildInfo) {
	// Build the flagset
	var configMap string
	var configMapNamespace string
	zapset := flag.NewFlagSet("zap", flag.ContinueOnError)
	fs := pflag.NewFlagSet("webmesh-cni", pflag.ContinueOnError)
	cniopts.BindFlags(fs)
	zapopts.BindFlags(zapset)
	fs.AddGoFlagSet(zapset)
	fs.StringVar(&configMap, "configmap", "", "The name of the configmap to load configuration from.")
	fs.StringVar(&configMapNamespace, "configmap-namespace", "kube-system", "The namespace of the configmap to load configuration from.")

	// Create a separate flag set with all plugins for usage.
	usage := pflag.NewFlagSet("usage", pflag.ContinueOnError)
	usage.AddFlagSet(fs)
	pluginConfigs := builtins.NewPluginConfigs()
	for pluginName, pluginConfig := range pluginConfigs {
		if !pluginInArgs(pluginName) {
			pluginConfig.BindFlags(fmt.Sprintf("host.plugins.%s.", pluginName), usage)
		}
	}
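	// Install a usage function that renders the full flag set, organized by the
	// configuration prefixes below.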
	fs.Usage = cmdutil.NewUsageFunc(cmdutil.UsageConfig{
		Name:        "webmesh-cni-node",
		Description: "The webmesh-cni node component.",
		Prefixes: []string{
			"manager",
			"host",
			"host.auth",
			"host.network",
			"host.services",
			"host.wireguard",
			"host.plugins",
			"storage",
		},
		Flagset: usage,
	})

	// Parse flags and set up logging.
	err := fs.Parse(os.Args[1:])
	if err != nil {
		if errors.Is(err, pflag.ErrHelp) {
			os.Exit(0)
		}
		fmt.Println("ERROR: Failed to parse flags:", err)
		os.Exit(1)
	}
	ctrl.SetLogger(zap.New(zap.UseFlagOptions(&zapopts)))

	// Load the configuration from flags and the configmap, if one was provided.
	k := koanf.New(".")
	if configMap != "" {
		provider := config.NewConfigMapProvider(ctrl.GetConfigOrDie(), client.ObjectKey{
			Name:      configMap,
			Namespace: configMapNamespace,
		})
		err := k.Load(provider, json.Parser())
		if err != nil {
			log.Error(err, "Failed to load configuration from configmap")
			os.Exit(1)
		}
	}
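	// Merge command-line flags over the configmap values. Passing the koanf
	// instance lets flag defaults yield to values that were already loaded.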
	err = k.Load(posflag.Provider(fs, ".", k), nil)
	if err != nil {
		log.Error(err, "Failed to load configuration from flags")
		os.Exit(1)
	}
	err = k.Unmarshal("", &cniopts)
	if err != nil {
		log.Error(err, "Failed to unmarshal configuration")
		os.Exit(1)
	}

	// Validate the configuration.
	err = cniopts.Validate()
	if err != nil {
		log.Error(err, "Invalid CNI configuration")
		os.Exit(1)
	}

	log.Info("Starting webmesh-cni node", "version", build)

	// Create the manager.
	ctx := ctrl.SetupSignalHandler()
	ctx = ctrllog.IntoContext(ctx, log)
	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
		Scheme: scheme,
		Metrics: metricsserver.Options{
			BindAddress: cniopts.Manager.MetricsAddress,
		},
		HealthProbeBindAddress:  cniopts.Manager.ProbeAddress,
		GracefulShutdownTimeout: &cniopts.Manager.ShutdownTimeout,
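		// Controllers run on every node rather than behind leader election, and
		// reconciles for each CNI kind are serialized.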
		Controller: ctrlconfig.Controller{
			MaxConcurrentReconciles: cniopts.Manager.MaxConcurrentReconciles,
			GroupKindConcurrency: map[string]int{
				"PeerContainer.cni.webmesh.io": 1,
				"RemoteNetwork.cni.webmesh.io": 1,
			},
			NeedLeaderElection: &[]bool{false}[0],
		},
	})
	if err != nil {
		log.Error(err, "Failed to create controller manager")
		os.Exit(1)
	}

	// Create the storage provider.
	storageOpts := storageprovider.Options{
		NodeID:                      cniopts.Host.NodeID,
		Namespace:                   cniopts.Host.Namespace,
		ListenPort:                  int(cniopts.Host.Services.API.ListenPort()),
		LeaderElectionLeaseDuration: cniopts.Storage.LeaderElectLeaseDuration,
		LeaderElectionRenewDeadline: cniopts.Storage.LeaderElectRenewDeadline,
		LeaderElectionRetryPeriod:   cniopts.Storage.LeaderElectRetryPeriod,
		ShutdownTimeout:             cniopts.Manager.ShutdownTimeout,
	}
	log.V(1).Info("Creating webmesh storage provider", "options", storageOpts)
	storageProvider, err := storageprovider.NewWithManager(mgr, storageOpts)
	if err != nil {
		log.Error(err, "Failed to create webmesh storage provider")
		os.Exit(1)
	}

	// Set up the host node.
	var metaaddr netip.AddrPort
	if cniopts.Manager.EnableMetadataServer {
		// Append the metadata server address to the allowed routes.
		metaaddr, err = netip.ParseAddrPort(cniopts.Manager.MetadataAddress)
		if err != nil {
			log.Error(err, "Failed to parse metadata address")
			os.Exit(1)
		}
		metaaddrPrefix := netip.PrefixFrom(metaaddr.Addr(), 32)
		cniopts.Host.Network.Routes = append(cniopts.Host.Network.Routes, metaaddrPrefix.String())
	}
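	// The host node represents this machine's membership in the mesh and is shared
	// by all of the reconcilers registered below.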
	hostnode := host.NewNode(storageProvider, cniopts.Host)

	// Register the main peer container controller.
	log.V(1).Info("Registering peer container controller")
	containerReconciler := &controllers.PeerContainerReconciler{
		Client:   mgr.GetClient(),
		Host:     hostnode,
		Provider: storageProvider,
		Config:   cniopts,
	}
	if err = containerReconciler.SetupWithManager(mgr); err != nil {
		log.Error(err, "Failed to setup container reconciler with manager", "controller", "PeerContainer")
		os.Exit(1)
	}
	// Register a node reconciler to make sure edges exist across the cluster.
	log.V(1).Info("Registering node controller")
	nodeReconciler := &controllers.NodeReconciler{
		Client:   mgr.GetClient(),
		Host:     hostnode,
		Provider: storageProvider,
	}
	if err = nodeReconciler.SetupWithManager(mgr); err != nil {
		log.Error(err, "Failed to setup node reconciler with manager", "controller", "Node")
		os.Exit(1)
	}
	// Register a pod reconciler to check for containers that can broadcast features
	// to the outside world.
	log.V(1).Info("Registering pod controller")
	podReconciler := &controllers.PodReconciler{
		Client:       mgr.GetClient(),
		Host:         hostnode,
		Provider:     storageProvider,
		DNSSelector:  cniopts.Manager.ClusterDNSSelector,
		DNSNamespace: cniopts.Manager.ClusterDNSNamespace,
		DNSPort:      cniopts.Manager.ClusterDNSPortSelector,
	}
	if err = podReconciler.SetupWithManager(mgr); err != nil {
		log.Error(err, "Failed to setup pod reconciler with manager", "controller", "Pod")
		os.Exit(1)
	}
	// Register the remote network reconciler for maintaining bridge connections to
	// other clusters.
	log.V(1).Info("Registering remote network controller")
	remoteNetworkReconciler := &controllers.RemoteNetworkReconciler{
		Client:   mgr.GetClient(),
		Config:   cniopts,
		Provider: storageProvider,
		HostNode: hostnode,
	}
	if err = remoteNetworkReconciler.SetupWithManager(mgr); err != nil {
		log.Error(err, "Failed to setup remote network reconciler with manager", "controller", "RemoteNetwork")
		os.Exit(1)
	}

	// Register the health and ready checks.
	log.V(1).Info("Registering health and ready checks")
	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
		log.Error(err, "Failed to set up health check")
		os.Exit(1)
	}
	if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
		log.Error(err, "Failed to set up ready check")
		os.Exit(1)
	}

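	// Run the controller manager in the background so the storage provider and host
	// node can be started on this goroutine. When the manager exits, any container
	// nodes it was managing are shut down.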
	donec := make(chan struct{})
	go func() {
		defer close(donec)
		log.Info("Starting peer container manager")
		if err := mgr.Start(ctx); err != nil {
			log.Error(err, "Problem running manager")
			os.Exit(1)
		}
		log.Info("Peer container manager finished")
		ctx, cancel := context.WithTimeout(
			ctrllog.IntoContext(context.Background(), log),
			cniopts.Manager.ShutdownTimeout,
		)
		defer cancel()
		log.Info("Shutting down managed container nodes")
		containerReconciler.Shutdown(ctx)
	}()

	// Start the storage provider in unmanaged mode.
	log.Info("Starting webmesh storage provider")
	err = storageProvider.StartUnmanaged(ctx)
	if err != nil {
		log.Error(err, "Failed to start webmesh storage provider")
		os.Exit(1)
	}

	// Wait for the manager cache to sync before bootstrapping the network state.
	log.Info("Waiting for manager cache to sync", "timeout", cniopts.Storage.CacheSyncTimeout)
	cacheCtx, cancel := context.WithTimeout(ctx, cniopts.Storage.CacheSyncTimeout)
	if synced := mgr.GetCache().WaitForCacheSync(cacheCtx); !synced {
		if err := storageProvider.Close(); err != nil {
			log.Error(err, "Failed to stop storage provider")
		}
		cancel()
		log.Error(nil, "Timed out waiting for caches to sync")
		os.Exit(1)
	}
	cancel()
	log.V(1).Info("Caches synced, bootstrapping network state")

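	// Start the host node. This joins this node to the mesh and brings up its
	// WireGuard interface so it can route traffic for containers.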
	log.Info("Starting host node for routing traffic")
	host := containerReconciler.Host
	err = host.Start(ctx, mgr.GetConfig())
	if err != nil {
		if err := storageProvider.Close(); err != nil {
			log.Error(err, "Failed to stop storage provider")
		}
		cancel()
		log.Error(err, "Failed to start host node")
		os.Exit(1)
	}

	log.Info("Webmesh CNI node started")

	// Start any configured services.

	if cniopts.Manager.EnableMetadataServer {
		// Add the metadata address to the wireguard interface.
		addr := netip.PrefixFrom(metaaddr.Addr(), 32)
		err = host.Node().Network().WireGuard().AddAddress(ctx, addr)
		if err != nil {
			if stopErr := host.Stop(ctx); stopErr != nil {
				log.Error(stopErr, "Failed to stop host node")
			}
			log.Error(err, "Failed to add metadata address to wireguard interface")
			os.Exit(1)
		}
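		// Serve the metadata API on the address that was just added to the
		// WireGuard interface.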
		metasrv := metadata.NewServer(metadata.Config{
			Address:        metaaddr,
			Host:           host,
			Storage:        storageProvider,
			KeyResolver:    containerReconciler,
			EnableIDTokens: cniopts.Manager.EnableMetadataIDTokens,
		})
		go func() {
			log.Info("Starting metadata server")
			err := metasrv.ListenAndServe()
			if err != nil {
				if stopErr := host.Stop(ctx); stopErr != nil {
					log.Error(stopErr, "Failed to stop host node")
				}
				log.Error(err, "Failed to start metadata server")
				os.Exit(1)
			}
		}()
		defer func() {
			if err := metasrv.Shutdown(context.Background()); err != nil {
				log.Error(err, "Failed to shutdown metadata server")
			}
		}()
	}

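	// Build the webmesh services (such as MeshDNS and the mesh APIs) from the host
	// node's context.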
	hostCtx := host.NodeContext(context.Background())
	if cniopts.Host.Services.MeshDNS.Enabled {
		// Force subscribe-forwarders on, otherwise the MeshDNS server would serve
		// very little purpose. This ensures we wind up with CoreDNS as a forwarding
		// server for non-root zones.
		cniopts.Host.Services.MeshDNS.SubscribeForwarders = true
	}
	srvOpts, err := cniopts.Host.Services.NewServiceOptions(hostCtx, host.Node())
	if err != nil {
		if stopErr := host.Stop(ctx); stopErr != nil {
			log.Error(stopErr, "Failed to stop host node")
		}
		log.Error(err, "Failed to create webmesh service options")
		os.Exit(1)
	}
	if cniopts.Host.Services.MeshDNS.Enabled {
		// Hand the MeshDNS server to the remote network and container reconcilers.
		dnssrv, ok := srvOpts.GetServer(&meshdns.Server{})
		if !ok {
			// Something bizarre happened.
			if stopErr := host.Stop(ctx); stopErr != nil {
				log.Error(stopErr, "Failed to stop host node")
			}
			log.Error(nil, "Failed to get meshdns server")
			os.Exit(1)
		}
		remoteNetworkReconciler.SetDNSServer(dnssrv.(*meshdns.Server))
		containerReconciler.SetDNSServer(dnssrv.(*meshdns.Server))
	}
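	// Create the server that exposes the configured webmesh services, then register
	// the mesh APIs on it when the API service is enabled.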
	srv, err := meshservices.NewServer(hostCtx, srvOpts)
	if err != nil {
		if stopErr := host.Stop(ctx); stopErr != nil {
			log.Error(stopErr, "Failed to stop host node")
		}
		log.Error(err, "Failed to create webmesh services server")
		os.Exit(1)
	}
	if !cniopts.Host.Services.API.Disabled {
		err = cniopts.Host.Services.RegisterAPIs(hostCtx, meshconfig.APIRegistrationOptions{
			Node:        host.Node(),
			Server:      srv,
			Features:    cniopts.Host.Services.NewFeatureSet(storageProvider, srv.GRPCListenPort()),
			Description: "webmesh-cni",
			BuildInfo:   build,
		})
		if err != nil {
			if stopErr := host.Stop(ctx); stopErr != nil {
				log.Error(stopErr, "Failed to stop host node")
			}
			log.Error(err, "Failed to register webmesh services APIs")
			os.Exit(1)
		}
	}
	go func() {
		log.Info("Starting webmesh services")
		err := srv.ListenAndServe()
		if err != nil {
			if stopErr := host.Stop(ctx); stopErr != nil {
				log.Error(stopErr, "Failed to stop host node")
			}
			log.Error(err, "Failed to start webmesh services server")
			os.Exit(1)
		}
	}()

	// Block until a shutdown signal is received.
	<-ctx.Done()

	log.Info("Shutting down webmesh node and services")
	shutdownCtx, cancel := context.WithTimeout(
		ctrllog.IntoContext(context.Background(), log),
		cniopts.Manager.ShutdownTimeout,
	)
	defer cancel()
	err = host.Stop(shutdownCtx)
	if err != nil {
		log.Error(err, "Failed to stop host node")
	}
	srv.Shutdown(hostCtx)

	// Wait for the manager to exit.
	select {
	case <-donec:
		log.Info("Finished running manager")
	case <-time.After(cniopts.Manager.ShutdownTimeout):
		log.Info("Shutdown timeout reached, exiting")
	}
}