github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/daemon.go (about)

     1  // Package ais provides core functionality for the AIStore object storage.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package ais
     6  
     7  import (
     8  	"errors"
     9  	"flag"
    10  	"fmt"
    11  	"net/url"
    12  	"os"
    13  	"runtime"
    14  	"strings"
    15  
    16  	"github.com/NVIDIA/aistore/api/apc"
    17  	"github.com/NVIDIA/aistore/api/env"
    18  	"github.com/NVIDIA/aistore/cmn"
    19  	"github.com/NVIDIA/aistore/cmn/cos"
    20  	"github.com/NVIDIA/aistore/cmn/debug"
    21  	"github.com/NVIDIA/aistore/cmn/k8s"
    22  	"github.com/NVIDIA/aistore/cmn/nlog"
    23  	"github.com/NVIDIA/aistore/fs"
    24  	"github.com/NVIDIA/aistore/hk"
    25  	"github.com/NVIDIA/aistore/space"
    26  	"github.com/NVIDIA/aistore/sys"
    27  	"github.com/NVIDIA/aistore/xact/xreg"
    28  	"github.com/NVIDIA/aistore/xact/xs"
    29  )
    30  
// usecli is the command-line synopsis that gets appended to the executable
// name (os.Args[0]) when printing usage.
const usecli = " -role=<proxy|target> -config=</dir/config.json> -local_config=</dir/local-config.json> ..."
    32  
    33  type (
    34  	daemonCtx struct {
    35  		cli       cliFlags
    36  		rg        *rungroup
    37  		version   string // major.minor.build (see cmd/aisnode)
    38  		buildTime string // YYYY-MM-DD HH:MM:SS-TZ
    39  		EP        string // env "AIS_PRIMARY_EP"
    40  		resilver  struct {
    41  			reason   string // Reason why resilver needs to be run.
    42  			required bool   // Determines if the resilver needs to be started.
    43  		}
    44  	}
    45  	cliFlags struct {
    46  		localConfigPath  string // path to local config
    47  		globalConfigPath string // path to global config
    48  		role             string // proxy | target
    49  		daemonID         string // daemon ID to assign
    50  		confCustom       string // "key1=value1,key2=value2" formatted to override selected entries in config
    51  		primary          struct {
    52  			ntargets    int  // expected number of targets in a starting-up cluster
    53  			skipStartup bool // determines if primary should skip waiting for targets to join
    54  		}
    55  		transient bool // true: keep command-line provided `-config-custom` settings in memory only
    56  		target    struct {
    57  			// do not try to auto-join cluster upon startup - stand by and wait for admin request
    58  			standby bool
    59  			// force starting up with a lost or missing mountpath
    60  			startWithLostMountpath bool
    61  			// use loopback devices
    62  			useLoopbackDevs bool
    63  		}
    64  		usage bool // show usage and exit
    65  	}
    66  	runRet struct {
    67  		name string
    68  		err  error
    69  	}
    70  	rungroup struct {
    71  		rs    map[string]cos.Runner
    72  		errCh chan runRet
    73  	}
    74  )
    75  
    76  var daemon = daemonCtx{}
    77  
    78  func initFlags(flset *flag.FlagSet) {
    79  	// role aka `DaeType`
    80  	flset.StringVar(&daemon.cli.role, "role", "", "_role_ of this aisnode: 'proxy' OR 'target'")
    81  	flset.StringVar(&daemon.cli.daemonID, "daemon_id", "", "user-specified node ID (advanced usage only!)")
    82  
    83  	// config itself and its command line overrides
    84  	flset.StringVar(&daemon.cli.globalConfigPath, "config", "",
    85  		"config filename: local file that stores the global cluster configuration")
    86  	flset.StringVar(&daemon.cli.localConfigPath, "local_config", "",
    87  		"config filename: local file that stores daemon's local configuration")
    88  	flset.StringVar(&daemon.cli.confCustom, "config_custom", "",
    89  		"\"key1=value1,key2=value2\" formatted string to override selected entries in config")
    90  	flset.BoolVar(&daemon.cli.transient, "transient", false, "false: store customized (via '-config_custom') configuration\n"+
    91  		"true: keep '-config_custom' settings in memory only (non-persistent)")
    92  	flset.BoolVar(&daemon.cli.usage, "h", false, "show usage and exit")
    93  
    94  	// target-only
    95  	flset.BoolVar(&daemon.cli.target.standby, "standby", false,
    96  		"when starting up, do not try to auto-join cluster - stand by and wait for admin request (target-only)")
    97  	flset.BoolVar(&cmn.AllowSharedDisksAndNoDisks, "allow_shared_no_disks", false,
    98  		"NOTE: deprecated, will be removed in future releases")
    99  	flset.BoolVar(&daemon.cli.target.useLoopbackDevs, "loopback", false,
   100  		"use loopback devices (local playground, target-only)")
   101  	flset.BoolVar(&daemon.cli.target.startWithLostMountpath, "start_with_lost_mountpath", false,
   102  		"force starting up with a lost or missing mountpath (target-only)")
   103  
   104  	// primary-only:
   105  	flset.IntVar(&daemon.cli.primary.ntargets, "ntargets", 0,
   106  		"number of storage targets expected to be joining at startup (optional, primary-only)")
   107  	flset.BoolVar(&daemon.cli.primary.skipStartup, "skip_startup", false,
   108  		"whether primary, when starting up, should skip waiting for target joins (used only in tests)")
   109  }
   110  
   111  func initDaemon(version, buildTime string) cos.Runner {
   112  	const erfm = "Missing `%s` flag pointing to configuration file (must be provided via command line)\n"
   113  	var (
   114  		flset  *flag.FlagSet
   115  		config *cmn.Config
   116  		err    error
   117  	)
   118  	// flags
   119  	flset = flag.NewFlagSet(os.Args[0], flag.ExitOnError) // discard flags of imported packages
   120  	initFlags(flset)
   121  	flset.Parse(os.Args[1:])
   122  	if daemon.cli.usage || len(os.Args) == 1 {
   123  		fmt.Fprintln(os.Stderr, "  Usage: "+os.Args[0]+usecli+"\n")
   124  		flset.PrintDefaults()
   125  		fmt.Fprintln(os.Stderr, "  ---")
   126  		fmt.Fprintf(os.Stderr, "  Version %s (build: %s)\n", version, buildTime)
   127  		fmt.Fprintln(os.Stderr, "  Usage:\n\t"+os.Args[0]+usecli)
   128  		os.Exit(0)
   129  	}
   130  	if len(os.Args) == 2 && os.Args[1] == "version" {
   131  		fmt.Fprintf(os.Stderr, "version %s (build: %s)\n", version, buildTime)
   132  		os.Exit(0)
   133  	}
   134  	os.Args = []string{os.Args[0]}
   135  	flag.Parse() // so that imported packages don't complain
   136  
   137  	// validation
   138  	if daemon.cli.role != apc.Proxy && daemon.cli.role != apc.Target {
   139  		cos.ExitLogf("invalid node's role %q, expecting %q or %q", daemon.cli.role, apc.Proxy, apc.Target)
   140  	}
   141  	if daemon.cli.globalConfigPath == "" {
   142  		cos.ExitLogf(erfm, "config")
   143  	}
   144  	if daemon.cli.localConfigPath == "" {
   145  		cos.ExitLogf(erfm, "local-config")
   146  	}
   147  
   148  	// config
   149  	config = &cmn.Config{}
   150  	err = cmn.LoadConfig(daemon.cli.globalConfigPath, daemon.cli.localConfigPath, daemon.cli.role, config)
   151  	if err != nil {
   152  		cos.ExitLog(err)
   153  	}
   154  	cmn.GCO.Put(config)
   155  
   156  	// Examples overriding default configuration at a node startup via command line:
   157  	// 1) set client timeout to 13s and store the updated value on disk:
   158  	// $ aisnode -config=/etc/ais.json -local_config=/etc/ais_local.json -role=target \
   159  	//   -config_custom="client.client_timeout=13s"
   160  	//
   161  	// 2) same as above except that the new timeout will remain transient
   162  	//    (won't persist across restarts):
   163  	// $ aisnode -config=/etc/ais.json -local_config=/etc/ais_local.json -role=target -transient=true \
   164  	//   -config_custom="client.client_timeout=13s"
   165  	// 3) e.g. updating log level:
   166  	//   -config_custom="log.level=4611"
   167  	//   (once done, `ais config node ... inherited log` will show "3 (modules: ec,xs)")
   168  	if daemon.cli.confCustom != "" {
   169  		var (
   170  			toUpdate = &cmn.ConfigToSet{}
   171  			kvs      = strings.Split(daemon.cli.confCustom, ",")
   172  		)
   173  		if err := toUpdate.FillFromKVS(kvs); err != nil {
   174  			cos.ExitLog(err)
   175  		}
   176  		if err := setConfigInMem(toUpdate, config, apc.Daemon); err != nil {
   177  			cos.ExitLogf("failed to update config in memory: %v", err)
   178  		}
   179  
   180  		overrideConfig := cmn.GCO.MergeOverride(toUpdate)
   181  		if !daemon.cli.transient {
   182  			if err = cmn.SaveOverrideConfig(config.ConfigDir, overrideConfig); err != nil {
   183  				cos.ExitLogf("failed to save 'override' config: %v", err)
   184  			}
   185  		}
   186  	}
   187  
   188  	daemon.version, daemon.buildTime = version, buildTime
   189  	loghdr := fmt.Sprintf("Version %s, build time %s, debug %t", version, buildTime, debug.ON())
   190  	cpus := sys.NumCPU()
   191  	if containerized := sys.Containerized(); containerized {
   192  		loghdr += fmt.Sprintf(", CPUs(%d, runtime=%d), containerized", cpus, runtime.NumCPU())
   193  	} else {
   194  		loghdr += fmt.Sprintf(", CPUs(%d, runtime=%d)", cpus, runtime.NumCPU())
   195  	}
   196  	nlog.Infoln(loghdr) // redundant (see below), prior to start/init
   197  	sys.SetMaxProcs()
   198  
   199  	daemon.rg = &rungroup{rs: make(map[string]cos.Runner, 6)}
   200  	hk.Init()
   201  	daemon.rg.add(hk.DefaultHK)
   202  
   203  	// K8s
   204  	k8s.Init()
   205  
   206  	// declared xactions, as per xact/api.go
   207  	xreg.Init()
   208  
   209  	// primary 'host[:port]' endpoint or URL from the environment
   210  	if daemon.EP = os.Getenv(env.AIS.PrimaryEP); daemon.EP != "" {
   211  		scheme := "http"
   212  		if config.Net.HTTP.UseHTTPS {
   213  			scheme = "https"
   214  		}
   215  		if strings.Contains(daemon.EP, "://") {
   216  			u, err := url.Parse(daemon.EP)
   217  			if err != nil {
   218  				cos.ExitLogf("invalid environment %s=%s: %v", env.AIS.PrimaryEP, daemon.EP, err)
   219  			}
   220  			if u.Path != "" && u.Path != "/" {
   221  				cos.ExitLogf("invalid environment %s=%s (not expecting path %q)",
   222  					env.AIS.PrimaryEP, daemon.EP, u.Path)
   223  			}
   224  			// reassemble and compare
   225  			ustr := scheme + "://" + u.Hostname()
   226  			if port := u.Port(); port != "" {
   227  				ustr += ":" + port
   228  			}
   229  			if ustr != daemon.EP {
   230  				nlog.Warningln("environment-set primary URL mismatch:", daemon.EP, "vs", ustr)
   231  				daemon.EP = ustr
   232  			}
   233  		} else {
   234  			daemon.EP = scheme + "://" + daemon.EP
   235  		}
   236  	}
   237  
   238  	// fork (proxy | target)
   239  	co := newConfigOwner(config)
   240  	if daemon.cli.role == apc.Proxy {
   241  		xs.Xreg(true /* x-ele only */)
   242  		p := newProxy(co)
   243  		p.init(config)
   244  		title := "Node " + p.si.Name() + ", " + loghdr + "\n"
   245  		nlog.Infoln(title)
   246  
   247  		// aux plumbing
   248  		nlog.SetTitle(title)
   249  		cmn.InitErrs(p.si.Name(), nil)
   250  		return p
   251  	}
   252  
   253  	// reg xaction factories
   254  	xs.Xreg(false /* x-ele only */)
   255  	space.Xreg()
   256  
   257  	t := newTarget(co)
   258  	t.init(config)
   259  	title := "Node " + t.si.Name() + ", " + loghdr + "\n"
   260  	nlog.Infoln(title)
   261  
   262  	// aux plumbing
   263  	nlog.SetTitle(title)
   264  	cmn.InitErrs(t.si.Name(), fs.CleanPathErr)
   265  
   266  	return t
   267  }
   268  
   269  func newProxy(co *configOwner) *proxy {
   270  	p := &proxy{}
   271  	p.owner.config = co
   272  	return p
   273  }
   274  
   275  func newTarget(co *configOwner) *target {
   276  	t := &target{backend: make(backends, 8)}
   277  	t.owner.bmd = newBMDOwnerTgt()
   278  	t.owner.etl = newEtlMDOwnerTgt()
   279  	t.owner.config = co
   280  	return t
   281  }
   282  
   283  // Run is the 'main' where everything gets started
   284  func Run(version, buildTime string) int {
   285  	rmain := initDaemon(version, buildTime)
   286  	err := daemon.rg.runAll(rmain)
   287  
   288  	if err == nil {
   289  		nlog.Infoln("Terminated OK")
   290  		return 0
   291  	}
   292  	if e, ok := err.(*cos.ErrSignal); ok {
   293  		nlog.Infof("Terminated OK via %v", e)
   294  		return e.ExitCode()
   295  	}
   296  	if errors.Is(err, cmn.ErrStartupTimeout) {
   297  		// NOTE:
   298  		// stats and keepalive runners wait for the ClusterStarted() - i.e., for the primary
   299  		// to reach the corresponding stage. There must be an external "restarter" (e.g. K8s)
   300  		// to restart the daemon if the primary gets killed or panics prior (to reaching that state)
   301  		nlog.Errorln("Timed-out while starting up")
   302  	}
   303  	nlog.Errorf("Terminated with err: %v", err)
   304  	return 1
   305  }
   306  
   307  //////////////
   308  // rungroup //
   309  //////////////
   310  
   311  func (g *rungroup) add(r cos.Runner) {
   312  	debug.Assert(r.Name() != "")
   313  	_, exists := g.rs[r.Name()]
   314  	debug.Assert(!exists)
   315  
   316  	g.rs[r.Name()] = r
   317  }
   318  
   319  func (g *rungroup) run(r cos.Runner) {
   320  	err := r.Run()
   321  	if err != nil {
   322  		nlog.Warningf("runner [%s] exited with err [%v]", r.Name(), err)
   323  	}
   324  	g.errCh <- runRet{r.Name(), err}
   325  }
   326  
   327  func (g *rungroup) runAll(mainRunner cos.Runner) error {
   328  	g.errCh = make(chan runRet, len(g.rs))
   329  
   330  	// run all, housekeeper first
   331  	go g.run(hk.DefaultHK)
   332  	runtime.Gosched()
   333  	hk.WaitStarted()
   334  	for _, r := range g.rs {
   335  		if r.Name() == hk.DefaultHK.Name() {
   336  			continue
   337  		}
   338  		go g.run(r)
   339  	}
   340  
   341  	// Stop all runners, target (or proxy) first.
   342  	ret := <-g.errCh
   343  	nlog.SetStopping()
   344  	if ret.name != mainRunner.Name() {
   345  		mainRunner.Stop(ret.err)
   346  	}
   347  	for _, r := range g.rs {
   348  		if r.Name() != mainRunner.Name() {
   349  			r.Stop(ret.err)
   350  		}
   351  	}
   352  	// Wait for all terminations.
   353  	for range len(g.rs) - 1 {
   354  		<-g.errCh
   355  	}
   356  	return ret.err
   357  }
   358  
   359  ///////////////
   360  // daemon ID //
   361  ///////////////
   362  
   363  const (
   364  	daemonIDEnv = "AIS_DAEMON_ID"
   365  )
   366  
   367  func envDaemonID(daemonType string) (daemonID string) {
   368  	if daemon.cli.daemonID != "" {
   369  		nlog.Warningf("%s[%q] ID from command-line", daemonType, daemon.cli.daemonID)
   370  		return daemon.cli.daemonID
   371  	}
   372  	if daemonID = os.Getenv(daemonIDEnv); daemonID != "" {
   373  		nlog.Warningf("%s[%q] ID from env", daemonType, daemonID)
   374  	}
   375  	return
   376  }
   377  
   378  func genDaemonID(daemonType string, config *cmn.Config) string {
   379  	if !config.TestingEnv() {
   380  		return cos.GenDaemonID()
   381  	}
   382  	switch daemonType {
   383  	case apc.Target:
   384  		return cos.GenTestingDaemonID(fmt.Sprintf("t%d", config.HostNet.Port))
   385  	case apc.Proxy:
   386  		return cos.GenTestingDaemonID(fmt.Sprintf("p%d", config.HostNet.Port))
   387  	}
   388  	cos.AssertMsg(false, daemonType)
   389  	return ""
   390  }