github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/daemon.go (about) 1 // Package ais provides core functionality for the AIStore object storage. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package ais 6 7 import ( 8 "errors" 9 "flag" 10 "fmt" 11 "net/url" 12 "os" 13 "runtime" 14 "strings" 15 16 "github.com/NVIDIA/aistore/api/apc" 17 "github.com/NVIDIA/aistore/api/env" 18 "github.com/NVIDIA/aistore/cmn" 19 "github.com/NVIDIA/aistore/cmn/cos" 20 "github.com/NVIDIA/aistore/cmn/debug" 21 "github.com/NVIDIA/aistore/cmn/k8s" 22 "github.com/NVIDIA/aistore/cmn/nlog" 23 "github.com/NVIDIA/aistore/fs" 24 "github.com/NVIDIA/aistore/hk" 25 "github.com/NVIDIA/aistore/space" 26 "github.com/NVIDIA/aistore/sys" 27 "github.com/NVIDIA/aistore/xact/xreg" 28 "github.com/NVIDIA/aistore/xact/xs" 29 ) 30 31 const usecli = " -role=<proxy|target> -config=</dir/config.json> -local_config=</dir/local-config.json> ..." 32 33 type ( 34 daemonCtx struct { 35 cli cliFlags 36 rg *rungroup 37 version string // major.minor.build (see cmd/aisnode) 38 buildTime string // YYYY-MM-DD HH:MM:SS-TZ 39 EP string // env "AIS_PRIMARY_EP" 40 resilver struct { 41 reason string // Reason why resilver needs to be run. 42 required bool // Determines if the resilver needs to be started. 43 } 44 } 45 cliFlags struct { 46 localConfigPath string // path to local config 47 globalConfigPath string // path to global config 48 role string // proxy | target 49 daemonID string // daemon ID to assign 50 confCustom string // "key1=value1,key2=value2" formatted to override selected entries in config 51 primary struct { 52 ntargets int // expected number of targets in a starting-up cluster 53 skipStartup bool // determines if primary should skip waiting for targets to join 54 } 55 transient bool // true: keep command-line provided `-config-custom` settings in memory only 56 target struct { 57 // do not try to auto-join cluster upon startup - stand by and wait for admin request 58 standby bool 59 // force starting up with a lost or missing mountpath 60 startWithLostMountpath bool 61 // use loopback devices 62 useLoopbackDevs bool 63 } 64 usage bool // show usage and exit 65 } 66 runRet struct { 67 name string 68 err error 69 } 70 rungroup struct { 71 rs map[string]cos.Runner 72 errCh chan runRet 73 } 74 ) 75 76 var daemon = daemonCtx{} 77 78 func initFlags(flset *flag.FlagSet) { 79 // role aka `DaeType` 80 flset.StringVar(&daemon.cli.role, "role", "", "_role_ of this aisnode: 'proxy' OR 'target'") 81 flset.StringVar(&daemon.cli.daemonID, "daemon_id", "", "user-specified node ID (advanced usage only!)") 82 83 // config itself and its command line overrides 84 flset.StringVar(&daemon.cli.globalConfigPath, "config", "", 85 "config filename: local file that stores the global cluster configuration") 86 flset.StringVar(&daemon.cli.localConfigPath, "local_config", "", 87 "config filename: local file that stores daemon's local configuration") 88 flset.StringVar(&daemon.cli.confCustom, "config_custom", "", 89 "\"key1=value1,key2=value2\" formatted string to override selected entries in config") 90 flset.BoolVar(&daemon.cli.transient, "transient", false, "false: store customized (via '-config_custom') configuration\n"+ 91 "true: keep '-config_custom' settings in memory only (non-persistent)") 92 flset.BoolVar(&daemon.cli.usage, "h", false, "show usage and exit") 93 94 // target-only 95 flset.BoolVar(&daemon.cli.target.standby, "standby", false, 96 "when starting up, do not try to auto-join cluster - stand by and wait for admin request (target-only)") 97 flset.BoolVar(&cmn.AllowSharedDisksAndNoDisks, "allow_shared_no_disks", false, 98 "NOTE: deprecated, will be removed in future releases") 99 flset.BoolVar(&daemon.cli.target.useLoopbackDevs, "loopback", false, 100 "use loopback devices (local playground, target-only)") 101 flset.BoolVar(&daemon.cli.target.startWithLostMountpath, "start_with_lost_mountpath", false, 102 "force starting up with a lost or missing mountpath (target-only)") 103 104 // primary-only: 105 flset.IntVar(&daemon.cli.primary.ntargets, "ntargets", 0, 106 "number of storage targets expected to be joining at startup (optional, primary-only)") 107 flset.BoolVar(&daemon.cli.primary.skipStartup, "skip_startup", false, 108 "whether primary, when starting up, should skip waiting for target joins (used only in tests)") 109 } 110 111 func initDaemon(version, buildTime string) cos.Runner { 112 const erfm = "Missing `%s` flag pointing to configuration file (must be provided via command line)\n" 113 var ( 114 flset *flag.FlagSet 115 config *cmn.Config 116 err error 117 ) 118 // flags 119 flset = flag.NewFlagSet(os.Args[0], flag.ExitOnError) // discard flags of imported packages 120 initFlags(flset) 121 flset.Parse(os.Args[1:]) 122 if daemon.cli.usage || len(os.Args) == 1 { 123 fmt.Fprintln(os.Stderr, " Usage: "+os.Args[0]+usecli+"\n") 124 flset.PrintDefaults() 125 fmt.Fprintln(os.Stderr, " ---") 126 fmt.Fprintf(os.Stderr, " Version %s (build: %s)\n", version, buildTime) 127 fmt.Fprintln(os.Stderr, " Usage:\n\t"+os.Args[0]+usecli) 128 os.Exit(0) 129 } 130 if len(os.Args) == 2 && os.Args[1] == "version" { 131 fmt.Fprintf(os.Stderr, "version %s (build: %s)\n", version, buildTime) 132 os.Exit(0) 133 } 134 os.Args = []string{os.Args[0]} 135 flag.Parse() // so that imported packages don't complain 136 137 // validation 138 if daemon.cli.role != apc.Proxy && daemon.cli.role != apc.Target { 139 cos.ExitLogf("invalid node's role %q, expecting %q or %q", daemon.cli.role, apc.Proxy, apc.Target) 140 } 141 if daemon.cli.globalConfigPath == "" { 142 cos.ExitLogf(erfm, "config") 143 } 144 if daemon.cli.localConfigPath == "" { 145 cos.ExitLogf(erfm, "local-config") 146 } 147 148 // config 149 config = &cmn.Config{} 150 err = cmn.LoadConfig(daemon.cli.globalConfigPath, daemon.cli.localConfigPath, daemon.cli.role, config) 151 if err != nil { 152 cos.ExitLog(err) 153 } 154 cmn.GCO.Put(config) 155 156 // Examples overriding default configuration at a node startup via command line: 157 // 1) set client timeout to 13s and store the updated value on disk: 158 // $ aisnode -config=/etc/ais.json -local_config=/etc/ais_local.json -role=target \ 159 // -config_custom="client.client_timeout=13s" 160 // 161 // 2) same as above except that the new timeout will remain transient 162 // (won't persist across restarts): 163 // $ aisnode -config=/etc/ais.json -local_config=/etc/ais_local.json -role=target -transient=true \ 164 // -config_custom="client.client_timeout=13s" 165 // 3) e.g. updating log level: 166 // -config_custom="log.level=4611" 167 // (once done, `ais config node ... inherited log` will show "3 (modules: ec,xs)") 168 if daemon.cli.confCustom != "" { 169 var ( 170 toUpdate = &cmn.ConfigToSet{} 171 kvs = strings.Split(daemon.cli.confCustom, ",") 172 ) 173 if err := toUpdate.FillFromKVS(kvs); err != nil { 174 cos.ExitLog(err) 175 } 176 if err := setConfigInMem(toUpdate, config, apc.Daemon); err != nil { 177 cos.ExitLogf("failed to update config in memory: %v", err) 178 } 179 180 overrideConfig := cmn.GCO.MergeOverride(toUpdate) 181 if !daemon.cli.transient { 182 if err = cmn.SaveOverrideConfig(config.ConfigDir, overrideConfig); err != nil { 183 cos.ExitLogf("failed to save 'override' config: %v", err) 184 } 185 } 186 } 187 188 daemon.version, daemon.buildTime = version, buildTime 189 loghdr := fmt.Sprintf("Version %s, build time %s, debug %t", version, buildTime, debug.ON()) 190 cpus := sys.NumCPU() 191 if containerized := sys.Containerized(); containerized { 192 loghdr += fmt.Sprintf(", CPUs(%d, runtime=%d), containerized", cpus, runtime.NumCPU()) 193 } else { 194 loghdr += fmt.Sprintf(", CPUs(%d, runtime=%d)", cpus, runtime.NumCPU()) 195 } 196 nlog.Infoln(loghdr) // redundant (see below), prior to start/init 197 sys.SetMaxProcs() 198 199 daemon.rg = &rungroup{rs: make(map[string]cos.Runner, 6)} 200 hk.Init() 201 daemon.rg.add(hk.DefaultHK) 202 203 // K8s 204 k8s.Init() 205 206 // declared xactions, as per xact/api.go 207 xreg.Init() 208 209 // primary 'host[:port]' endpoint or URL from the environment 210 if daemon.EP = os.Getenv(env.AIS.PrimaryEP); daemon.EP != "" { 211 scheme := "http" 212 if config.Net.HTTP.UseHTTPS { 213 scheme = "https" 214 } 215 if strings.Contains(daemon.EP, "://") { 216 u, err := url.Parse(daemon.EP) 217 if err != nil { 218 cos.ExitLogf("invalid environment %s=%s: %v", env.AIS.PrimaryEP, daemon.EP, err) 219 } 220 if u.Path != "" && u.Path != "/" { 221 cos.ExitLogf("invalid environment %s=%s (not expecting path %q)", 222 env.AIS.PrimaryEP, daemon.EP, u.Path) 223 } 224 // reassemble and compare 225 ustr := scheme + "://" + u.Hostname() 226 if port := u.Port(); port != "" { 227 ustr += ":" + port 228 } 229 if ustr != daemon.EP { 230 nlog.Warningln("environment-set primary URL mismatch:", daemon.EP, "vs", ustr) 231 daemon.EP = ustr 232 } 233 } else { 234 daemon.EP = scheme + "://" + daemon.EP 235 } 236 } 237 238 // fork (proxy | target) 239 co := newConfigOwner(config) 240 if daemon.cli.role == apc.Proxy { 241 xs.Xreg(true /* x-ele only */) 242 p := newProxy(co) 243 p.init(config) 244 title := "Node " + p.si.Name() + ", " + loghdr + "\n" 245 nlog.Infoln(title) 246 247 // aux plumbing 248 nlog.SetTitle(title) 249 cmn.InitErrs(p.si.Name(), nil) 250 return p 251 } 252 253 // reg xaction factories 254 xs.Xreg(false /* x-ele only */) 255 space.Xreg() 256 257 t := newTarget(co) 258 t.init(config) 259 title := "Node " + t.si.Name() + ", " + loghdr + "\n" 260 nlog.Infoln(title) 261 262 // aux plumbing 263 nlog.SetTitle(title) 264 cmn.InitErrs(t.si.Name(), fs.CleanPathErr) 265 266 return t 267 } 268 269 func newProxy(co *configOwner) *proxy { 270 p := &proxy{} 271 p.owner.config = co 272 return p 273 } 274 275 func newTarget(co *configOwner) *target { 276 t := &target{backend: make(backends, 8)} 277 t.owner.bmd = newBMDOwnerTgt() 278 t.owner.etl = newEtlMDOwnerTgt() 279 t.owner.config = co 280 return t 281 } 282 283 // Run is the 'main' where everything gets started 284 func Run(version, buildTime string) int { 285 rmain := initDaemon(version, buildTime) 286 err := daemon.rg.runAll(rmain) 287 288 if err == nil { 289 nlog.Infoln("Terminated OK") 290 return 0 291 } 292 if e, ok := err.(*cos.ErrSignal); ok { 293 nlog.Infof("Terminated OK via %v", e) 294 return e.ExitCode() 295 } 296 if errors.Is(err, cmn.ErrStartupTimeout) { 297 // NOTE: 298 // stats and keepalive runners wait for the ClusterStarted() - i.e., for the primary 299 // to reach the corresponding stage. There must be an external "restarter" (e.g. K8s) 300 // to restart the daemon if the primary gets killed or panics prior (to reaching that state) 301 nlog.Errorln("Timed-out while starting up") 302 } 303 nlog.Errorf("Terminated with err: %v", err) 304 return 1 305 } 306 307 ////////////// 308 // rungroup // 309 ////////////// 310 311 func (g *rungroup) add(r cos.Runner) { 312 debug.Assert(r.Name() != "") 313 _, exists := g.rs[r.Name()] 314 debug.Assert(!exists) 315 316 g.rs[r.Name()] = r 317 } 318 319 func (g *rungroup) run(r cos.Runner) { 320 err := r.Run() 321 if err != nil { 322 nlog.Warningf("runner [%s] exited with err [%v]", r.Name(), err) 323 } 324 g.errCh <- runRet{r.Name(), err} 325 } 326 327 func (g *rungroup) runAll(mainRunner cos.Runner) error { 328 g.errCh = make(chan runRet, len(g.rs)) 329 330 // run all, housekeeper first 331 go g.run(hk.DefaultHK) 332 runtime.Gosched() 333 hk.WaitStarted() 334 for _, r := range g.rs { 335 if r.Name() == hk.DefaultHK.Name() { 336 continue 337 } 338 go g.run(r) 339 } 340 341 // Stop all runners, target (or proxy) first. 342 ret := <-g.errCh 343 nlog.SetStopping() 344 if ret.name != mainRunner.Name() { 345 mainRunner.Stop(ret.err) 346 } 347 for _, r := range g.rs { 348 if r.Name() != mainRunner.Name() { 349 r.Stop(ret.err) 350 } 351 } 352 // Wait for all terminations. 353 for range len(g.rs) - 1 { 354 <-g.errCh 355 } 356 return ret.err 357 } 358 359 /////////////// 360 // daemon ID // 361 /////////////// 362 363 const ( 364 daemonIDEnv = "AIS_DAEMON_ID" 365 ) 366 367 func envDaemonID(daemonType string) (daemonID string) { 368 if daemon.cli.daemonID != "" { 369 nlog.Warningf("%s[%q] ID from command-line", daemonType, daemon.cli.daemonID) 370 return daemon.cli.daemonID 371 } 372 if daemonID = os.Getenv(daemonIDEnv); daemonID != "" { 373 nlog.Warningf("%s[%q] ID from env", daemonType, daemonID) 374 } 375 return 376 } 377 378 func genDaemonID(daemonType string, config *cmn.Config) string { 379 if !config.TestingEnv() { 380 return cos.GenDaemonID() 381 } 382 switch daemonType { 383 case apc.Target: 384 return cos.GenTestingDaemonID(fmt.Sprintf("t%d", config.HostNet.Port)) 385 case apc.Proxy: 386 return cos.GenTestingDaemonID(fmt.Sprintf("p%d", config.HostNet.Port)) 387 } 388 cos.AssertMsg(false, daemonType) 389 return "" 390 }