github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/htrun.go (about) 1 // Package ais provides core functionality for the AIStore object storage. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package ais 6 7 import ( 8 "bytes" 9 "context" 10 "crypto/tls" 11 "errors" 12 "fmt" 13 "io" 14 "log" 15 "net/http" 16 "net/url" 17 "os" 18 "path/filepath" 19 "runtime" 20 "strconv" 21 "strings" 22 "sync" 23 "time" 24 25 "github.com/NVIDIA/aistore/api/apc" 26 "github.com/NVIDIA/aistore/api/env" 27 "github.com/NVIDIA/aistore/cmn" 28 "github.com/NVIDIA/aistore/cmn/archive" 29 "github.com/NVIDIA/aistore/cmn/atomic" 30 "github.com/NVIDIA/aistore/cmn/cifl" 31 "github.com/NVIDIA/aistore/cmn/cos" 32 "github.com/NVIDIA/aistore/cmn/debug" 33 "github.com/NVIDIA/aistore/cmn/jsp" 34 "github.com/NVIDIA/aistore/cmn/k8s" 35 "github.com/NVIDIA/aistore/cmn/mono" 36 "github.com/NVIDIA/aistore/cmn/nlog" 37 "github.com/NVIDIA/aistore/core" 38 "github.com/NVIDIA/aistore/core/meta" 39 "github.com/NVIDIA/aistore/memsys" 40 "github.com/NVIDIA/aistore/stats" 41 "github.com/NVIDIA/aistore/xact/xreg" 42 jsoniter "github.com/json-iterator/go" 43 "github.com/prometheus/client_golang/prometheus/promhttp" 44 "github.com/tinylib/msgp/msgp" 45 ) 46 47 const ciePrefix = "cluster integrity error cie#" 48 49 const notPresentInSmap = ` 50 %s: %s (self) is not present in the local copy of the %s 51 52 ----------------- 53 To troubleshoot: 54 1. first, make sure you are not trying to run two different %s on the same machine 55 2. double check "fspaths" config (used to find ais target's volume metadata and load its node ID) 56 3. if none of the above helps, remove possibly outdated cluster map from the %s (located at %s) 57 4. restart %s 58 -----------------` 59 60 // extra or extended state - currently, target only 61 type htext interface { 62 interruptedRestarted() (bool, bool) 63 } 64 65 type htrun struct { 66 si *meta.Snode 67 keepalive keepaliver 68 statsT stats.Tracker 69 owner struct { 70 smap *smapOwner 71 bmd bmdOwner // an interface with proxy and target impl-s 72 rmd *rmdOwner 73 config *configOwner 74 etl etlOwner // ditto 75 } 76 startup struct { 77 cluster atomic.Int64 // mono.NanoTime() since cluster startup, zero prior to that 78 node atomic.Int64 // ditto - for the node 79 } 80 gmm *memsys.MMSA // system pagesize-based memory manager and slab allocator 81 smm *memsys.MMSA // system MMSA for small-size allocations 82 } 83 84 /////////// 85 // htrun // 86 /////////// 87 88 // interface guard 89 var _ core.Node = (*htrun)(nil) 90 91 func (h *htrun) Snode() *meta.Snode { return h.si } 92 func (h *htrun) callerName() string { return h.si.String() } 93 func (h *htrun) SID() string { return h.si.ID() } 94 func (h *htrun) String() string { return h.si.String() } 95 96 func (h *htrun) Bowner() meta.Bowner { return h.owner.bmd } 97 func (h *htrun) Sowner() meta.Sowner { return h.owner.smap } 98 99 // NOTE: currently, only 'resume' (see also: kaSuspendMsg) 100 func (h *htrun) smapUpdatedCB(_, _ *smapX, nfl, ofl cos.BitFlags) { 101 if ofl.IsAnySet(meta.SnodeMaintDecomm) && !nfl.IsAnySet(meta.SnodeMaintDecomm) { 102 h.keepalive.ctrl(kaResumeMsg) 103 } 104 } 105 106 func (h *htrun) parseReq(w http.ResponseWriter, r *http.Request, apireq *apiRequest) (err error) { 107 debug.Assert(len(apireq.prefix) != 0) 108 apireq.items, err = h.parseURL(w, r, apireq.prefix, apireq.after, false) 109 if err != nil { 110 return 111 } 112 debug.Assert(len(apireq.items) > apireq.bckIdx) 113 bckName := apireq.items[apireq.bckIdx] 114 if apireq.dpq == nil { 115 apireq.query = r.URL.Query() 116 } else if err = apireq.dpq.parse(r.URL.RawQuery); err != nil { 117 return 118 } 119 apireq.bck, err = newBckFromQ(bckName, apireq.query, apireq.dpq) 120 if err != nil { 121 h.writeErr(w, r, err) 122 } 123 return err 124 } 125 126 func (h *htrun) cluMeta(opts cmetaFillOpt) (*cluMeta, error) { 127 cm := &cluMeta{SI: h.si} 128 if voteInProgress() != nil { 129 cm.Flags = cm.Flags.Set(cifl.VoteInProgress) 130 } 131 if !opts.skipConfig { 132 var err error 133 cm.Config, err = h.owner.config.get() 134 if err != nil { 135 return nil, err 136 } 137 } 138 // don't send Smap when it is undergoing changes (and is about to get metasync-ed) 139 smap := h.owner.smap.get() 140 if !opts.skipSmap { 141 cm.Smap = smap 142 } 143 if !opts.skipBMD { 144 cm.BMD = h.owner.bmd.get() 145 } 146 if !opts.skipRMD { 147 cm.RMD = h.owner.rmd.get() 148 } 149 if !opts.skipEtlMD { 150 cm.EtlMD = h.owner.etl.get() 151 } 152 if h.si.IsTarget() && opts.fillRebMarker { 153 rebInterrupted, restarted := opts.htext.interruptedRestarted() 154 if rebInterrupted { 155 cm.Flags = cm.Flags.Set(cifl.RebalanceInterrupted) 156 } 157 if restarted { 158 cm.Flags = cm.Flags.Set(cifl.Restarted) 159 } 160 } 161 if !opts.skipPrimeTime && smap.IsPrimary(h.si) { 162 cm.PrimeTime = time.Now().UnixNano() 163 } 164 return cm, nil 165 } 166 167 // usage: [API call => handler => ClusterStartedWithRetry ] 168 func (h *htrun) cluStartedWithRetry() bool { 169 if clutime := h.startup.cluster.Load(); clutime > 0 { 170 return true 171 } 172 if !h.NodeStarted() { 173 return false 174 } 175 time.Sleep(time.Second) 176 clutime := h.startup.cluster.Load() 177 if clutime == 0 { 178 nlog.ErrorDepth(1, fmt.Sprintf("%s: cluster is starting up", h)) 179 } 180 return clutime > 0 181 } 182 183 func (h *htrun) ClusterStarted() bool { return h.startup.cluster.Load() > 0 } // see also: p.ready() 184 func (h *htrun) markClusterStarted() { h.startup.cluster.Store(mono.NanoTime()) } 185 186 func (h *htrun) NodeStarted() bool { return h.startup.node.Load() > 0 } 187 func (h *htrun) markNodeStarted() { h.startup.node.Store(mono.NanoTime()) } 188 189 func (h *htrun) regNetHandlers(networkHandlers []networkHandler) { 190 var ( 191 path string 192 config = cmn.GCO.Get() 193 ) 194 // common, debug 195 for r, nh := range debug.Handlers() { 196 handlePub(r, nh) 197 } 198 // node type specific 199 for _, nh := range networkHandlers { 200 var reg bool 201 if nh.r[0] == '/' { // absolute path 202 path = nh.r 203 } else { 204 path = cos.JoinWords(apc.Version, nh.r) 205 } 206 debug.Assert(nh.net != 0) 207 if nh.net.isSet(accessNetPublic) { 208 handlePub(path, nh.h) 209 reg = true 210 } 211 if config.HostNet.UseIntraControl && nh.net.isSet(accessNetIntraControl) { 212 handleControl(path, nh.h) 213 reg = true 214 } 215 if config.HostNet.UseIntraData && nh.net.isSet(accessNetIntraData) { 216 handleData(path, nh.h) 217 reg = true 218 } 219 if reg { 220 continue 221 } 222 // none of the above 223 if !config.HostNet.UseIntraControl && !config.HostNet.UseIntraData { 224 // no intra-cluster networks: default to pub net 225 handlePub(path, nh.h) 226 } else if config.HostNet.UseIntraControl && nh.net.isSet(accessNetIntraData) { 227 // (not configured) data defaults to (configured) control 228 handleControl(path, nh.h) 229 } else { 230 debug.Assert(config.HostNet.UseIntraData && nh.net.isSet(accessNetIntraControl)) 231 // (not configured) control defaults to (configured) data 232 handleData(path, nh.h) 233 } 234 } 235 // common Prometheus 236 if h.statsT.IsPrometheus() { 237 nh := networkHandler{r: "/" + apc.Metrics, h: promhttp.Handler().ServeHTTP} 238 path := nh.r // absolute 239 handlePub(path, nh.h) 240 } 241 } 242 243 func (h *htrun) init(config *cmn.Config) { 244 initCtrlClient(config) 245 initDataClient(config) 246 247 tcpbuf := config.Net.L4.SndRcvBufSize 248 if h.si.IsProxy() { 249 tcpbuf = 0 250 } else if tcpbuf == 0 { 251 tcpbuf = cmn.DefaultSendRecvBufferSize // ditto: targets use AIS default when not configured 252 } 253 254 muxers := newMuxers() 255 g.netServ.pub = &netServer{muxers: muxers, sndRcvBufSize: tcpbuf} 256 g.netServ.control = g.netServ.pub // if not separately configured, intra-control net is public 257 if config.HostNet.UseIntraControl { 258 muxers = newMuxers() 259 g.netServ.control = &netServer{muxers: muxers, sndRcvBufSize: 0} 260 } 261 g.netServ.data = g.netServ.control // if not configured, intra-data net is intra-control 262 if config.HostNet.UseIntraData { 263 muxers = newMuxers() 264 g.netServ.data = &netServer{muxers: muxers, sndRcvBufSize: tcpbuf} 265 } 266 267 h.owner.smap = newSmapOwner(config) 268 h.owner.rmd = newRMDOwner(config) 269 h.owner.rmd.load() 270 271 h.gmm = memsys.PageMM() 272 h.gmm.RegWithHK() 273 h.smm = memsys.ByteMM() 274 h.smm.RegWithHK() 275 } 276 277 // steps 1 thru 4 278 func (h *htrun) initSnode(config *cmn.Config) { 279 var ( 280 pubAddr meta.NetInfo 281 pubExtra []meta.NetInfo 282 ctrlAddr meta.NetInfo 283 dataAddr meta.NetInfo 284 port = strconv.Itoa(config.HostNet.Port) 285 proto = config.Net.HTTP.Proto 286 ) 287 addrList, err := getLocalIPv4s(config) 288 if err != nil { 289 cos.ExitLogf("failed to get local IP addr list: %v", err) 290 } 291 292 // 1. pub net 293 pub, extra := multihome(config.HostNet.Hostname) 294 295 if k8s.IsK8s() && config.HostNet.Hostname != "" { 296 // K8s: skip IP addr validation 297 // public hostname could be a load balancer's external IP or a service DNS 298 nlog.Infoln("K8s deployment: skipping hostname validation for", config.HostNet.Hostname) 299 pubAddr.Init(proto, pub, port) 300 } else if err = initNetInfo(&pubAddr, addrList, proto, config.HostNet.Hostname, port); err != nil { 301 cos.ExitLogf("failed to get %s IPv4/hostname: %v", cmn.NetPublic, err) 302 } 303 304 // multi-home (when config.HostNet.Hostname is a comma-separated list) 305 // using the same pub port 306 if l := len(extra); l > 0 { 307 pubExtra = make([]meta.NetInfo, l) 308 for i, addr := range extra { 309 pubExtra[i].Init(proto, addr, port) 310 } 311 } else { 312 nlog.Infof("%s (user) access: %v (%q)", cmn.NetPublic, pubAddr, config.HostNet.Hostname) 313 } 314 315 // 2. intra-cluster 316 ctrlAddr = pubAddr 317 if config.HostNet.UseIntraControl { 318 icport := strconv.Itoa(config.HostNet.PortIntraControl) 319 err = initNetInfo(&ctrlAddr, addrList, proto, config.HostNet.HostnameIntraControl, icport) 320 if err != nil { 321 cos.ExitLogf("failed to get %s IPv4/hostname: %v", cmn.NetIntraControl, err) 322 } 323 var s string 324 if config.HostNet.HostnameIntraControl != "" { 325 s = " (config: " + config.HostNet.HostnameIntraControl + ")" 326 } 327 nlog.Infof("%s access: %v%s", cmn.NetIntraControl, ctrlAddr, s) 328 } 329 dataAddr = pubAddr 330 if config.HostNet.UseIntraData { 331 idport := strconv.Itoa(config.HostNet.PortIntraData) 332 err = initNetInfo(&dataAddr, addrList, proto, config.HostNet.HostnameIntraData, idport) 333 if err != nil { 334 cos.ExitLogf("failed to get %s IPv4/hostname: %v", cmn.NetIntraData, err) 335 } 336 var s string 337 if config.HostNet.HostnameIntraData != "" { 338 s = " (config: " + config.HostNet.HostnameIntraData + ")" 339 } 340 nlog.Infof("%s access: %v%s", cmn.NetIntraData, dataAddr, s) 341 } 342 343 // 3. validate 344 mustDiffer(pubAddr, 345 config.HostNet.Port, 346 true, 347 ctrlAddr, 348 config.HostNet.PortIntraControl, 349 config.HostNet.UseIntraControl, 350 "pub/ctl", 351 ) 352 mustDiffer(pubAddr, 353 config.HostNet.Port, 354 true, 355 dataAddr, 356 config.HostNet.PortIntraData, 357 config.HostNet.UseIntraData, 358 "pub/data", 359 ) 360 mustDiffer(dataAddr, 361 config.HostNet.PortIntraData, 362 config.HostNet.UseIntraData, 363 ctrlAddr, 364 config.HostNet.PortIntraControl, 365 config.HostNet.UseIntraControl, 366 "ctl/data", 367 ) 368 369 // 4. new Snode 370 h.si = &meta.Snode{ 371 PubNet: pubAddr, 372 ControlNet: ctrlAddr, 373 DataNet: dataAddr, 374 } 375 if l := len(pubExtra); l > 0 { 376 h.si.PubExtra = make([]meta.NetInfo, l) 377 copy(h.si.PubExtra, pubExtra) 378 nlog.Infof("%s (multihome) access: %v and %v", cmn.NetPublic, pubAddr, h.si.PubExtra) 379 } 380 } 381 382 func mustDiffer(ip1 meta.NetInfo, port1 int, use1 bool, ip2 meta.NetInfo, port2 int, use2 bool, tag string) { 383 if !use1 || !use2 { 384 return 385 } 386 if ip1.Hostname == ip2.Hostname && port1 == port2 { 387 cos.ExitLogf("%s: cannot use the same IP:port (%s) for two networks", tag, ip1) 388 } 389 } 390 391 // at startup, check this Snode vs locally stored Smap replica (NOTE: some errors are FATAL) 392 func (h *htrun) loadSmap() (smap *smapX, reliable bool) { 393 smap = newSmap() 394 loaded, err := h.owner.smap.load(smap) 395 396 if err != nil { 397 nlog.Errorf("Failed to load cluster map (\"Smap\"): %v - reinitializing", err) 398 return 399 } 400 if !loaded { 401 return // no local replica - joining from scratch 402 } 403 404 node := smap.GetNode(h.SID()) 405 if node == nil { 406 ty := "targets" 407 if h.si.Type() == apc.Proxy { 408 ty = "proxies" 409 } 410 cos.ExitLogf(notPresentInSmap, cmn.BadSmapPrefix, h.si, smap.StringEx(), ty, h.si, h.owner.smap.fpath, h.si) 411 } 412 if node.Type() != h.si.Type() { 413 cos.ExitLogf("%s: %s is %q while the node in the loaded %s is %q", cmn.BadSmapPrefix, 414 h.si, h.si.Type(), smap.StringEx(), node.Type()) 415 return 416 } 417 418 // 419 // NOTE: not enforcing Snode's immutability - in particular, IPs that may change upon restart in K8s 420 // 421 if _, err := smap.IsDupNet(h.si); err != nil { 422 nlog.Warningln(err, "- proceeding with the loaded", smap.String(), "anyway...") 423 } 424 reliable = true 425 return 426 } 427 428 func (h *htrun) setDaemonConfigMsg(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg, query url.Values) { 429 var ( 430 transient = cos.IsParseBool(query.Get(apc.ActTransient)) 431 toUpdate = &cmn.ConfigToSet{} 432 ) 433 if err := cos.MorphMarshal(msg.Value, toUpdate); err != nil { 434 h.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, h, msg.Action, msg.Value, err) 435 return 436 } 437 438 co := h.owner.config 439 co.Lock() 440 err := setConfig(toUpdate, transient) 441 co.Unlock() 442 if err != nil { 443 h.writeErr(w, r, err) 444 } 445 } 446 447 func (h *htrun) setDaemonConfigQuery(w http.ResponseWriter, r *http.Request) { 448 var ( 449 query = r.URL.Query() 450 transient = cos.IsParseBool(query.Get(apc.ActTransient)) 451 toUpdate = &cmn.ConfigToSet{} 452 ) 453 if err := toUpdate.FillFromQuery(query); err != nil { 454 h.writeErr(w, r, err) 455 return 456 } 457 458 co := h.owner.config 459 co.Lock() 460 err := setConfig(toUpdate, transient) 461 co.Unlock() 462 if err != nil { 463 h.writeErr(w, r, err) 464 } 465 } 466 467 func (h *htrun) run(config *cmn.Config) error { 468 var ( 469 tlsConf *tls.Config 470 logger = log.New(&nlogWriter{}, "net/http err: ", 0) // a wrapper to log http.Server errors 471 ) 472 if config.Net.HTTP.UseHTTPS { 473 c, err := newTLS(&config.Net.HTTP) 474 if err != nil { 475 cos.ExitLog(err) 476 } 477 tlsConf = c 478 } 479 if config.HostNet.UseIntraControl { 480 go func() { 481 _ = g.netServ.control.listen(h.si.ControlNet.TCPEndpoint(), logger, tlsConf, config) 482 }() 483 } 484 if config.HostNet.UseIntraData { 485 go func() { 486 _ = g.netServ.data.listen(h.si.DataNet.TCPEndpoint(), logger, tlsConf, config) 487 }() 488 } 489 490 ep := h.si.PubNet.TCPEndpoint() 491 if h.pubAddrAny(config) { 492 ep = ":" + h.si.PubNet.Port 493 } else if len(h.si.PubExtra) > 0 { 494 pubAddr2 := h.si.PubExtra[0] 495 debug.Assert(pubAddr2.Port == h.si.PubNet.Port) 496 g.netServ.pub2 = &netServer{muxers: g.netServ.pub.muxers, sndRcvBufSize: g.netServ.pub.sndRcvBufSize} 497 go func() { 498 _ = g.netServ.pub2.listen(pubAddr2.TCPEndpoint(), logger, tlsConf, config) 499 }() 500 } 501 502 return g.netServ.pub.listen(ep, logger, tlsConf, config) // stay here 503 } 504 505 // return true to start listening on `INADDR_ANY:PubNet.Port` 506 func (h *htrun) pubAddrAny(config *cmn.Config) (inaddrAny bool) { 507 switch { 508 case config.HostNet.UseIntraControl && h.si.ControlNet.Port == h.si.PubNet.Port: 509 case config.HostNet.UseIntraData && h.si.DataNet.Port == h.si.PubNet.Port: 510 default: 511 inaddrAny = true 512 } 513 return inaddrAny 514 } 515 516 // remove self from Smap (if required), terminate http, and wait (w/ timeout) 517 // for running xactions to abort 518 func (h *htrun) stop(wg *sync.WaitGroup, rmFromSmap bool) { 519 const sleep = time.Second >> 1 520 521 if rmFromSmap { 522 h.unregisterSelf(true) 523 } 524 nlog.Infoln("Shutting down HTTP") 525 526 wg.Add(1) 527 go func() { 528 time.Sleep(sleep) 529 shuthttp() 530 wg.Done() 531 }() 532 entry := xreg.GetRunning(xreg.Flt{}) 533 if entry != nil { 534 time.Sleep(sleep) 535 entry = xreg.GetRunning(xreg.Flt{}) 536 if entry != nil { 537 nlog.Warningln("Timed out waiting for", entry.Kind(), "... to stop") 538 } 539 } 540 541 if h.si.IsTarget() { 542 wg.Wait() 543 } 544 } 545 546 // 547 // intra-cluster IPC, control plane 548 // call another target or a proxy; optionally, include a json-encoded body 549 // 550 551 func (h *htrun) _call(si *meta.Snode, bargs *bcastArgs, results *bcastResults) { 552 cargs := allocCargs() 553 { 554 cargs.si = si 555 cargs.req = bargs.req 556 cargs.timeout = bargs.timeout 557 } 558 cargs.req.Base = si.URL(bargs.network) 559 if bargs.req.BodyR != nil { 560 cargs.req.BodyR, _ = bargs.req.BodyR.(cos.ReadOpenCloser).Open() 561 } 562 cargs.cresv = bargs.cresv 563 res := h.call(cargs, bargs.smap) 564 if bargs.async { 565 freeCR(res) // discard right away 566 } else { 567 results.mu.Lock() 568 results.s = append(results.s, res) 569 results.mu.Unlock() 570 } 571 freeCargs(cargs) 572 } 573 574 func (h *htrun) call(args *callArgs, smap *smapX) (res *callResult) { 575 var ( 576 req *http.Request 577 resp *http.Response 578 client *http.Client 579 sid = unknownDaemonID 580 ) 581 res = allocCR() 582 if args.si != nil { 583 sid = args.si.ID() 584 res.si = args.si 585 } 586 587 debug.Assert(args.si != nil || args.req.Base != "") // either si or base 588 if args.req.Base == "" && args.si != nil { 589 args.req.Base = args.si.ControlNet.URL // by default, use intra-cluster control network 590 } 591 592 if args.req.Header == nil { 593 args.req.Header = make(http.Header) 594 } 595 596 switch args.timeout { 597 case apc.DefaultTimeout: 598 req, res.err = args.req.Req() 599 if res.err != nil { 600 break 601 } 602 client = g.client.control 603 case apc.LongTimeout: 604 req, res.err = args.req.Req() 605 if res.err != nil { 606 break 607 } 608 client = g.client.data 609 default: 610 var cancel context.CancelFunc 611 if args.timeout == 0 { 612 args.timeout = cmn.Rom.CplaneOperation() 613 } 614 req, _, cancel, res.err = args.req.ReqWithTimeout(args.timeout) 615 if res.err != nil { 616 break 617 } 618 defer cancel() 619 620 // NOTE: timeout handling 621 // - timeout causes context.deadlineExceededError, i.e. "context deadline exceeded" 622 // - the two knobs are configurable via "client_timeout" and "client_long_timeout", 623 // respectively (client section in the global config) 624 if args.timeout > g.client.control.Timeout { 625 client = g.client.data 626 } else { 627 client = g.client.control 628 } 629 } 630 if res.err != nil { 631 res.details = fmt.Sprintf("FATAL: failed to create HTTP request %s %s: %v", 632 args.req.Method, args.req.URL(), res.err) 633 return 634 } 635 636 req.Header.Set(apc.HdrCallerID, h.SID()) 637 req.Header.Set(apc.HdrCallerName, h.si.Name()) 638 if smap.vstr != "" { 639 if smap.IsPrimary(h.si) { 640 req.Header.Set(apc.HdrCallerIsPrimary, "true") 641 } 642 req.Header.Set(apc.HdrCallerSmapVer, smap.vstr) 643 } 644 req.Header.Set(cos.HdrUserAgent, ua) 645 646 resp, res.err = client.Do(req) 647 if res.err != nil { 648 res.details = "[control-plane]" // tcp level, e.g.: connection refused 649 return 650 } 651 defer resp.Body.Close() 652 res.status = resp.StatusCode 653 res.header = resp.Header 654 655 // err == nil && bad status: resp.Body contains the error message 656 if res.status >= http.StatusBadRequest { 657 if args.req.Method == http.MethodHead { 658 msg := resp.Header.Get(apc.HdrError) 659 res.err = res.herr(req, msg) 660 } else { 661 b := cmn.NewBuffer() 662 b.ReadFrom(resp.Body) 663 res.err = res.herr(req, b.String()) 664 cmn.FreeBuffer(b) 665 } 666 res.details = res.err.Error() 667 return 668 } 669 670 // read and decode via call result value (`cresv`), if provided 671 // othwerwise, read and return bytes for the caller to unmarshal 672 if args.cresv != nil { 673 res.v = args.cresv.newV() 674 args.cresv.read(res, resp.Body) 675 if res.err != nil { 676 return 677 } 678 } else { 679 res.read(resp.Body) 680 if res.err != nil { 681 return 682 } 683 } 684 685 if sid != unknownDaemonID { 686 h.keepalive.heardFrom(sid) 687 } 688 return 689 } 690 691 // 692 // intra-cluster IPC, control plane: notify another node 693 // 694 695 func (h *htrun) notifyTerm(n core.Notif, err error, aborted bool) { 696 h._nfy(n, err, apc.Finished, aborted) 697 } 698 func (h *htrun) notifyProgress(n core.Notif) { h._nfy(n, nil, apc.Progress, false) } 699 700 func (h *htrun) _nfy(n core.Notif, err error, upon string, aborted bool) { 701 var ( 702 smap = h.owner.smap.get() 703 dsts = n.Subscribers() 704 msg = n.ToNotifMsg(aborted) 705 args = allocBcArgs() 706 nodes = args.selected 707 ) 708 debug.Assert(upon == apc.Progress || upon == apc.Finished) 709 if len(dsts) == 1 && dsts[0] == equalIC { 710 for pid, psi := range smap.Pmap { 711 if smap.IsIC(psi) && pid != h.si.ID() && !psi.InMaintOrDecomm() { 712 nodes = append(nodes, psi) 713 } 714 } 715 } else { 716 for _, dst := range dsts { 717 debug.Assert(dst != equalIC) 718 if si := smap.GetActiveNode(dst); si != nil { 719 nodes = append(nodes, si) 720 } else { 721 nlog.Errorln(&errNodeNotFound{"failed to notify", dst, h.si, smap}) 722 } 723 } 724 } 725 if err != nil { 726 msg.ErrMsg = err.Error() 727 msg.AbortedX = aborted 728 } 729 msg.NodeID = h.si.ID() 730 if len(nodes) == 0 { 731 nlog.Errorf("%s: have no nodes to send [%s] notification", h, &msg) 732 return 733 } 734 path := apc.URLPathNotifs.Join(upon) 735 args.req = cmn.HreqArgs{Method: http.MethodPost, Path: path, Body: cos.MustMarshal(&msg)} 736 args.network = cmn.NetIntraControl 737 args.timeout = cmn.Rom.MaxKeepalive() 738 args.selected = nodes 739 args.nodeCount = len(nodes) 740 args.smap = smap 741 args.async = true 742 _ = h.bcastSelected(args) 743 freeBcArgs(args) 744 } 745 746 // 747 // intra-cluster comm 748 // 749 750 // bcastGroup broadcasts a message to a specific group of nodes: targets, proxies, all. 751 func (h *htrun) bcastGroup(args *bcastArgs) sliceResults { 752 if args.smap == nil { 753 args.smap = h.owner.smap.get() 754 } 755 present := args.smap.isPresent(h.si) 756 if args.network == "" { 757 args.network = cmn.NetIntraControl 758 } 759 debug.Assert(cmn.NetworkIsKnown(args.network)) 760 if args.timeout == 0 { 761 args.timeout = cmn.Rom.CplaneOperation() 762 debug.Assert(args.timeout != 0) 763 } 764 765 switch args.to { 766 case core.Targets: 767 args.nodes = []meta.NodeMap{args.smap.Tmap} 768 args.nodeCount = len(args.smap.Tmap) 769 if present && h.si.IsTarget() { 770 args.nodeCount-- 771 } 772 case core.Proxies: 773 args.nodes = []meta.NodeMap{args.smap.Pmap} 774 args.nodeCount = len(args.smap.Pmap) 775 if present && h.si.IsProxy() { 776 args.nodeCount-- 777 } 778 case core.AllNodes: 779 args.nodes = []meta.NodeMap{args.smap.Pmap, args.smap.Tmap} 780 args.nodeCount = len(args.smap.Pmap) + len(args.smap.Tmap) 781 if present { 782 args.nodeCount-- 783 } 784 case core.SelectedNodes: 785 args.nodeCount = len(args.nodes) 786 debug.Assert(args.nodeCount > 0) 787 default: 788 debug.Assert(false, args.to) 789 } 790 return h.bcastNodes(args) 791 } 792 793 // broadcast to the specified destinations (`bargs.nodes`) 794 // (if specified, `bargs.req.BodyR` must implement `cos.ReadOpenCloser`) 795 func (h *htrun) bcastNodes(bargs *bcastArgs) sliceResults { 796 var ( 797 results bcastResults 798 wg = cos.NewLimitedWaitGroup(cmn.MaxParallelism(), bargs.nodeCount) 799 f = func(si *meta.Snode) { h._call(si, bargs, &results); wg.Done() } 800 ) 801 debug.Assert(len(bargs.selected) == 0) 802 if !bargs.async { 803 results.s = allocBcastRes(len(bargs.nodes)) 804 } 805 for _, nodeMap := range bargs.nodes { 806 for _, si := range nodeMap { 807 if si.ID() == h.si.ID() { 808 continue 809 } 810 811 // TODO: remove 812 debug.Func(func() { 813 if si.URL(bargs.network) == h.si.URL(bargs.network) { 814 nlog.Errorf(fmtErrNetInfoChanged, h, si.StringEx(), si.URL(bargs.network)) 815 } 816 }) 817 818 if !bargs.ignoreMaintenance && si.InMaintOrDecomm() { 819 continue 820 } 821 wg.Add(1) 822 go f(si) 823 } 824 } 825 wg.Wait() 826 return results.s 827 } 828 829 func (h *htrun) bcastSelected(bargs *bcastArgs) sliceResults { 830 var ( 831 results bcastResults 832 wg = cos.NewLimitedWaitGroup(cmn.MaxParallelism(), bargs.nodeCount) 833 f = func(si *meta.Snode) { h._call(si, bargs, &results); wg.Done() } 834 ) 835 debug.Assert(len(bargs.selected) > 0) 836 if !bargs.async { 837 results.s = allocBcastRes(len(bargs.selected)) 838 } 839 for _, si := range bargs.selected { 840 debug.Assert(si.ID() != h.si.ID()) 841 wg.Add(1) 842 go f(si) 843 } 844 wg.Wait() 845 return results.s 846 } 847 848 func (h *htrun) bcastAsyncIC(msg *aisMsg) { 849 var ( 850 wg = &sync.WaitGroup{} 851 smap = h.owner.smap.get() 852 args = allocBcArgs() 853 ) 854 args.req = cmn.HreqArgs{Method: http.MethodPost, Path: apc.URLPathIC.S, Body: cos.MustMarshal(msg)} 855 args.network = cmn.NetIntraControl 856 args.timeout = cmn.Rom.MaxKeepalive() 857 for pid, psi := range smap.Pmap { 858 if pid == h.si.ID() || !smap.IsIC(psi) || smap.GetActiveNode(pid) == nil { 859 continue 860 } 861 wg.Add(1) 862 go func(si *meta.Snode) { 863 cargs := allocCargs() 864 { 865 cargs.si = si 866 cargs.req = args.req 867 cargs.timeout = args.timeout 868 } 869 res := h.call(cargs, smap) 870 freeCargs(cargs) 871 freeCR(res) // discard right away 872 wg.Done() 873 }(psi) 874 } 875 wg.Wait() 876 freeBcArgs(args) 877 } 878 879 func (h *htrun) bcastAllNodes(w http.ResponseWriter, r *http.Request, args *bcastArgs) { 880 args.to = core.AllNodes 881 results := h.bcastGroup(args) 882 for _, res := range results { 883 if res.err != nil { 884 h.writeErr(w, r, res.toErr()) 885 break 886 } 887 } 888 freeBcastRes(results) 889 } 890 891 // 892 // parsing helpers 893 // 894 895 // remove validated fields and return the resulting slice 896 func (h *htrun) parseURL(w http.ResponseWriter, r *http.Request, itemsPresent []string, itemsAfter int, splitAfter bool) ([]string, error) { 897 items, err := cmn.ParseURL(r.URL.Path, itemsPresent, itemsAfter, splitAfter) 898 if err != nil { 899 h.writeErr(w, r, err) 900 } 901 return items, err 902 } 903 904 func (h *htrun) writeMsgPack(w http.ResponseWriter, v msgp.Encodable, tag string) (ok bool) { 905 var ( 906 err error 907 buf, slab = h.gmm.AllocSize(cmn.MsgpLsoBufSize) // max size 908 mw = msgp.NewWriterBuf(w, buf) 909 ) 910 w.Header().Set(cos.HdrContentType, cos.ContentMsgPack) 911 if err = v.EncodeMsg(mw); err == nil { 912 err = mw.Flush() 913 } 914 slab.Free(buf) 915 if err == nil { 916 return true 917 } 918 h.logerr(tag, v, err) 919 return false 920 } 921 922 func (h *htrun) writeJSON(w http.ResponseWriter, r *http.Request, v any, tag string) { 923 if err := _writejs(w, r, v); err != nil { 924 h.logerr(tag, v, err) 925 } 926 } 927 928 // same as above with boolean return to facilitate early termination 929 func (h *htrun) writeJS(w http.ResponseWriter, r *http.Request, v any, tag string) bool { 930 if err := _writejs(w, r, v); err != nil { 931 h.logerr(tag, v, err) 932 return false 933 } 934 return true 935 } 936 937 func _writejs(w http.ResponseWriter, r *http.Request, v any) (err error) { 938 w.Header().Set(cos.HdrContentType, cos.ContentJSONCharsetUTF) 939 if isBrowser(r.Header.Get(cos.HdrUserAgent)) { 940 var out []byte 941 if out, err = jsoniter.MarshalIndent(v, "", " "); err == nil { 942 w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(out))) 943 _, err = w.Write(out) 944 } 945 } else { // previously: new-encoder(w).encode(v) (non-browser client) 946 j := cos.JSON.BorrowStream(nil) 947 j.WriteVal(v) 948 j.WriteRaw("\n") 949 if err = j.Error; err == nil { 950 b := j.Buffer() 951 w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(b))) 952 _, err = w.Write(b) 953 954 // NOTE: consider http.NewResponseController(w).Flush() 955 } 956 cos.JSON.ReturnStream(j) 957 } 958 return 959 } 960 961 // See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent 962 // and https://developer.mozilla.org/en-US/docs/Web/HTTP/Browser_detection_using_the_user_agent 963 func isBrowser(userAgent string) bool { 964 return strings.HasPrefix(userAgent, "Mozilla/5.0") 965 } 966 967 func (h *htrun) logerr(tag string, v any, err error) { 968 const maxl = 48 969 var efmt, msg string 970 if nlog.Stopping() { 971 return 972 } 973 if v != nil { 974 efmt = fmt.Sprintf("message: {%+v", v) 975 if len(efmt) > maxl { 976 efmt = efmt[:maxl] + "...}" 977 } else { 978 efmt += "}" 979 } 980 } 981 efmt = tag + " response error: %v, " + efmt + " at " 982 msg = fmt.Sprintf(efmt, err) 983 for i := 1; i < 4; i++ { 984 _, file, line, ok := runtime.Caller(i) 985 if !ok { 986 break 987 } 988 if i > 1 { 989 msg += " <- " 990 } 991 f := filepath.Base(file) 992 msg += fmt.Sprintf("%s:%d", f, line) 993 } 994 if cos.IsErrBrokenPipe(err) { // client went away 995 nlog.Infoln("Warning: " + msg) 996 } else { 997 nlog.Errorln(msg) 998 } 999 h.statsT.IncErr(stats.ErrHTTPWriteCount) 1000 } 1001 1002 func _parseNCopies(value any) (copies int64, err error) { 1003 switch v := value.(type) { 1004 case string: 1005 copies, err = strconv.ParseInt(v, 10, 16) 1006 case float64: 1007 copies = int64(v) 1008 default: 1009 err = fmt.Errorf("failed to parse 'copies' (%v, %T) - unexpected type", value, value) 1010 } 1011 return 1012 } 1013 1014 func _checkAction(msg *apc.ActMsg, expectedActions ...string) (err error) { 1015 found := false 1016 for _, action := range expectedActions { 1017 found = found || msg.Action == action 1018 } 1019 if !found { 1020 err = fmt.Errorf(fmtErrInvaldAction, msg.Action, expectedActions) 1021 } 1022 return 1023 } 1024 1025 // 1026 // common cplane cont-d 1027 // 1028 1029 func (h *htrun) httpdaeget(w http.ResponseWriter, r *http.Request, query url.Values, htext htext) { 1030 var ( 1031 body any 1032 what = query.Get(apc.QparamWhat) 1033 ) 1034 switch what { 1035 case apc.WhatNodeConfig: 1036 var ( 1037 c cmn.Config 1038 config = cmn.GCO.Get() 1039 ) 1040 // hide secret 1041 c = *config 1042 c.Auth.Secret = "**********" 1043 body = &c 1044 case apc.WhatSmap: 1045 body = h.owner.smap.get() 1046 case apc.WhatBMD: 1047 body = h.owner.bmd.get() 1048 case apc.WhatSmapVote: 1049 var err error 1050 body, err = h.cluMeta(cmetaFillOpt{htext: htext, skipPrimeTime: true}) 1051 if err != nil { 1052 nlog.Errorf("failed to fetch cluster config, err: %v", err) 1053 } 1054 case apc.WhatSnode: 1055 body = h.si 1056 case apc.WhatLog: 1057 if cos.IsParseBool(query.Get(apc.QparamAllLogs)) { 1058 tempdir := h.sendAllLogs(w, r, query) 1059 if tempdir != "" { 1060 err := os.RemoveAll(tempdir) 1061 debug.AssertNoErr(err) 1062 } 1063 } else { 1064 h.sendOneLog(w, r, query) 1065 } 1066 return 1067 case apc.WhatNodeStats: 1068 statsNode := h.statsT.GetStats() 1069 statsNode.Snode = h.si 1070 body = statsNode 1071 case apc.WhatNodeStatsV322: 1072 statsNode := h.statsT.GetStatsV322() 1073 statsNode.Snode = h.si 1074 body = statsNode 1075 case apc.WhatMetricNames: 1076 body = h.statsT.GetMetricNames() 1077 case apc.WhatNodeStatsAndStatusV322: 1078 ds := h.statsAndStatusV322() 1079 daeStats := h.statsT.GetStatsV322() 1080 ds.Tracker = daeStats.Tracker 1081 body = ds 1082 default: 1083 h.writeErrf(w, r, "invalid GET /daemon request: unrecognized what=%s", what) 1084 return 1085 } 1086 h.writeJSON(w, r, body, "httpdaeget-"+what) 1087 } 1088 1089 func (h *htrun) statsAndStatus() (ds *stats.NodeStatus) { 1090 smap := h.owner.smap.get() 1091 ds = &stats.NodeStatus{ 1092 Node: stats.Node{ 1093 Snode: h.si, 1094 }, 1095 SmapVersion: smap.Version, 1096 MemCPUInfo: apc.GetMemCPU(), 1097 DeploymentType: deploymentType(), 1098 Version: daemon.version, 1099 BuildTime: daemon.buildTime, 1100 K8sPodName: os.Getenv(env.AIS.K8sPod), 1101 Status: h._status(smap), 1102 } 1103 return ds 1104 } 1105 1106 // [backward compatibility] v3.22 and prior 1107 func (h *htrun) statsAndStatusV322() (ds *stats.NodeStatusV322) { 1108 smap := h.owner.smap.get() 1109 ds = &stats.NodeStatusV322{ 1110 NodeV322: stats.NodeV322{ 1111 Snode: h.si, 1112 }, 1113 SmapVersion: smap.Version, 1114 MemCPUInfo: apc.GetMemCPU(), 1115 DeploymentType: deploymentType(), 1116 Version: daemon.version, 1117 BuildTime: daemon.buildTime, 1118 K8sPodName: os.Getenv(env.AIS.K8sPod), 1119 Status: h._status(smap), 1120 } 1121 return ds 1122 } 1123 1124 func (h *htrun) sendAllLogs(w http.ResponseWriter, r *http.Request, query url.Values) string { 1125 sev := query.Get(apc.QparamLogSev) 1126 tempdir, archname, err := h.targzLogs(sev) 1127 if err != nil { 1128 h.writeErr(w, r, err) 1129 return tempdir 1130 } 1131 fh, err := os.Open(archname) 1132 if err != nil { 1133 h.writeErr(w, r, err) 1134 return tempdir 1135 } 1136 buf, slab := h.gmm.Alloc() 1137 if written, err := io.CopyBuffer(w, fh, buf); err != nil { 1138 nlog.Errorf("failed to read %s: %v (written=%d)", archname, err, written) 1139 } 1140 cos.Close(fh) 1141 slab.Free(buf) 1142 return tempdir 1143 } 1144 1145 func (h *htrun) sendOneLog(w http.ResponseWriter, r *http.Request, query url.Values) { 1146 sev := query.Get(apc.QparamLogSev) 1147 log, err := sev2Logname(sev) 1148 if err != nil { 1149 h.writeErr(w, r, err) 1150 return 1151 } 1152 fh, err := os.Open(log) 1153 if err != nil { 1154 ecode := http.StatusInternalServerError 1155 if os.IsNotExist(err) { 1156 ecode = http.StatusNotFound 1157 } 1158 h.writeErr(w, r, err, ecode) 1159 return 1160 } 1161 soff := query.Get(apc.QparamLogOff) 1162 if soff != "" { 1163 var ( 1164 off int64 1165 err error 1166 finfo os.FileInfo 1167 ) 1168 off, err = strconv.ParseInt(soff, 10, 64) 1169 if err == nil { 1170 finfo, err = os.Stat(log) 1171 if err == nil { 1172 if siz := finfo.Size(); off > siz { 1173 err = fmt.Errorf("log likely rotated (offset %d, size %d)", off, siz) 1174 } 1175 } 1176 } 1177 if err == nil { 1178 _, err = fh.Seek(off, io.SeekStart) 1179 } 1180 if err != nil { 1181 cos.Close(fh) 1182 h.writeErr(w, r, err) 1183 return 1184 } 1185 } 1186 buf, slab := h.gmm.Alloc() 1187 if written, err := io.CopyBuffer(w, fh, buf); err != nil { 1188 // at this point, http err must be already on its way 1189 nlog.Errorf("failed to read %s: %v (written=%d)", log, err, written) 1190 } 1191 cos.Close(fh) 1192 slab.Free(buf) 1193 } 1194 1195 // see also: cli 'log get --all' 1196 func (h *htrun) targzLogs(severity string) (tempdir, archname string, err error) { 1197 var ( 1198 wfh *os.File 1199 dentries []os.DirEntry 1200 logdir = cmn.GCO.Get().LogDir 1201 ) 1202 dentries, err = os.ReadDir(logdir) 1203 if err != nil { 1204 err = fmt.Errorf("read-dir %w", err) 1205 return 1206 } 1207 tempdir = filepath.Join(os.TempDir(), "aislogs-"+h.SID()) 1208 err = cos.CreateDir(tempdir) 1209 if err != nil { 1210 err = fmt.Errorf("create-dir %w", err) 1211 return 1212 } 1213 wfh, err = os.CreateTemp(tempdir, "") 1214 if err != nil { 1215 err = fmt.Errorf("create-temp %w", err) 1216 return 1217 } 1218 archname = wfh.Name() 1219 aw := archive.NewWriter(archive.ExtTarGz, wfh, nil /*checksum*/, nil /*opts*/) 1220 1221 defer func() { 1222 aw.Fini() 1223 wfh.Close() 1224 }() 1225 1226 for _, dent := range dentries { 1227 if !dent.Type().IsRegular() { 1228 continue 1229 } 1230 finfo, errV := dent.Info() 1231 if errV != nil { 1232 continue 1233 } 1234 var ( 1235 fullPath = filepath.Join(logdir, finfo.Name()) 1236 rfh *os.File 1237 ) 1238 if !logname2Sev(fullPath, severity) { 1239 continue 1240 } 1241 rfh, err = os.Open(fullPath) 1242 if err != nil { 1243 if os.IsNotExist(err) { 1244 continue 1245 } 1246 return 1247 } 1248 oah := cos.SimpleOAH{Size: finfo.Size(), Atime: finfo.ModTime().UnixNano()} 1249 err = aw.Write(finfo.Name(), oah, rfh) 1250 rfh.Close() 1251 if err != nil { 1252 return 1253 } 1254 } 1255 return 1256 } 1257 1258 func sev2Logname(severity string) (log string, err error) { 1259 var ( 1260 dir = cmn.GCO.Get().LogDir 1261 sev = apc.LogInfo[0] // default 1262 ) 1263 if severity != "" { 1264 sev = strings.ToLower(severity)[0] 1265 } 1266 switch sev { 1267 case apc.LogInfo[0]: 1268 log = filepath.Join(dir, nlog.InfoLogName()) 1269 case apc.LogWarn[0], apc.LogErr[0]: 1270 log = filepath.Join(dir, nlog.ErrLogName()) 1271 default: 1272 err = fmt.Errorf("unknown log severity %q", severity) 1273 } 1274 return 1275 } 1276 1277 func logname2Sev(fname, severity string) bool { 1278 log, err := sev2Logname(severity) 1279 if err != nil { 1280 nlog.Warningln(err) 1281 return false 1282 } 1283 i := strings.LastIndexByte(log, '.') 1284 if i < 0 { 1285 nlog.Warningf("%q: unexpected log name format", log) 1286 return false 1287 } 1288 return strings.Contains(fname, log[i:]) 1289 } 1290 1291 // 1292 // HTTP err + spec message + code + stats 1293 // 1294 1295 const Silent = 1 1296 1297 func (*htrun) writeErr(w http.ResponseWriter, r *http.Request, err error, ecode ...int) { 1298 cmn.WriteErr(w, r, err, ecode...) // [ecode[, silent]] 1299 } 1300 1301 func (*htrun) writeErrMsg(w http.ResponseWriter, r *http.Request, msg string, ecode ...int) { 1302 cmn.WriteErrMsg(w, r, msg, ecode...) // [ecode[, silent]] 1303 } 1304 1305 func (h *htrun) writeErrSilentf(w http.ResponseWriter, r *http.Request, ecode int, format string, a ...any) { 1306 err := fmt.Errorf(format, a...) 1307 h.writeErr(w, r, err, ecode, Silent) 1308 } 1309 1310 func (h *htrun) writeErrStatusf(w http.ResponseWriter, r *http.Request, ecode int, format string, a ...any) { 1311 err := fmt.Errorf(format, a...) 1312 h.writeErrMsg(w, r, err.Error(), ecode) 1313 } 1314 1315 func (h *htrun) writeErrf(w http.ResponseWriter, r *http.Request, format string, a ...any) { 1316 err := fmt.Errorf(format, a...) 1317 if cos.IsNotExist(err, 0) { 1318 h.writeErrMsg(w, r, err.Error(), http.StatusNotFound) 1319 } else { 1320 h.writeErrMsg(w, r, err.Error()) 1321 } 1322 } 1323 1324 func (h *htrun) writeErrURL(w http.ResponseWriter, r *http.Request) { 1325 if r.URL.Scheme != "" { 1326 h.writeErrf(w, r, "request '%s %s://%s': invalid URL path", r.Method, r.URL.Scheme, r.URL.Path) 1327 return 1328 } 1329 // ignore GET /favicon.ico by Browsers 1330 if r.URL.Path == "/favicon.ico" || r.URL.Path == "favicon.ico" { 1331 return 1332 } 1333 h.writeErrf(w, r, "invalid request URI: '%s %s'", r.Method, r.RequestURI) 1334 } 1335 1336 func (h *htrun) writeErrAct(w http.ResponseWriter, r *http.Request, action string) { 1337 err := cmn.InitErrHTTP(r, fmt.Errorf("invalid action %q", action), 0) 1338 h.writeErr(w, r, err) 1339 cmn.FreeHterr(err) 1340 } 1341 1342 func (h *htrun) writeErrActf(w http.ResponseWriter, r *http.Request, action string, 1343 format string, a ...any) { 1344 detail := fmt.Sprintf(format, a...) 1345 err := cmn.InitErrHTTP(r, fmt.Errorf("invalid action %q: %s", action, detail), 0) 1346 h.writeErr(w, r, err) 1347 cmn.FreeHterr(err) 1348 } 1349 1350 // also, validatePrefix 1351 func (h *htrun) isValidObjname(w http.ResponseWriter, r *http.Request, name string) bool { 1352 if err := cmn.ValidateObjName(name); err != nil { 1353 h.writeErr(w, r, err) 1354 return false 1355 } 1356 return true 1357 } 1358 1359 // health client 1360 func (h *htrun) reqHealth(si *meta.Snode, timeout time.Duration, query url.Values, smap *smapX) (b []byte, status int, err error) { 1361 var ( 1362 path = apc.URLPathHealth.S 1363 url = si.URL(cmn.NetIntraControl) 1364 cargs = allocCargs() 1365 ) 1366 { 1367 cargs.si = si 1368 cargs.req = cmn.HreqArgs{Method: http.MethodGet, Base: url, Path: path, Query: query} 1369 cargs.timeout = timeout 1370 } 1371 res := h.call(cargs, smap) 1372 b, status, err = res.bytes, res.status, res.err 1373 freeCargs(cargs) 1374 freeCR(res) 1375 return 1376 } 1377 1378 // - utilizes reqHealth (above) to discover a _better_ Smap, if exists 1379 // - via getMaxCii.do() 1380 // - checkAll: query all nodes 1381 // - consider adding max-ver BMD bit here as well (TODO) 1382 func (h *htrun) bcastHealth(smap *smapX, checkAll bool) (*cifl.Info, int /*num confirmations*/) { 1383 if !smap.isValid() { 1384 nlog.Errorf("%s: cannot execute with invalid %s", h, smap) 1385 return nil, 0 1386 } 1387 c := getMaxCii{ 1388 h: h, 1389 maxCii: &cifl.Info{}, 1390 query: url.Values{apc.QparamClusterInfo: []string{"true"}}, 1391 timeout: cmn.Rom.CplaneOperation(), 1392 checkAll: checkAll, 1393 } 1394 smap.fill(c.maxCii) 1395 1396 h._bch(&c, smap, apc.Proxy) 1397 if checkAll || (c.cnt < maxVerConfirmations && smap.CountActiveTs() > 0) { 1398 h._bch(&c, smap, apc.Target) 1399 } 1400 nlog.Infoln(h.String()+":", c.maxCii.String()) 1401 return c.maxCii, c.cnt 1402 } 1403 1404 func (h *htrun) _bch(c *getMaxCii, smap *smapX, nodeTy string) { 1405 var ( 1406 wg cos.WG 1407 i, count int 1408 nodemap = smap.Pmap 1409 ) 1410 if nodeTy == apc.Target { 1411 nodemap = smap.Tmap 1412 } 1413 if c.checkAll { 1414 wg = cos.NewLimitedWaitGroup(cmn.MaxParallelism(), len(nodemap)) 1415 } else { 1416 count = min(cmn.MaxParallelism(), maxVerConfirmations<<1) 1417 wg = cos.NewLimitedWaitGroup(count, len(nodemap) /*have*/) 1418 } 1419 for sid, si := range nodemap { 1420 if sid == h.si.ID() { 1421 continue 1422 } 1423 if si.InMaintOrDecomm() { 1424 continue 1425 } 1426 if count > 0 && count < len(nodemap) && i > count { 1427 if c.haveEnough() { 1428 break 1429 } 1430 } 1431 wg.Add(1) 1432 i++ 1433 go c.do(si, wg, smap) 1434 } 1435 wg.Wait() 1436 } 1437 1438 // 1439 // metasync Rx 1440 // 1441 1442 func logmsync(ver int64, revs revs, msg *aisMsg, opts ...string) { 1443 const tag = "msync Rx:" 1444 var ( 1445 what string 1446 caller = opts[0] 1447 lv = strconv.FormatInt(ver, 10) 1448 ) 1449 if len(opts) == 1 { 1450 what = revs.String() 1451 } else { 1452 what = opts[1] 1453 } 1454 switch { 1455 case ver == revs.version(): 1456 nlog.InfoDepth(1, tag, what, "(same v"+lv+",", msg.String(), "<--", caller+")") 1457 case ver > revs.version(): 1458 nlog.InfoDepth(1, "Warning", tag, what, "(down from v"+lv+",", msg.String(), "<--", caller+")") 1459 default: 1460 nlog.InfoDepth(1, tag, "new", what, "(have v"+lv+",", msg.String(), "<--", caller+")") 1461 } 1462 } 1463 1464 func (h *htrun) extractConfig(payload msPayload, caller string) (newConfig *globalConfig, msg *aisMsg, err error) { 1465 if _, ok := payload[revsConfTag]; !ok { 1466 return 1467 } 1468 newConfig, msg = &globalConfig{}, &aisMsg{} 1469 confValue := payload[revsConfTag] 1470 reader := bytes.NewBuffer(confValue) 1471 if _, err1 := jsp.Decode(io.NopCloser(reader), newConfig, newConfig.JspOpts(), "extractConfig"); err1 != nil { 1472 err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "new Config", cos.BHead(confValue), err1) 1473 return 1474 } 1475 if msgValue, ok := payload[revsConfTag+revsActionTag]; ok { 1476 if err1 := jsoniter.Unmarshal(msgValue, msg); err1 != nil { 1477 err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "action message", cos.BHead(msgValue), err1) 1478 return 1479 } 1480 } 1481 config := cmn.GCO.Get() 1482 if cmn.Rom.FastV(4, cos.SmoduleAIS) { 1483 logmsync(config.Version, newConfig, msg, caller) 1484 } 1485 if newConfig.version() <= config.Version { 1486 if newConfig.version() < config.Version { 1487 err = newErrDowngrade(h.si, config.String(), newConfig.String()) 1488 } 1489 newConfig = nil 1490 } 1491 return 1492 } 1493 1494 func (h *htrun) extractEtlMD(payload msPayload, caller string) (newMD *etlMD, msg *aisMsg, err error) { 1495 if _, ok := payload[revsEtlMDTag]; !ok { 1496 return 1497 } 1498 newMD, msg = newEtlMD(), &aisMsg{} 1499 etlMDValue := payload[revsEtlMDTag] 1500 reader := bytes.NewBuffer(etlMDValue) 1501 if _, err1 := jsp.Decode(io.NopCloser(reader), newMD, newMD.JspOpts(), "extractEtlMD"); err1 != nil { 1502 err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "new EtlMD", cos.BHead(etlMDValue), err1) 1503 return 1504 } 1505 if msgValue, ok := payload[revsEtlMDTag+revsActionTag]; ok { 1506 if err1 := jsoniter.Unmarshal(msgValue, msg); err1 != nil { 1507 err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "action message", cos.BHead(msgValue), err1) 1508 return 1509 } 1510 } 1511 etlMD := h.owner.etl.get() 1512 if cmn.Rom.FastV(4, cos.SmoduleAIS) { 1513 logmsync(etlMD.Version, newMD, msg, caller) 1514 } 1515 if newMD.version() <= etlMD.version() { 1516 if newMD.version() < etlMD.version() { 1517 err = newErrDowngrade(h.si, etlMD.String(), newMD.String()) 1518 } 1519 newMD = nil 1520 } 1521 return 1522 } 1523 1524 func (h *htrun) extractSmap(payload msPayload, caller string, skipValidation bool) (newSmap *smapX, msg *aisMsg, err error) { 1525 if _, ok := payload[revsSmapTag]; !ok { 1526 return 1527 } 1528 newSmap, msg = &smapX{}, &aisMsg{} 1529 smapValue := payload[revsSmapTag] 1530 reader := bytes.NewBuffer(smapValue) 1531 if _, err1 := jsp.Decode(io.NopCloser(reader), newSmap, newSmap.JspOpts(), "extractSmap"); err1 != nil { 1532 err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "new Smap", cos.BHead(smapValue), err1) 1533 return 1534 } 1535 if msgValue, ok := payload[revsSmapTag+revsActionTag]; ok { 1536 if err1 := jsoniter.Unmarshal(msgValue, msg); err1 != nil { 1537 err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "action message", cos.BHead(msgValue), err1) 1538 return 1539 } 1540 } 1541 if skipValidation { 1542 return 1543 } 1544 1545 var ( 1546 smap = h.owner.smap.get() 1547 curVer = smap.version() 1548 isManualReb = msg.Action == apc.ActRebalance && msg.Value != nil 1549 ) 1550 if newSmap.version() == curVer && !isManualReb { 1551 newSmap = nil 1552 return 1553 } 1554 if !newSmap.isValid() { 1555 err = cmn.NewErrFailedTo(h, "extract", newSmap, newSmap.validate()) 1556 return 1557 } 1558 if !newSmap.isPresent(h.si) { 1559 err = fmt.Errorf("%s: not finding ourselves in %s", h, newSmap) 1560 return 1561 } 1562 if err = smap.validateUUID(h.si, newSmap, caller, 50 /* ciError */); err != nil { 1563 return // FATAL: cluster integrity error 1564 } 1565 if cmn.Rom.FastV(4, cos.SmoduleAIS) { 1566 logmsync(smap.Version, newSmap, msg, caller) 1567 } 1568 _, sameOrigin, _, eq := smap.Compare(&newSmap.Smap) 1569 debug.Assert(sameOrigin) 1570 if newSmap.version() < curVer { 1571 if !eq { 1572 err = newErrDowngrade(h.si, smap.StringEx(), newSmap.StringEx()) 1573 return 1574 } 1575 nlog.Warningf("%s: %s and %s are otherwise identical", h.si, newSmap.StringEx(), smap.StringEx()) 1576 newSmap = nil 1577 } 1578 return 1579 } 1580 1581 func (h *htrun) extractRMD(payload msPayload, caller string) (newRMD *rebMD, msg *aisMsg, err error) { 1582 if _, ok := payload[revsRMDTag]; !ok { 1583 return 1584 } 1585 newRMD, msg = &rebMD{}, &aisMsg{} 1586 rmdValue := payload[revsRMDTag] 1587 if err1 := jsoniter.Unmarshal(rmdValue, newRMD); err1 != nil { 1588 err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "new RMD", cos.BHead(rmdValue), err1) 1589 return 1590 } 1591 if msgValue, ok := payload[revsRMDTag+revsActionTag]; ok { 1592 if err1 := jsoniter.Unmarshal(msgValue, msg); err1 != nil { 1593 err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "action message", cos.BHead(msgValue), err1) 1594 return 1595 } 1596 } 1597 1598 rmd := h.owner.rmd.get() 1599 if newRMD.CluID != "" && newRMD.CluID != rmd.CluID && rmd.CluID != "" { 1600 logmsync(rmd.Version, newRMD, msg, caller) 1601 err = h.owner.rmd.newClusterIntegrityErr(h.String(), newRMD.CluID, rmd.CluID, rmd.Version) 1602 cos.ExitLog(err) // FATAL 1603 } 1604 1605 if cmn.Rom.FastV(4, cos.SmoduleAIS) { 1606 logmsync(rmd.Version, newRMD, msg, caller) 1607 } 1608 if newRMD.version() <= rmd.version() { 1609 if newRMD.version() < rmd.version() { 1610 err = newErrDowngrade(h.si, rmd.String(), newRMD.String()) 1611 } 1612 newRMD = nil 1613 } 1614 return 1615 } 1616 1617 func (h *htrun) extractBMD(payload msPayload, caller string) (newBMD *bucketMD, msg *aisMsg, err error) { 1618 if _, ok := payload[revsBMDTag]; !ok { 1619 return 1620 } 1621 newBMD, msg = &bucketMD{}, &aisMsg{} 1622 bmdValue := payload[revsBMDTag] 1623 reader := bytes.NewBuffer(bmdValue) 1624 if _, err1 := jsp.Decode(io.NopCloser(reader), newBMD, newBMD.JspOpts(), "extractBMD"); err1 != nil { 1625 err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "new BMD", cos.BHead(bmdValue), err1) 1626 return 1627 } 1628 if msgValue, ok := payload[revsBMDTag+revsActionTag]; ok { 1629 if err1 := jsoniter.Unmarshal(msgValue, msg); err1 != nil { 1630 err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "action message", cos.BHead(msgValue), err1) 1631 return 1632 } 1633 } 1634 bmd := h.owner.bmd.get() 1635 if cmn.Rom.FastV(4, cos.SmoduleAIS) { 1636 logmsync(bmd.Version, newBMD, msg, caller) 1637 } 1638 // skip older iff not transactional - see t.receiveBMD() 1639 if h.si.IsTarget() && msg.UUID != "" { 1640 return 1641 } 1642 if newBMD.version() <= bmd.version() { 1643 if newBMD.version() < bmd.version() { 1644 err = newErrDowngrade(h.si, bmd.StringEx(), newBMD.StringEx()) 1645 } 1646 newBMD = nil 1647 } 1648 return 1649 } 1650 1651 func (h *htrun) receiveSmap(newSmap *smapX, msg *aisMsg, payload msPayload, caller string, cb smapUpdatedCB) error { 1652 if newSmap == nil { 1653 return nil 1654 } 1655 smap := h.owner.smap.get() 1656 logmsync(smap.Version, newSmap, msg, caller, newSmap.StringEx()) 1657 1658 if !newSmap.isPresent(h.si) { 1659 return fmt.Errorf("%s: not finding self in the new %s", h, newSmap) 1660 } 1661 return h.owner.smap.synchronize(h.si, newSmap, payload, cb) 1662 } 1663 1664 func (h *htrun) receiveEtlMD(newEtlMD *etlMD, msg *aisMsg, payload msPayload, caller string, cb func(ne, oe *etlMD)) (err error) { 1665 if newEtlMD == nil { 1666 return 1667 } 1668 etlMD := h.owner.etl.get() 1669 logmsync(etlMD.Version, newEtlMD, msg, caller) 1670 1671 h.owner.etl.Lock() 1672 etlMD = h.owner.etl.get() 1673 if newEtlMD.version() <= etlMD.version() { 1674 h.owner.etl.Unlock() 1675 if newEtlMD.version() < etlMD.version() { 1676 err = newErrDowngrade(h.si, etlMD.String(), newEtlMD.String()) 1677 } 1678 return 1679 } 1680 err = h.owner.etl.putPersist(newEtlMD, payload) 1681 h.owner.etl.Unlock() 1682 debug.AssertNoErr(err) 1683 1684 if cb != nil { 1685 cb(newEtlMD, etlMD) 1686 } 1687 return 1688 } 1689 1690 // under lock 1691 func (h *htrun) _recvCfg(newConfig *globalConfig, payload msPayload) (err error) { 1692 config := cmn.GCO.Get() 1693 if newConfig.version() <= config.Version { 1694 if newConfig.version() == config.Version { 1695 return 1696 } 1697 return newErrDowngrade(h.si, config.String(), newConfig.String()) 1698 } 1699 if err = h.owner.config.persist(newConfig, payload); err != nil { 1700 return 1701 } 1702 if err = cmn.GCO.Update(&newConfig.ClusterConfig); err != nil { 1703 return 1704 } 1705 return 1706 } 1707 1708 func (h *htrun) extractRevokedTokenList(payload msPayload, caller string) (*tokenList, error) { 1709 var ( 1710 msg aisMsg 1711 bytes, ok = payload[revsTokenTag] 1712 ) 1713 if !ok { 1714 return nil, nil 1715 } 1716 if msgValue, ok := payload[revsTokenTag+revsActionTag]; ok { 1717 if err := jsoniter.Unmarshal(msgValue, &msg); err != nil { 1718 err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "action message", cos.BHead(msgValue), err) 1719 return nil, err 1720 } 1721 } 1722 tokenList := &tokenList{} 1723 if err := jsoniter.Unmarshal(bytes, tokenList); err != nil { 1724 err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "blocked token list", cos.BHead(bytes), err) 1725 return nil, err 1726 } 1727 nlog.Infof("extract token list from %q (count: %d, action: %q, uuid: %q)", caller, 1728 len(tokenList.Tokens), msg.Action, msg.UUID) 1729 return tokenList, nil 1730 } 1731 1732 // ================================== Background ========================================= 1733 // 1734 // Generally, AIStore clusters can be deployed with an arbitrary numbers of proxies. 1735 // Each proxy/gateway provides full access to the clustered objects and collaborates with 1736 // all other proxies to perform majority-voted HA failovers. 1737 // 1738 // Not all proxies are equal though. 1739 // 1740 // Two out of all proxies can be designated via configuration as "original" and 1741 // "discovery." The "original" (located at the configurable "original_url") is expected 1742 // to be the primary at cluster (initial) deployment time. 1743 // 1744 // Later on, when and if some HA event triggers an automated failover, the role of the 1745 // primary may be (automatically) assumed by a different proxy/gateway, with the 1746 // corresponding update getting synchronized across all running nodes. 1747 // A new node, however, could potentially experience a problem when trying to join the 1748 // cluster simply because its configuration would still be referring to the old primary. 1749 // The added "discovery_url" is precisely intended to address this scenario. 1750 // 1751 // Here's how a node joins a AIStore cluster: 1752 // - first, there's the primary proxy/gateway referenced by the current cluster map 1753 // or - during the cluster deployment time - by the the configured "primary_url" 1754 // (see /deploy/dev/local/aisnode_config.sh) 1755 // - if that one fails, the new node goes ahead and tries the alternatives: 1756 // - config.Proxy.PrimaryURL ("primary_url") 1757 // - config.Proxy.DiscoveryURL ("discovery_url") 1758 // - config.Proxy.OriginalURL ("original_url") 1759 // - if these fails we try the candidates provided by the caller. 1760 // 1761 // ================================== Background ========================================= 1762 func (h *htrun) join(query url.Values, htext htext, contactURLs ...string) (res *callResult) { 1763 var ( 1764 config = cmn.GCO.Get() 1765 candidates = make([]string, 0, 4+len(contactURLs)) 1766 selfPublicURL, pubValid = cos.ParseURL(h.si.URL(cmn.NetPublic)) 1767 selfIntraURL, intraValid = cos.ParseURL(h.si.URL(cmn.NetIntraControl)) 1768 resPrev *callResult 1769 ) 1770 debug.Assert(pubValid && intraValid) 1771 1772 // env goes first 1773 if daemon.EP != "" { 1774 candidates = _addCan(daemon.EP, selfPublicURL.Host, selfIntraURL.Host, candidates) 1775 } 1776 primaryURL, psi := h.getPrimaryURLAndSI(nil, config) 1777 candidates = _addCan(primaryURL, selfPublicURL.Host, selfIntraURL.Host, candidates) 1778 if psi != nil { 1779 candidates = _addCan(psi.URL(cmn.NetPublic), selfPublicURL.Host, selfIntraURL.Host, candidates) 1780 } 1781 candidates = _addCan(config.Proxy.PrimaryURL, selfPublicURL.Host, selfIntraURL.Host, candidates) 1782 candidates = _addCan(config.Proxy.DiscoveryURL, selfPublicURL.Host, selfIntraURL.Host, candidates) 1783 candidates = _addCan(config.Proxy.OriginalURL, selfPublicURL.Host, selfIntraURL.Host, candidates) 1784 for _, u := range contactURLs { 1785 candidates = _addCan(u, selfPublicURL.Host, selfIntraURL.Host, candidates) 1786 } 1787 1788 sleep := max(2*time.Second, cmn.Rom.MaxKeepalive()) 1789 for range 4 { // retry 1790 for _, candidateURL := range candidates { 1791 if nlog.Stopping() { 1792 return 1793 } 1794 if resPrev != nil { 1795 freeCR(resPrev) 1796 resPrev = nil //nolint:ineffassign // readability 1797 } 1798 res = h.regTo(candidateURL, nil, apc.DefaultTimeout, query, htext, false /*keepalive*/) 1799 if res.err == nil { 1800 nlog.Infoln(h.String()+": primary responded Ok via", candidateURL) 1801 return // ok 1802 } 1803 resPrev = res 1804 } 1805 time.Sleep(sleep) 1806 } 1807 if resPrev != nil { 1808 freeCR(resPrev) 1809 } 1810 1811 smap := h.owner.smap.get() 1812 if smap.validate() != nil { 1813 return 1814 } 1815 1816 // Failed to join cluster using config, try getting primary URL using existing smap. 1817 cii, _ := h.bcastHealth(smap, false /*checkAll*/) 1818 if cii == nil || cii.Smap.Version < smap.version() { 1819 return 1820 } 1821 primaryURL = cii.Smap.Primary.PubURL 1822 1823 // Daemon is stopping skip register 1824 if nlog.Stopping() { 1825 return 1826 } 1827 res = h.regTo(primaryURL, nil, apc.DefaultTimeout, query, htext, false /*keepalive*/) 1828 if res.err == nil { 1829 nlog.Infoln(h.String()+": joined cluster via", primaryURL) 1830 } 1831 return 1832 } 1833 1834 func _addCan(url, selfPub, selfCtrl string, candidates []string) []string { 1835 if u, valid := cos.ParseURL(url); !valid || u.Host == selfPub || u.Host == selfCtrl { 1836 return candidates 1837 } 1838 if cos.StringInSlice(url, candidates) { 1839 return candidates 1840 } 1841 return append(candidates, url) 1842 } 1843 1844 func (h *htrun) regTo(url string, psi *meta.Snode, tout time.Duration, q url.Values, htext htext, keepalive bool) *callResult { 1845 var ( 1846 path string 1847 skipPrxKalive = h.si.IsProxy() || keepalive 1848 opts = cmetaFillOpt{ 1849 htext: htext, 1850 skipSmap: skipPrxKalive, 1851 skipBMD: skipPrxKalive, 1852 skipRMD: keepalive, 1853 skipConfig: keepalive, 1854 skipEtlMD: keepalive, 1855 fillRebMarker: !keepalive, 1856 skipPrimeTime: true, 1857 } 1858 ) 1859 cm, err := h.cluMeta(opts) 1860 if err != nil { 1861 res := allocCR() 1862 res.err = err 1863 return res 1864 } 1865 1866 if keepalive { 1867 path = apc.URLPathCluKalive.S 1868 } else { 1869 path = apc.URLPathCluAutoReg.S 1870 } 1871 cargs := allocCargs() 1872 { 1873 cargs.si = psi 1874 cargs.req = cmn.HreqArgs{Method: http.MethodPost, Base: url, Path: path, Query: q, Body: cos.MustMarshal(cm)} 1875 cargs.timeout = tout 1876 } 1877 smap := cm.Smap 1878 if smap == nil { 1879 smap = h.owner.smap.get() 1880 } 1881 res := h.call(cargs, smap) 1882 freeCargs(cargs) 1883 return res 1884 } 1885 1886 func (h *htrun) sendKalive(smap *smapX, htext htext, timeout time.Duration, fast bool) (pid string, status int, err error) { 1887 if nlog.Stopping() { 1888 err = errors.New(h.String() + " is stopping") 1889 return 1890 } 1891 primaryURL, psi := h.getPrimaryURLAndSI(smap, nil) 1892 pid = psi.ID() 1893 1894 if fast { 1895 // fast path 1896 debug.Assert(h.ClusterStarted()) 1897 path := apc.URLPathCluKalive.Join(h.SID()) 1898 cargs := allocCargs() 1899 { 1900 cargs.si = psi 1901 cargs.req = cmn.HreqArgs{Method: http.MethodPost, Base: primaryURL, Path: path} 1902 cargs.timeout = timeout 1903 } 1904 res := h.call(cargs, smap) 1905 freeCargs(cargs) 1906 err = res.err 1907 freeCR(res) 1908 return 1909 } 1910 1911 // slow path 1912 res := h.regTo(primaryURL, psi, timeout, nil, htext, true /*keepalive*/) 1913 if res.err != nil { 1914 if strings.Contains(res.err.Error(), ciePrefix) { 1915 cos.ExitLog(res.err) // FATAL: cluster integrity error (cie) 1916 } 1917 status, err = res.status, res.err 1918 freeCR(res) 1919 return 1920 } 1921 freeCR(res) 1922 return 1923 } 1924 1925 func (h *htrun) getPrimaryURLAndSI(smap *smapX, config *cmn.Config) (string, *meta.Snode) { 1926 if smap == nil { 1927 smap = h.owner.smap.get() 1928 } 1929 if smap.validate() != nil { 1930 if config == nil { 1931 config = cmn.GCO.Get() 1932 } 1933 return config.Proxy.PrimaryURL, nil 1934 } 1935 return smap.Primary.URL(cmn.NetIntraControl), smap.Primary 1936 } 1937 1938 func (h *htrun) pollClusterStarted(config *cmn.Config, psi *meta.Snode) (maxCii *cifl.Info) { 1939 var ( 1940 sleep, total, rediscover time.Duration 1941 healthTimeout = config.Timeout.CplaneOperation.D() 1942 query = url.Values{apc.QparamAskPrimary: []string{"true"}} 1943 ) 1944 for { 1945 sleep = min(cmn.Rom.MaxKeepalive(), sleep+time.Second) 1946 time.Sleep(sleep) 1947 total += sleep 1948 rediscover += sleep 1949 if nlog.Stopping() { 1950 return 1951 } 1952 smap := h.owner.smap.get() 1953 if smap.validate() != nil { 1954 continue 1955 } 1956 if h.si.IsProxy() && smap.isPrimary(h.si) { // TODO: unlikely - see httpRequestNewPrimary 1957 nlog.Warningln(h.String(), "started as a non-primary and got _elected_ during startup") 1958 return 1959 } 1960 if _, _, err := h.reqHealth(smap.Primary, healthTimeout, query /*ask primary*/, smap); err == nil { 1961 // log 1962 s := fmt.Sprintf("%s via primary health: cluster startup Ok, %s", h.si, smap.StringEx()) 1963 if self := smap.GetNode(h.si.ID()); self == nil { 1964 nlog.Warningln(s + "; NOTE: not present in the cluster map") 1965 } else if self.Flags.IsSet(meta.SnodeMaint) { 1966 h.si.Flags = self.Flags 1967 nlog.Warningln(s + "; NOTE: starting in maintenance mode") 1968 } else if rmd := h.owner.rmd.get(); rmd != nil && rmd.version() > 0 { 1969 if smap.UUID != rmd.CluID { 1970 if rmd.CluID != "" { 1971 err = h.owner.rmd.newClusterIntegrityErr(h.String(), smap.UUID, rmd.CluID, rmd.version()) 1972 cos.ExitLog(err) // FATAL 1973 } 1974 1975 nlog.Warningf("local copy of RMD v%d does not have cluster ID (expecting %q)", 1976 rmd.version(), smap.UUID) 1977 nlog.Infoln(s) 1978 } else { 1979 nlog.Infoln(s+",", rmd.String()) 1980 } 1981 } else { 1982 nlog.Infoln(s) 1983 } 1984 return 1985 } 1986 1987 if rediscover >= config.Timeout.Startup.D()/2 { 1988 rediscover = 0 1989 if cii, cnt := h.bcastHealth(smap, true /*checkAll*/); cii != nil && cii.Smap.Version > smap.version() { 1990 var pid string 1991 if psi != nil { 1992 pid = psi.ID() 1993 } 1994 if cii.Smap.Primary.ID != pid && cnt >= maxVerConfirmations { 1995 nlog.Warningf("%s: change of primary %s => %s - must rejoin", h.si, pid, cii.Smap.Primary.ID) 1996 maxCii = cii 1997 return 1998 } 1999 } 2000 } 2001 if total > config.Timeout.Startup.D() { 2002 nlog.Errorln(h.String() + ": " + cmn.StartupMayTimeout) 2003 } 2004 } 2005 } 2006 2007 func (h *htrun) unregisterSelf(ignoreErr bool) (err error) { 2008 var status int 2009 smap := h.owner.smap.get() 2010 if smap == nil || smap.validate() != nil { 2011 return 2012 } 2013 cargs := allocCargs() 2014 { 2015 cargs.si = smap.Primary 2016 cargs.req = cmn.HreqArgs{Method: http.MethodDelete, Path: apc.URLPathCluDaemon.Join(h.si.ID())} 2017 cargs.timeout = apc.DefaultTimeout 2018 } 2019 res := h.call(cargs, smap) 2020 status, err = res.status, res.err 2021 if err != nil { 2022 f := nlog.Errorf 2023 if ignoreErr { 2024 f = nlog.Infof 2025 } 2026 f("%s: failed to unreg self, err: %v(%d)", h.si, err, status) 2027 } 2028 freeCargs(cargs) 2029 freeCR(res) 2030 return 2031 } 2032 2033 // via /health handler 2034 func (h *htrun) externalWD(w http.ResponseWriter, r *http.Request) (responded bool) { 2035 callerID := r.Header.Get(apc.HdrCallerID) 2036 caller := r.Header.Get(apc.HdrCallerName) 2037 // external call 2038 if callerID == "" && caller == "" { 2039 readiness := cos.IsParseBool(r.URL.Query().Get(apc.QparamHealthReadiness)) 2040 if cmn.Rom.FastV(5, cos.SmoduleAIS) { 2041 nlog.Infof("%s: external health-ping from %s (readiness=%t)", h.si, r.RemoteAddr, readiness) 2042 } 2043 // respond with 503 as per https://tools.ietf.org/html/rfc7231#section-6.6.4 2044 // see also: 2045 // * https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes 2046 if !readiness && !h.ClusterStarted() { 2047 w.WriteHeader(http.StatusServiceUnavailable) 2048 } 2049 // NOTE: for "readiness" check always return true; otherwise, true if cluster started 2050 return true 2051 } 2052 // intra-cluster health ping 2053 if !h.ensureIntraControl(w, r, false /* from primary */) { 2054 responded = true 2055 } 2056 return 2057 } 2058 2059 // 2060 // intra-cluster request validations and helpers 2061 // 2062 2063 func (h *htrun) isIntraCall(hdr http.Header, fromPrimary bool) (err error) { 2064 debug.Assert(hdr != nil) 2065 var ( 2066 smap = h.owner.smap.get() 2067 callerID = hdr.Get(apc.HdrCallerID) 2068 callerName = hdr.Get(apc.HdrCallerName) 2069 callerSver = hdr.Get(apc.HdrCallerSmapVer) 2070 callerVer int64 2071 erP error 2072 ) 2073 if ok := callerID != "" && callerName != ""; !ok { 2074 return fmt.Errorf("%s: expected %s request", h, cmn.NetIntraControl) 2075 } 2076 if !smap.isValid() { 2077 return 2078 } 2079 caller := smap.GetNode(callerID) 2080 if ok := caller != nil && (!fromPrimary || smap.isPrimary(caller)); ok { 2081 return 2082 } 2083 if callerSver != smap.vstr && callerSver != "" { 2084 callerVer, erP = strconv.ParseInt(callerSver, 10, 64) 2085 if erP != nil { 2086 debug.AssertNoErr(erP) 2087 nlog.Errorln(erP) 2088 return 2089 } 2090 // we still trust the request when the sender's Smap is more current 2091 if callerVer > smap.version() { 2092 if h.ClusterStarted() { 2093 nlog.Errorf("%s: %s < Smap(v%s) from %s - proceeding anyway...", h, smap, callerSver, callerName) 2094 } 2095 runtime.Gosched() 2096 return 2097 } 2098 } 2099 if caller == nil { 2100 if !fromPrimary { 2101 // assume request from a newly joined node and proceed 2102 return nil 2103 } 2104 return fmt.Errorf("%s: expected %s from a valid node, %s", h, cmn.NetIntraControl, smap) 2105 } 2106 return fmt.Errorf("%s: expected %s from primary (and not %s), %s", h, cmn.NetIntraControl, caller, smap) 2107 } 2108 2109 func (h *htrun) ensureIntraControl(w http.ResponseWriter, r *http.Request, onlyPrimary bool) (isIntra bool) { 2110 err := h.isIntraCall(r.Header, onlyPrimary) 2111 if err != nil { 2112 h.writeErr(w, r, err) 2113 return 2114 } 2115 if !cmn.GCO.Get().HostNet.UseIntraControl { 2116 return true // intra-control == pub 2117 } 2118 // NOTE: not checking r.RemoteAddr 2119 intraAddr := h.si.ControlNet.TCPEndpoint() 2120 srvAddr := r.Context().Value(http.ServerContextKey).(*http.Server).Addr 2121 if srvAddr == intraAddr { 2122 return true 2123 } 2124 h.writeErrf(w, r, "%s: expected %s request", h, cmn.NetIntraControl) 2125 return 2126 } 2127 2128 func (h *htrun) uptime2hdr(hdr http.Header) { 2129 now := mono.NanoTime() 2130 hdr.Set(apc.HdrNodeUptime, strconv.FormatInt(now-h.startup.node.Load(), 10)) 2131 hdr.Set(apc.HdrClusterUptime, strconv.FormatInt(now-h.startup.cluster.Load(), 10)) 2132 } 2133 2134 // NOTE: not checking vs Smap (yet) 2135 func isT2TPut(hdr http.Header) bool { return hdr != nil && hdr.Get(apc.HdrT2TPutterID) != "" } 2136 2137 func isRedirect(q url.Values) (ptime string) { 2138 if len(q) == 0 || q.Get(apc.QparamProxyID) == "" { 2139 return 2140 } 2141 return q.Get(apc.QparamUnixTime) 2142 } 2143 2144 func ptLatency(tts int64, ptime, isPrimary string) (dur int64) { 2145 pts, err := cos.S2UnixNano(ptime) 2146 if err != nil { 2147 debug.AssertNoErr(err) 2148 return 2149 } 2150 if ok, _ := cos.ParseBool(isPrimary); ok { 2151 xreg.PrimeTime.Store(pts) 2152 xreg.MyTime.Store(tts) 2153 } 2154 dur = tts - pts 2155 if dur < 0 && -dur < int64(clusterClockDrift) { 2156 dur = 0 2157 } 2158 return 2159 } 2160 2161 // 2162 // aisMsg reader & constructors 2163 // 2164 2165 func (*htrun) readAisMsg(w http.ResponseWriter, r *http.Request) (msg *aisMsg, err error) { 2166 msg = &aisMsg{} 2167 err = cmn.ReadJSON(w, r, msg) 2168 return 2169 } 2170 2171 func (msg *aisMsg) String() string { 2172 s := "aism[" + msg.Action 2173 if msg.UUID != "" { 2174 s += "[" + msg.UUID + "]" 2175 } 2176 if msg.Name != "" { 2177 s += ", name=" + msg.Name 2178 } 2179 return s + "]" 2180 } 2181 2182 func (msg *aisMsg) StringEx() (s string) { 2183 s = msg.String() 2184 vs, err := jsoniter.Marshal(msg.Value) 2185 debug.AssertNoErr(err) 2186 s += ",(" + strings.ReplaceAll(string(vs), ",", ", ") + ")" 2187 return 2188 } 2189 2190 func (h *htrun) newAmsgStr(msgStr string, bmd *bucketMD) *aisMsg { 2191 return h.newAmsg(&apc.ActMsg{Value: msgStr}, bmd) 2192 } 2193 2194 func (h *htrun) newAmsgActVal(act string, val any) *aisMsg { 2195 return h.newAmsg(&apc.ActMsg{Action: act, Value: val}, nil) 2196 } 2197 2198 func (h *htrun) newAmsg(actionMsg *apc.ActMsg, bmd *bucketMD, uuid ...string) *aisMsg { 2199 msg := &aisMsg{ActMsg: *actionMsg} 2200 if bmd != nil { 2201 msg.BMDVersion = bmd.Version 2202 } else { 2203 msg.BMDVersion = h.owner.bmd.Get().Version 2204 } 2205 if len(uuid) > 0 { 2206 msg.UUID = uuid[0] 2207 } 2208 return msg 2209 } 2210 2211 // apc.ActMsg c-tor and reader 2212 func (*htrun) readActionMsg(w http.ResponseWriter, r *http.Request) (msg *apc.ActMsg, err error) { 2213 msg = &apc.ActMsg{} 2214 err = cmn.ReadJSON(w, r, msg) 2215 return 2216 } 2217 2218 // cmn.ReadJSON with the only difference: EOF is ok 2219 func readJSON(w http.ResponseWriter, r *http.Request, out any) (err error) { 2220 err = jsoniter.NewDecoder(r.Body).Decode(out) 2221 cos.Close(r.Body) 2222 if err == nil || err == io.EOF { 2223 return nil 2224 } 2225 return cmn.WriteErrJSON(w, r, out, err) 2226 } 2227 2228 // (via apc.WhatNodeStatsAndStatus) 2229 func (h *htrun) _status(smap *smapX) (daeStatus string) { 2230 self := smap.GetNode(h.si.ID()) // updated flags 2231 switch { 2232 case self.Flags.IsSet(meta.SnodeMaint): 2233 daeStatus = apc.NodeMaintenance 2234 case self.Flags.IsSet(meta.SnodeDecomm): 2235 daeStatus = apc.NodeDecommission 2236 } 2237 return 2238 } 2239 2240 //////////////// 2241 // callResult // 2242 //////////////// 2243 2244 // error helpers for intra-cluster calls 2245 2246 func (res *callResult) unwrap() (err error) { 2247 err = errors.Unwrap(res.err) 2248 if err == nil { 2249 err = res.err 2250 } 2251 return 2252 } 2253 2254 func (res *callResult) toErr() error { 2255 if res.err == nil { 2256 return nil 2257 } 2258 // is cmn.ErrHTTP 2259 if herr := cmn.Err2HTTPErr(res.err); herr != nil { 2260 // add status, details 2261 if res.status >= http.StatusBadRequest { 2262 herr.Status = res.status 2263 } 2264 if herr.Message == "" { 2265 herr.Message = res.details 2266 } 2267 return herr 2268 } 2269 // res => cmn.ErrHTTP 2270 if res.status >= http.StatusBadRequest { 2271 var detail string 2272 if res.details != "" { 2273 detail = "[" + res.details + "]" 2274 } 2275 return res.herr(nil, fmt.Sprintf("%v%s", res.err, detail)) 2276 } 2277 if res.details == "" { 2278 return res.err 2279 } 2280 return cmn.NewErrFailedTo(nil, "call "+res.si.StringEx(), res.details, res.err) 2281 } 2282 2283 func (res *callResult) herr(r *http.Request, msg string) *cmn.ErrHTTP { 2284 orig := &cmn.ErrHTTP{} 2285 if e := jsoniter.Unmarshal([]byte(msg), orig); e == nil { 2286 return orig 2287 } 2288 nherr := cmn.NewErrHTTP(r, errors.New(msg), res.status) 2289 if res.si != nil { 2290 nherr.Node = res.si.StringEx() 2291 } 2292 return nherr 2293 } 2294 2295 func (res *callResult) errorf(format string, a ...any) error { 2296 debug.Assert(res.err != nil) 2297 // add formatted 2298 msg := fmt.Sprintf(format, a...) 2299 if herr := cmn.Err2HTTPErr(res.err); herr != nil { 2300 herr.Message = msg + ": " + herr.Message 2301 res.err = herr 2302 } else { 2303 res.err = errors.New(msg + ": " + res.err.Error()) 2304 } 2305 return res.toErr() 2306 }