
     1  // Package ais provides core functionality for the AIStore object storage.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package ais
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"crypto/tls"
    11  	"errors"
    12  	"fmt"
    13  	"io"
    14  	"log"
    15  	"net/http"
    16  	"net/url"
    17  	"os"
    18  	"path/filepath"
    19  	"runtime"
    20  	"strconv"
    21  	"strings"
    22  	"sync"
    23  	"time"
    25  	""
    26  	""
    27  	""
    28  	""
    29  	""
    30  	""
    31  	""
    32  	""
    33  	""
    34  	""
    35  	""
    36  	""
    37  	""
    38  	""
    39  	""
    40  	""
    41  	""
    42  	jsoniter ""
    43  	""
    44  	""
    45  )
    // ciePrefix is the leading text of "cluster integrity error" (cie) messages;
    // presumably an error number follows the trailing '#' - confirm at the call sites.
    47  const ciePrefix = "cluster integrity error cie#"
    // notPresentInSmap is a multi-line message template with seven %s verbs; within the
    // visible code it is passed to cos.ExitLogf by loadSmap when this node is not found
    // in the locally stored cluster map.
    49  const notPresentInSmap = `
    50  %s: %s (self) is not present in the local copy of the %s
    52  -----------------
    53  To troubleshoot:
    54  1. first, make sure you are not trying to run two different %s on the same machine
    55  2. double check "fspaths" config (used to find ais target's volume metadata and load its node ID)
    56  3. if none of the above helps, remove possibly outdated cluster map from the %s (located at %s)
    57  4. restart %s
    58  -----------------`
    // htext provides access to extra (extended) node state.
    // Currently, only targets implement it.
    type htext interface {
    	// interruptedRestarted returns two flags, in this order:
    	// rebalance was interrupted, and the node has restarted.
    	interruptedRestarted() (interrupted, restarted bool)
    }
    // htrun is the common HTTP-runner state: node identity, keepalive and stats trackers,
    // owners of the cluster-level metadata, startup timestamps, and memory managers.
    65  type htrun struct {
    66  	si        *meta.Snode
    67  	keepalive keepaliver
    68  	statsT    stats.Tracker
        	// owners of the locally stored/replicated metadata
    69  	owner     struct {
    70  		smap   *smapOwner
    71  		bmd    bmdOwner // an interface with proxy and target impl-s
    72  		rmd    *rmdOwner
    73  		config *configOwner
    74  		etl    etlOwner // ditto
    75  	}
        	// startup timestamps; a value greater than zero means "started"
        	// (see ClusterStarted and NodeStarted)
    76  	startup struct {
    77  		cluster atomic.Int64 // mono.NanoTime() since cluster startup, zero prior to that
    78  		node    atomic.Int64 // ditto - for the node
    79  	}
    80  	gmm *memsys.MMSA // system pagesize-based memory manager and slab allocator
    81  	smm *memsys.MMSA // system MMSA for small-size allocations
    82  }
    84  ///////////
    85  // htrun //
    86  ///////////
    88  // interface guard: compile-time assertion that *htrun satisfies core.Node
    89  var _ core.Node = (*htrun)(nil)
    // Identity accessors of this node: its Snode, caller name, node ID, and string form.
    // NOTE(review): the returned expressions are missing in this view (each body reads a
    // bare `return`); confirm the actual return values against the full file.
    91  func (h *htrun) Snode() *meta.Snode { return }
    92  func (h *htrun) callerName() string { return }
    93  func (h *htrun) SID() string        { return }
    94  func (h *htrun) String() string     { return }
    96  func (h *htrun) Bowner() meta.Bowner { return h.owner.bmd }
    97  func (h *htrun) Sowner() meta.Sowner { return h.owner.smap }
    99  // NOTE: currently, only 'resume' (see also: kaSuspendMsg)
   100  func (h *htrun) smapUpdatedCB(_, _ *smapX, nfl, ofl cos.BitFlags) {
   101  	if ofl.IsAnySet(meta.SnodeMaintDecomm) && !nfl.IsAnySet(meta.SnodeMaintDecomm) {
   102  		h.keepalive.ctrl(kaResumeMsg)
   103  	}
   104  }
   106  func (h *htrun) parseReq(w http.ResponseWriter, r *http.Request, apireq *apiRequest) (err error) {
   107  	debug.Assert(len(apireq.prefix) != 0)
   108  	apireq.items, err = h.parseURL(w, r, apireq.prefix, apireq.after, false)
   109  	if err != nil {
   110  		return
   111  	}
   112  	debug.Assert(len(apireq.items) > apireq.bckIdx)
   113  	bckName := apireq.items[apireq.bckIdx]
   114  	if apireq.dpq == nil {
   115  		apireq.query = r.URL.Query()
   116  	} else if err = apireq.dpq.parse(r.URL.RawQuery); err != nil {
   117  		return
   118  	}
   119  	apireq.bck, err = newBckFromQ(bckName, apireq.query, apireq.dpq)
   120  	if err != nil {
   121  		h.writeErr(w, r, err)
   122  	}
   123  	return err
   124  }
   // cluMeta assembles a snapshot of this node's cluster-level metadata.
   // Config, Smap, BMD, RMD and EtlMD are each included unless the corresponding
   // opts.skip* flag is set. Flags get cifl.VoteInProgress when voteInProgress() returns
   // non-nil and, under the fillRebMarker condition, cifl.RebalanceInterrupted and/or
   // cifl.Restarted as reported by opts.htext. PrimeTime is set to the current wall-clock
   // time (UnixNano) under the condition of the last `if`.
   // Returns an error only when getting the config fails.
   // NOTE(review): several expressions look truncated in this view (the SI initializer,
   // the first operand of the fillRebMarker condition, the IsPrimary argument) - confirm
   // against the full file.
   126  func (h *htrun) cluMeta(opts cmetaFillOpt) (*cluMeta, error) {
   127  	cm := &cluMeta{SI:}
   128  	if voteInProgress() != nil {
   129  		cm.Flags = cm.Flags.Set(cifl.VoteInProgress)
   130  	}
   131  	if !opts.skipConfig {
   132  		var err error
   133  		cm.Config, err = h.owner.config.get()
   134  		if err != nil {
   135  			return nil, err
   136  		}
   137  	}
   138  	// don't send Smap when it is undergoing changes (and is about to get metasync-ed)
   139  	smap := h.owner.smap.get()
   140  	if !opts.skipSmap {
   141  		cm.Smap = smap
   142  	}
   143  	if !opts.skipBMD {
   144  		cm.BMD = h.owner.bmd.get()
   145  	}
   146  	if !opts.skipRMD {
   147  		cm.RMD = h.owner.rmd.get()
   148  	}
   149  	if !opts.skipEtlMD {
   150  		cm.EtlMD = h.owner.etl.get()
   151  	}
   152  	if && opts.fillRebMarker {
   153  		rebInterrupted, restarted := opts.htext.interruptedRestarted()
   154  		if rebInterrupted {
   155  			cm.Flags = cm.Flags.Set(cifl.RebalanceInterrupted)
   156  		}
   157  		if restarted {
   158  			cm.Flags = cm.Flags.Set(cifl.Restarted)
   159  		}
   160  	}
   161  	if !opts.skipPrimeTime && smap.IsPrimary( {
   162  		cm.PrimeTime = time.Now().UnixNano()
   163  	}
   164  	return cm, nil
   165  }
   167  // usage: [API call => handler => ClusterStartedWithRetry ]
   168  func (h *htrun) cluStartedWithRetry() bool {
   169  	if clutime := h.startup.cluster.Load(); clutime > 0 {
   170  		return true
   171  	}
   172  	if !h.NodeStarted() {
   173  		return false
   174  	}
   175  	time.Sleep(time.Second)
   176  	clutime := h.startup.cluster.Load()
   177  	if clutime == 0 {
   178  		nlog.ErrorDepth(1, fmt.Sprintf("%s: cluster is starting up", h))
   179  	}
   180  	return clutime > 0
   181  }
   183  func (h *htrun) ClusterStarted() bool { return h.startup.cluster.Load() > 0 } // see also: p.ready()
   184  func (h *htrun) markClusterStarted()  { h.startup.cluster.Store(mono.NanoTime()) }
   186  func (h *htrun) NodeStarted() bool { return h.startup.node.Load() > 0 }
   187  func (h *htrun) markNodeStarted()  { h.startup.node.Store(mono.NanoTime()) }
   // regNetHandlers registers HTTP handlers with the node's network muxers:
   //   - debug handlers (debug.Handlers()) go to the public network;
   //   - each of the given node-type specific handlers is registered on the public,
   //     intra-control and/or intra-data network; a handler that matched none of the three
   //     conditions falls back to pub, control, or data depending on which intra-cluster
   //     networks are configured;
   //   - a Prometheus handler is added on the public network when h.statsT.IsPrometheus().
   // Handler paths that do not start with '/' are prefixed with apc.Version.
   // NOTE(review): the per-network conditions (and one debug.Assert) look truncated in
   // this view - confirm against the full file.
   189  func (h *htrun) regNetHandlers(networkHandlers []networkHandler) {
   190  	var (
   191  		path   string
   192  		config = cmn.GCO.Get()
   193  	)
   194  	// common, debug
   195  	for r, nh := range debug.Handlers() {
   196  		handlePub(r, nh)
   197  	}
   198  	// node type specific
   199  	for _, nh := range networkHandlers {
   200  		var reg bool
   201  		if nh.r[0] == '/' { // absolute path
   202  			path = nh.r
   203  		} else {
   204  			path = cos.JoinWords(apc.Version, nh.r)
   205  		}
   206  		debug.Assert( != 0)
   207  		if {
   208  			handlePub(path, nh.h)
   209  			reg = true
   210  		}
   211  		if config.HostNet.UseIntraControl && {
   212  			handleControl(path, nh.h)
   213  			reg = true
   214  		}
   215  		if config.HostNet.UseIntraData && {
   216  			handleData(path, nh.h)
   217  			reg = true
   218  		}
   219  		if reg {
   220  			continue
   221  		}
   222  		// none of the above
   223  		if !config.HostNet.UseIntraControl && !config.HostNet.UseIntraData {
   224  			// no intra-cluster networks: default to pub net
   225  			handlePub(path, nh.h)
   226  		} else if config.HostNet.UseIntraControl && {
   227  			// (not configured) data defaults to (configured) control
   228  			handleControl(path, nh.h)
   229  		} else {
   230  			debug.Assert(config.HostNet.UseIntraData &&
   231  			// (not configured) control defaults to (configured) data
   232  			handleData(path, nh.h)
   233  		}
   234  	}
   235  	// common Prometheus
   236  	if h.statsT.IsPrometheus() {
   237  		nh := networkHandler{r: "/" + apc.Metrics, h: promhttp.Handler().ServeHTTP}
   238  		path := nh.r // absolute
   239  		handlePub(path, nh.h)
   240  	}
   241  }
   // init performs common node initialization:
   //   - creates the intra-cluster control and data HTTP clients;
   //   - creates the network servers, each with its own muxers; per the inline comments,
   //     the intra-control net defaults to the public one and the intra-data net defaults
   //     to intra-control when not separately configured;
   //   - creates the Smap and RMD owners and loads the RMD;
   //   - obtains the page-size (gmm) and small-size (smm) memory managers and registers
   //     both with the housekeeper (RegWithHK).
   // NOTE(review): the first `if` condition and several assignment targets look truncated
   // in this view - confirm against the full file.
   243  func (h *htrun) init(config *cmn.Config) {
   244  	initCtrlClient(config)
   245  	initDataClient(config)
   247  	tcpbuf := config.Net.L4.SndRcvBufSize
   248  	if {
   249  		tcpbuf = 0
   250  	} else if tcpbuf == 0 {
   251  		tcpbuf = cmn.DefaultSendRecvBufferSize // ditto: targets use AIS default when not configured
   252  	}
   254  	muxers := newMuxers()
   255 = &netServer{muxers: muxers, sndRcvBufSize: tcpbuf}
   256  	g.netServ.control = // if not separately configured, intra-control net is public
   257  	if config.HostNet.UseIntraControl {
   258  		muxers = newMuxers()
   259  		g.netServ.control = &netServer{muxers: muxers, sndRcvBufSize: 0}
   260  	}
   261 = g.netServ.control // if not configured, intra-data net is intra-control
   262  	if config.HostNet.UseIntraData {
   263  		muxers = newMuxers()
   264 = &netServer{muxers: muxers, sndRcvBufSize: tcpbuf}
   265  	}
   267  	h.owner.smap = newSmapOwner(config)
   268  	h.owner.rmd = newRMDOwner(config)
   269  	h.owner.rmd.load()
   271  	h.gmm = memsys.PageMM()
   272  	h.gmm.RegWithHK()
   273  	h.smm = memsys.ByteMM()
   274  	h.smm.RegWithHK()
   275  }
   277  // steps 1 thru 4
   // initSnode resolves this node's network endpoints and builds its meta.Snode:
   //  1. public network: hostname and port from config (in K8s with a configured hostname,
   //     IP validation is skipped); a comma-separated hostname yields extra (multi-home)
   //     public addresses that share the same port;
   //  2. intra-cluster control and data networks: default to the public address unless
   //     separately configured;
   //  3. validation: no two networks in use may share the same hostname:port (mustDiffer);
   //  4. construction of the Snode from the three addresses, plus the multi-home extras.
   // Any failure to resolve an address terminates the process via cos.ExitLogf.
   // NOTE(review): in step 4 the assignment targets and the tail of the final log call
   // look truncated in this view - confirm against the full file.
   278  func (h *htrun) initSnode(config *cmn.Config) {
   279  	var (
   280  		pubAddr  meta.NetInfo
   281  		pubExtra []meta.NetInfo
   282  		ctrlAddr meta.NetInfo
   283  		dataAddr meta.NetInfo
   284  		port     = strconv.Itoa(config.HostNet.Port)
   285  		proto    = config.Net.HTTP.Proto
   286  	)
   287  	addrList, err := getLocalIPv4s(config)
   288  	if err != nil {
   289  		cos.ExitLogf("failed to get local IP addr list: %v", err)
   290  	}
   292  	// 1. pub net
   293  	pub, extra := multihome(config.HostNet.Hostname)
   295  	if k8s.IsK8s() && config.HostNet.Hostname != "" {
   296  		// K8s: skip IP addr validation
   297  		// public hostname could be a load balancer's external IP or a service DNS
   298  		nlog.Infoln("K8s deployment: skipping hostname validation for", config.HostNet.Hostname)
   299  		pubAddr.Init(proto, pub, port)
   300  	} else if err = initNetInfo(&pubAddr, addrList, proto, config.HostNet.Hostname, port); err != nil {
   301  		cos.ExitLogf("failed to get %s IPv4/hostname: %v", cmn.NetPublic, err)
   302  	}
   304  	// multi-home (when config.HostNet.Hostname is a comma-separated list)
   305  	// using the same pub port
   306  	if l := len(extra); l > 0 {
   307  		pubExtra = make([]meta.NetInfo, l)
   308  		for i, addr := range extra {
   309  			pubExtra[i].Init(proto, addr, port)
   310  		}
   311  	} else {
   312  		nlog.Infof("%s (user) access: %v (%q)", cmn.NetPublic, pubAddr, config.HostNet.Hostname)
   313  	}
   315  	// 2. intra-cluster
   316  	ctrlAddr = pubAddr
   317  	if config.HostNet.UseIntraControl {
   318  		icport := strconv.Itoa(config.HostNet.PortIntraControl)
   319  		err = initNetInfo(&ctrlAddr, addrList, proto, config.HostNet.HostnameIntraControl, icport)
   320  		if err != nil {
   321  			cos.ExitLogf("failed to get %s IPv4/hostname: %v", cmn.NetIntraControl, err)
   322  		}
   323  		var s string
   324  		if config.HostNet.HostnameIntraControl != "" {
   325  			s = " (config: " + config.HostNet.HostnameIntraControl + ")"
   326  		}
   327  		nlog.Infof("%s access: %v%s", cmn.NetIntraControl, ctrlAddr, s)
   328  	}
   329  	dataAddr = pubAddr
   330  	if config.HostNet.UseIntraData {
   331  		idport := strconv.Itoa(config.HostNet.PortIntraData)
   332  		err = initNetInfo(&dataAddr, addrList, proto, config.HostNet.HostnameIntraData, idport)
   333  		if err != nil {
   334  			cos.ExitLogf("failed to get %s IPv4/hostname: %v", cmn.NetIntraData, err)
   335  		}
   336  		var s string
   337  		if config.HostNet.HostnameIntraData != "" {
   338  			s = " (config: " + config.HostNet.HostnameIntraData + ")"
   339  		}
   340  		nlog.Infof("%s access: %v%s", cmn.NetIntraData, dataAddr, s)
   341  	}
   343  	// 3. validate
   344  	mustDiffer(pubAddr,
   345  		config.HostNet.Port,
   346  		true,
   347  		ctrlAddr,
   348  		config.HostNet.PortIntraControl,
   349  		config.HostNet.UseIntraControl,
   350  		"pub/ctl",
   351  	)
   352  	mustDiffer(pubAddr,
   353  		config.HostNet.Port,
   354  		true,
   355  		dataAddr,
   356  		config.HostNet.PortIntraData,
   357  		config.HostNet.UseIntraData,
   358  		"pub/data",
   359  	)
   360  	mustDiffer(dataAddr,
   361  		config.HostNet.PortIntraData,
   362  		config.HostNet.UseIntraData,
   363  		ctrlAddr,
   364  		config.HostNet.PortIntraControl,
   365  		config.HostNet.UseIntraControl,
   366  		"ctl/data",
   367  	)
   369  	// 4. new Snode
   370 = &meta.Snode{
   371  		PubNet:     pubAddr,
   372  		ControlNet: ctrlAddr,
   373  		DataNet:    dataAddr,
   374  	}
   375  	if l := len(pubExtra); l > 0 {
   376 = make([]meta.NetInfo, l)
   377  		copy(, pubExtra)
   378  		nlog.Infof("%s (multihome) access: %v and %v", cmn.NetPublic, pubAddr,
   379  	}
   380  }
   382  func mustDiffer(ip1 meta.NetInfo, port1 int, use1 bool, ip2 meta.NetInfo, port2 int, use2 bool, tag string) {
   383  	if !use1 || !use2 {
   384  		return
   385  	}
   386  	if ip1.Hostname == ip2.Hostname && port1 == port2 {
   387  		cos.ExitLogf("%s: cannot use the same IP:port (%s) for two networks", tag, ip1)
   388  	}
   389  }
   391  // at startup, check this Snode vs locally stored Smap replica (NOTE: some errors are FATAL)
   // loadSmap loads the local Smap replica and validates this node against it.
   // Returns:
   //   - smap:     the loaded cluster map, or a new (empty) one when there is no local
   //     replica or loading has failed (the failure is logged, not returned);
   //   - reliable: true only when the replica was loaded and passed the checks below.
   // Fatal (process exit via cos.ExitLogf): this node is not present in the loaded map,
   // or its type differs from the type recorded in the map.
   // A duplicate-network finding (smap.IsDupNet) only produces a warning.
   // NOTE(review): several call arguments and comparison operands look truncated in
   // this view - confirm against the full file.
   392  func (h *htrun) loadSmap() (smap *smapX, reliable bool) {
   393  	smap = newSmap()
   394  	loaded, err := h.owner.smap.load(smap)
   396  	if err != nil {
   397  		nlog.Errorf("Failed to load cluster map (\"Smap\"): %v - reinitializing", err)
   398  		return
   399  	}
   400  	if !loaded {
   401  		return // no local replica - joining from scratch
   402  	}
   404  	node := smap.GetNode(h.SID())
   405  	if node == nil {
   406  		ty := "targets"
   407  		if == apc.Proxy {
   408  			ty = "proxies"
   409  		}
   410  		cos.ExitLogf(notPresentInSmap, cmn.BadSmapPrefix,, smap.StringEx(), ty,, h.owner.smap.fpath,
   411  	}
   412  	if node.Type() != {
   413  		cos.ExitLogf("%s: %s is %q while the node in the loaded %s is %q", cmn.BadSmapPrefix,
   414,, smap.StringEx(), node.Type())
   415  		return
   416  	}
   418  	//
   419  	// NOTE: not enforcing Snode's immutability - in particular, IPs that may change upon restart in K8s
   420  	//
   421  	if _, err := smap.IsDupNet(; err != nil {
   422  		nlog.Warningln(err, "- proceeding with the loaded", smap.String(), "anyway...")
   423  	}
   424  	reliable = true
   425  	return
   426  }
   428  func (h *htrun) setDaemonConfigMsg(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg, query url.Values) {
   429  	var (
   430  		transient = cos.IsParseBool(query.Get(apc.ActTransient))
   431  		toUpdate  = &cmn.ConfigToSet{}
   432  	)
   433  	if err := cos.MorphMarshal(msg.Value, toUpdate); err != nil {
   434  		h.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, h, msg.Action, msg.Value, err)
   435  		return
   436  	}
   438  	co := h.owner.config
   439  	co.Lock()
   440  	err := setConfig(toUpdate, transient)
   441  	co.Unlock()
   442  	if err != nil {
   443  		h.writeErr(w, r, err)
   444  	}
   445  }
   447  func (h *htrun) setDaemonConfigQuery(w http.ResponseWriter, r *http.Request) {
   448  	var (
   449  		query     = r.URL.Query()
   450  		transient = cos.IsParseBool(query.Get(apc.ActTransient))
   451  		toUpdate  = &cmn.ConfigToSet{}
   452  	)
   453  	if err := toUpdate.FillFromQuery(query); err != nil {
   454  		h.writeErr(w, r, err)
   455  		return
   456  	}
   458  	co := h.owner.config
   459  	co.Lock()
   460  	err := setConfig(toUpdate, transient)
   461  	co.Unlock()
   462  	if err != nil {
   463  		h.writeErr(w, r, err)
   464  	}
   465  }
   // run starts the node's HTTP listeners and blocks on the last one (see "stay here").
   // When HTTPS is enabled, a TLS config is created first (a failure exits the process).
   // Separately configured intra-control and intra-data networks each get their own
   // listener goroutine; when the public address must not be INADDR_ANY and extra public
   // addresses exist, a second public server (g.netServ.pub2) is started as well.
   // http.Server errors are logged through a log.Logger that wraps nlogWriter.
   // The errors returned by the goroutine-run listeners are discarded.
   // NOTE(review): endpoints and several call targets look truncated in this view -
   // confirm against the full file.
   467  func (h *htrun) run(config *cmn.Config) error {
   468  	var (
   469  		tlsConf *tls.Config
   470  		logger  = log.New(&nlogWriter{}, "net/http err: ", 0) // a wrapper to log http.Server errors
   471  	)
   472  	if config.Net.HTTP.UseHTTPS {
   473  		c, err := newTLS(&config.Net.HTTP)
   474  		if err != nil {
   475  			cos.ExitLog(err)
   476  		}
   477  		tlsConf = c
   478  	}
   479  	if config.HostNet.UseIntraControl {
   480  		go func() {
   481  			_ = g.netServ.control.listen(, logger, tlsConf, config)
   482  		}()
   483  	}
   484  	if config.HostNet.UseIntraData {
   485  		go func() {
   486  			_ =, logger, tlsConf, config)
   487  		}()
   488  	}
   490  	ep :=
   491  	if h.pubAddrAny(config) {
   492  		ep = ":" +
   493  	} else if len( > 0 {
   494  		pubAddr2 :=[0]
   495  		debug.Assert(pubAddr2.Port ==
   496  		g.netServ.pub2 = &netServer{muxers:, sndRcvBufSize:}
   497  		go func() {
   498  			_ = g.netServ.pub2.listen(pubAddr2.TCPEndpoint(), logger, tlsConf, config)
   499  		}()
   500  	}
   502  	return, logger, tlsConf, config) // stay here
   503  }
   505  // return true to start listening on `INADDR_ANY:PubNet.Port`
   // pubAddrAny defaults to true; the two cases below, which apply when the intra-control
   // or intra-data network is configured, leave the result false.
   // NOTE(review): the comparisons in both cases look truncated in this view - confirm
   // what exactly is being compared against the full file.
   506  func (h *htrun) pubAddrAny(config *cmn.Config) (inaddrAny bool) {
   507  	switch {
   508  	case config.HostNet.UseIntraControl && ==
   509  	case config.HostNet.UseIntraData && ==
   510  	default:
   511  		inaddrAny = true
   512  	}
   513  	return inaddrAny
   514  }
   516  // remove self from Smap (if required), terminate http, and wait (w/ timeout)
   517  // for running xactions to abort
   // stop proceeds as follows:
   //   - optionally unregisters this node (rmFromSmap);
   //   - shuts down HTTP from a separate goroutine after a half-second delay, signaling
   //     completion via wg;
   //   - if any xaction is running, sleeps half a second, re-checks once, and logs a
   //     warning if one is still running;
   //   - finally waits on wg under the condition of the last `if`.
   // NOTE(review): that last condition looks truncated in this view - confirm.
   518  func (h *htrun) stop(wg *sync.WaitGroup, rmFromSmap bool) {
   519  	const sleep = time.Second >> 1
   521  	if rmFromSmap {
   522  		h.unregisterSelf(true)
   523  	}
   524  	nlog.Infoln("Shutting down HTTP")
   526  	wg.Add(1)
   527  	go func() {
   528  		time.Sleep(sleep)
   529  		shuthttp()
   530  		wg.Done()
   531  	}()
   532  	entry := xreg.GetRunning(xreg.Flt{})
   533  	if entry != nil {
   534  		time.Sleep(sleep)
   535  		entry = xreg.GetRunning(xreg.Flt{})
   536  		if entry != nil {
   537  			nlog.Warningln("Timed out waiting for", entry.Kind(), "... to stop")
   538  		}
   539  	}
   541  	if {
   542  		wg.Wait()
   543  	}
   544  }
   546  //
   547  // intra-cluster IPC, control plane
   548  // call another target or a proxy; optionally, include a json-encoded body
   549  //
   // _call executes one leg of a broadcast: it copies the request and timeout from bargs
   // into pooled call arguments, re-opens the request body reader (if any) so that each
   // destination gets its own reader, performs the call, and then either discards the
   // result (bargs.async) or appends it to results.s.
   // NOTE(review): _call runs concurrently, one goroutine per destination (see bcastNodes
   // and bcastSelected); the append to results.s must therefore be synchronized, but no
   // locking is visible in this view - confirm against the full file. The error returned
   // by Open() is ignored, and several expressions here look truncated.
   551  func (h *htrun) _call(si *meta.Snode, bargs *bcastArgs, results *bcastResults) {
   552  	cargs := allocCargs()
   553  	{
   554 = si
   555  		cargs.req = bargs.req
   556  		cargs.timeout = bargs.timeout
   557  	}
   558  	cargs.req.Base = si.URL(
   559  	if bargs.req.BodyR != nil {
   560  		cargs.req.BodyR, _ = bargs.req.BodyR.(cos.ReadOpenCloser).Open()
   561  	}
   562  	cargs.cresv = bargs.cresv
   563  	res :=, bargs.smap)
   564  	if bargs.async {
   565  		freeCR(res) // discard right away
   566  	} else {
   568  		results.s = append(results.s, res)
   570  	}
   571  	freeCargs(cargs)
   572  }
   // call performs a single intra-cluster control-plane HTTP request described by args
   // and returns a pooled result (allocCR) that carries the HTTP status and headers plus
   // either the error (res.err, res.details) or the response payload.
   //
   // The HTTP client and the deadline are selected by args.timeout:
   //   - apc.DefaultTimeout: the control client (g.client.control);
   //   - apc.LongTimeout:    a plain request as well (client expression truncated here);
   //   - anything else:      a request with an explicit timeout, where zero defaults to
   //     cmn.Rom.CplaneOperation(); the client is then chosen by comparing the timeout
   //     with the control client's own Timeout.
   //
   // The request carries the caller's ID and name, the user-agent and, when smap.vstr is
   // not empty, the caller's Smap version and primary status. A response status of 400 or
   // above is converted to an error: from the apc.HdrError header for HEAD requests, from
   // the response body otherwise. On success the body is decoded via args.cresv if
   // provided. Finally, when the destination ID is known, keepalive records that the node
   // was heard from.
   // NOTE(review): a number of expressions look truncated in this view (the destination
   // node, client selection, the body read/decode calls) - confirm against the full file.
   574  func (h *htrun) call(args *callArgs, smap *smapX) (res *callResult) {
   575  	var (
   576  		req    *http.Request
   577  		resp   *http.Response
   578  		client *http.Client
   579  		sid    = unknownDaemonID
   580  	)
   581  	res = allocCR()
   582  	if != nil {
   583  		sid =
   584 =
   585  	}
   587  	debug.Assert( != nil || args.req.Base != "") // either si or base
   588  	if args.req.Base == "" && != nil {
   589  		args.req.Base = // by default, use intra-cluster control network
   590  	}
   592  	if args.req.Header == nil {
   593  		args.req.Header = make(http.Header)
   594  	}
   596  	switch args.timeout {
   597  	case apc.DefaultTimeout:
   598  		req, res.err = args.req.Req()
   599  		if res.err != nil {
   600  			break
   601  		}
   602  		client = g.client.control
   603  	case apc.LongTimeout:
   604  		req, res.err = args.req.Req()
   605  		if res.err != nil {
   606  			break
   607  		}
   608  		client =
   609  	default:
   610  		var cancel context.CancelFunc
   611  		if args.timeout == 0 {
   612  			args.timeout = cmn.Rom.CplaneOperation()
   613  		}
   614  		req, _, cancel, res.err = args.req.ReqWithTimeout(args.timeout)
   615  		if res.err != nil {
   616  			break
   617  		}
   618  		defer cancel()
   620  		// NOTE: timeout handling
   621  		// - timeout causes context.deadlineExceededError, i.e. "context deadline exceeded"
   622  		// - the two knobs are configurable via "client_timeout" and "client_long_timeout",
   623  		// respectively (client section in the global config)
   624  		if args.timeout > g.client.control.Timeout {
   625  			client =
   626  		} else {
   627  			client = g.client.control
   628  		}
   629  	}
   630  	if res.err != nil {
   631  		res.details = fmt.Sprintf("FATAL: failed to create HTTP request %s %s: %v",
   632  			args.req.Method, args.req.URL(), res.err)
   633  		return
   634  	}
   636  	req.Header.Set(apc.HdrCallerID, h.SID())
   637  	req.Header.Set(apc.HdrCallerName,
   638  	if smap.vstr != "" {
   639  		if smap.IsPrimary( {
   640  			req.Header.Set(apc.HdrCallerIsPrimary, "true")
   641  		}
   642  		req.Header.Set(apc.HdrCallerSmapVer, smap.vstr)
   643  	}
   644  	req.Header.Set(cos.HdrUserAgent, ua)
   646  	resp, res.err = client.Do(req)
   647  	if res.err != nil {
   648  		res.details = "[control-plane]" // tcp level, e.g.: connection refused
   649  		return
   650  	}
   651  	defer resp.Body.Close()
   652  	res.status = resp.StatusCode
   653  	res.header = resp.Header
   655  	// err == nil && bad status: resp.Body contains the error message
   656  	if res.status >= http.StatusBadRequest {
   657  		if args.req.Method == http.MethodHead {
   658  			msg := resp.Header.Get(apc.HdrError)
   659  			res.err = res.herr(req, msg)
   660  		} else {
   661  			b := cmn.NewBuffer()
   662  			b.ReadFrom(resp.Body)
   663  			res.err = res.herr(req, b.String())
   664  			cmn.FreeBuffer(b)
   665  		}
   666  		res.details = res.err.Error()
   667  		return
   668  	}
   670  	// read and decode via call result value (`cresv`), if provided
   671  	// otherwise, read and return bytes for the caller to unmarshal
   672  	if args.cresv != nil {
   673  		res.v = args.cresv.newV()
   674, resp.Body)
   675  		if res.err != nil {
   676  			return
   677  		}
   678  	} else {
   680  		if res.err != nil {
   681  			return
   682  		}
   683  	}
   685  	if sid != unknownDaemonID {
   686  		h.keepalive.heardFrom(sid)
   687  	}
   688  	return
   689  }
   691  //
   692  // intra-cluster IPC, control plane: notify another node
   693  //
   695  func (h *htrun) notifyTerm(n core.Notif, err error, aborted bool) {
   696  	h._nfy(n, err, apc.Finished, aborted)
   697  }
   698  func (h *htrun) notifyProgress(n core.Notif) { h._nfy(n, nil, apc.Progress, false) }
   // _nfy delivers the notification n to its subscribers; `upon` must be apc.Progress or
   // apc.Finished and selects the URL path (apc.URLPathNotifs joined with upon).
   // Destinations:
   //   - a single `equalIC` subscriber selects proxies from smap.Pmap that are IC members
   //     and not in maintenance or being decommissioned (one more operand of that condition
   //     is truncated in this view);
   //   - otherwise, each subscriber ID is resolved via smap.GetActiveNode; unresolved
   //     IDs are logged and skipped.
   // A non-nil err is reported in the message (ErrMsg) together with the aborted flag.
   // The message is broadcast asynchronously (results are discarded) with the
   // cmn.Rom.MaxKeepalive() timeout; with no destinations, an error is logged instead.
   // NOTE(review): on that early return the pooled `args` is not passed to freeBcArgs -
   // confirm whether that is intended.
   700  func (h *htrun) _nfy(n core.Notif, err error, upon string, aborted bool) {
   701  	var (
   702  		smap  = h.owner.smap.get()
   703  		dsts  = n.Subscribers()
   704  		msg   = n.ToNotifMsg(aborted)
   705  		args  = allocBcArgs()
   706  		nodes = args.selected
   707  	)
   708  	debug.Assert(upon == apc.Progress || upon == apc.Finished)
   709  	if len(dsts) == 1 && dsts[0] == equalIC {
   710  		for pid, psi := range smap.Pmap {
   711  			if smap.IsIC(psi) && pid != && !psi.InMaintOrDecomm() {
   712  				nodes = append(nodes, psi)
   713  			}
   714  		}
   715  	} else {
   716  		for _, dst := range dsts {
   717  			debug.Assert(dst != equalIC)
   718  			if si := smap.GetActiveNode(dst); si != nil {
   719  				nodes = append(nodes, si)
   720  			} else {
   721  				nlog.Errorln(&errNodeNotFound{"failed to notify", dst,, smap})
   722  			}
   723  		}
   724  	}
   725  	if err != nil {
   726  		msg.ErrMsg = err.Error()
   727  		msg.AbortedX = aborted
   728  	}
   729  	msg.NodeID =
   730  	if len(nodes) == 0 {
   731  		nlog.Errorf("%s: have no nodes to send [%s] notification", h, &msg)
   732  		return
   733  	}
   734  	path := apc.URLPathNotifs.Join(upon)
   735  	args.req = cmn.HreqArgs{Method: http.MethodPost, Path: path, Body: cos.MustMarshal(&msg)}
   736 = cmn.NetIntraControl
   737  	args.timeout = cmn.Rom.MaxKeepalive()
   738  	args.selected = nodes
   739  	args.nodeCount = len(nodes)
   740  	args.smap = smap
   741  	args.async = true
   742  	_ = h.bcastSelected(args)
   743  	freeBcArgs(args)
   744  }
   746  //
   747  // intra-cluster comm
   748  //
   750  // bcastGroup broadcasts a message to a specific group of nodes: targets, proxies, all.
   // Defaults applied before broadcasting: args.smap is the current Smap when nil, and
   // args.timeout is cmn.Rom.CplaneOperation() when zero. Per the visible assignment, the
   // intra-control network is used as the default network.
   // Depending on the destination group, args.nodes and args.nodeCount are set from the
   // Smap's target and/or proxy maps; the count is decremented when this node is present
   // in the Smap (for targets and proxies, under an additional condition that is
   // truncated in this view). For core.SelectedNodes, the caller provides args.nodes.
   // The broadcast itself is done by bcastNodes.
   // NOTE(review): the switch tag and several operands look truncated in this view -
   // confirm against the full file.
   751  func (h *htrun) bcastGroup(args *bcastArgs) sliceResults {
   752  	if args.smap == nil {
   753  		args.smap = h.owner.smap.get()
   754  	}
   755  	present := args.smap.isPresent(
   756  	if == "" {
   757 = cmn.NetIntraControl
   758  	}
   759  	debug.Assert(cmn.NetworkIsKnown(
   760  	if args.timeout == 0 {
   761  		args.timeout = cmn.Rom.CplaneOperation()
   762  		debug.Assert(args.timeout != 0)
   763  	}
   765  	switch {
   766  	case core.Targets:
   767  		args.nodes = []meta.NodeMap{args.smap.Tmap}
   768  		args.nodeCount = len(args.smap.Tmap)
   769  		if present && {
   770  			args.nodeCount--
   771  		}
   772  	case core.Proxies:
   773  		args.nodes = []meta.NodeMap{args.smap.Pmap}
   774  		args.nodeCount = len(args.smap.Pmap)
   775  		if present && {
   776  			args.nodeCount--
   777  		}
   778  	case core.AllNodes:
   779  		args.nodes = []meta.NodeMap{args.smap.Pmap, args.smap.Tmap}
   780  		args.nodeCount = len(args.smap.Pmap) + len(args.smap.Tmap)
   781  		if present {
   782  			args.nodeCount--
   783  		}
   784  	case core.SelectedNodes:
   785  		args.nodeCount = len(args.nodes)
   786  		debug.Assert(args.nodeCount > 0)
   787  	default:
   788  		debug.Assert(false,
   789  	}
   790  	return h.bcastNodes(args)
   791  }
   793  // broadcast to the specified destinations (`bargs.nodes`)
   794  // (if specified, `bargs.req.BodyR` must implement `cos.ReadOpenCloser`)
   // bcastNodes calls every node from the bargs.nodes maps, one goroutine per node (via
   // _call), with the wait group created by cos.NewLimitedWaitGroup from
   // cmn.MaxParallelism() and bargs.nodeCount. Nodes matching the (truncated) ID
   // comparison are skipped, and so are nodes in maintenance or being decommissioned
   // unless bargs.ignoreMaintenance is set.
   // Returns the collected results; for async broadcasts nothing is collected.
   // NOTE(review): allocBcastRes is given len(bargs.nodes), i.e. the number of node
   // maps rather than the number of nodes - presumably a capacity hint; confirm.
   795  func (h *htrun) bcastNodes(bargs *bcastArgs) sliceResults {
   796  	var (
   797  		results bcastResults
   798  		wg      = cos.NewLimitedWaitGroup(cmn.MaxParallelism(), bargs.nodeCount)
   799  		f       = func(si *meta.Snode) { h._call(si, bargs, &results); wg.Done() }
   800  	)
   801  	debug.Assert(len(bargs.selected) == 0)
   802  	if !bargs.async {
   803  		results.s = allocBcastRes(len(bargs.nodes))
   804  	}
   805  	for _, nodeMap := range bargs.nodes {
   806  		for _, si := range nodeMap {
   807  			if si.ID() == {
   808  				continue
   809  			}
   811  			// TODO: remove
   812  			debug.Func(func() {
   813  				if si.URL( == {
   814  					nlog.Errorf(fmtErrNetInfoChanged, h, si.StringEx(), si.URL(
   815  				}
   816  			})
   818  			if !bargs.ignoreMaintenance && si.InMaintOrDecomm() {
   819  				continue
   820  			}
   821  			wg.Add(1)
   822  			go f(si)
   823  		}
   824  	}
   825  	wg.Wait()
   826  	return results.s
   827  }
   // bcastSelected calls every node listed in bargs.selected (which must not be empty),
   // one goroutine per node (via _call), with the wait group created by
   // cos.NewLimitedWaitGroup from cmn.MaxParallelism() and bargs.nodeCount.
   // Unlike bcastNodes, it performs no maintenance/decommission filtering.
   // Returns the collected results; for async broadcasts nothing is collected.
   // NOTE(review): the debug assertion inside the loop looks truncated in this view.
   829  func (h *htrun) bcastSelected(bargs *bcastArgs) sliceResults {
   830  	var (
   831  		results bcastResults
   832  		wg      = cos.NewLimitedWaitGroup(cmn.MaxParallelism(), bargs.nodeCount)
   833  		f       = func(si *meta.Snode) { h._call(si, bargs, &results); wg.Done() }
   834  	)
   835  	debug.Assert(len(bargs.selected) > 0)
   836  	if !bargs.async {
   837  		results.s = allocBcastRes(len(bargs.selected))
   838  	}
   839  	for _, si := range bargs.selected {
   840  		debug.Assert(si.ID() !=
   841  		wg.Add(1)
   842  		go f(si)
   843  	}
   844  	wg.Wait()
   845  	return results.s
   846  }
   // bcastAsyncIC POSTs msg (apc.URLPathIC) to the proxies that are IC members and
   // active in the current Smap, skipping the proxy matched by the (truncated) ID
   // comparison. Each call runs in its own goroutine with the cmn.Rom.MaxKeepalive()
   // timeout, and its result is discarded right away. Despite the name, the function
   // returns only after all the calls have completed (wg.Wait).
   // NOTE(review): a few expressions look truncated in this view - confirm against the
   // full file.
   848  func (h *htrun) bcastAsyncIC(msg *aisMsg) {
   849  	var (
   850  		wg   = &sync.WaitGroup{}
   851  		smap = h.owner.smap.get()
   852  		args = allocBcArgs()
   853  	)
   854  	args.req = cmn.HreqArgs{Method: http.MethodPost, Path: apc.URLPathIC.S, Body: cos.MustMarshal(msg)}
   855 = cmn.NetIntraControl
   856  	args.timeout = cmn.Rom.MaxKeepalive()
   857  	for pid, psi := range smap.Pmap {
   858  		if pid == || !smap.IsIC(psi) || smap.GetActiveNode(pid) == nil {
   859  			continue
   860  		}
   861  		wg.Add(1)
   862  		go func(si *meta.Snode) {
   863  			cargs := allocCargs()
   864  			{
   865 = si
   866  				cargs.req = args.req
   867  				cargs.timeout = args.timeout
   868  			}
   869  			res :=, smap)
   870  			freeCargs(cargs)
   871  			freeCR(res) // discard right away
   872  			wg.Done()
   873  		}(psi)
   874  	}
   875  	wg.Wait()
   876  	freeBcArgs(args)
   877  }
   // bcastAllNodes broadcasts to all nodes (core.AllNodes) via bcastGroup and writes the
   // first encountered error, if any, to the HTTP response; results are then returned
   // to the pool (freeBcastRes).
   // NOTE(review): the target of the first assignment looks truncated in this view.
   879  func (h *htrun) bcastAllNodes(w http.ResponseWriter, r *http.Request, args *bcastArgs) {
   880 = core.AllNodes
   881  	results := h.bcastGroup(args)
   882  	for _, res := range results {
   883  		if res.err != nil {
   884  			h.writeErr(w, r, res.toErr())
   885  			break
   886  		}
   887  	}
   888  	freeBcastRes(results)
   889  }
   891  //
   892  // parsing helpers
   893  //
   895  // remove validated fields and return the resulting slice
   896  func (h *htrun) parseURL(w http.ResponseWriter, r *http.Request, itemsPresent []string, itemsAfter int, splitAfter bool) ([]string, error) {
   897  	items, err := cmn.ParseURL(r.URL.Path, itemsPresent, itemsAfter, splitAfter)
   898  	if err != nil {
   899  		h.writeErr(w, r, err)
   900  	}
   901  	return items, err
   902  }
   904  func (h *htrun) writeMsgPack(w http.ResponseWriter, v msgp.Encodable, tag string) (ok bool) {
   905  	var (
   906  		err       error
   907  		buf, slab = h.gmm.AllocSize(cmn.MsgpLsoBufSize) // max size
   908  		mw        = msgp.NewWriterBuf(w, buf)
   909  	)
   910  	w.Header().Set(cos.HdrContentType, cos.ContentMsgPack)
   911  	if err = v.EncodeMsg(mw); err == nil {
   912  		err = mw.Flush()
   913  	}
   914  	slab.Free(buf)
   915  	if err == nil {
   916  		return true
   917  	}
   918  	h.logerr(tag, v, err)
   919  	return false
   920  }
   922  func (h *htrun) writeJSON(w http.ResponseWriter, r *http.Request, v any, tag string) {
   923  	if err := _writejs(w, r, v); err != nil {
   924  		h.logerr(tag, v, err)
   925  	}
   926  }
   928  // same as above with boolean return to facilitate early termination
   929  func (h *htrun) writeJS(w http.ResponseWriter, r *http.Request, v any, tag string) bool {
   930  	if err := _writejs(w, r, v); err != nil {
   931  		h.logerr(tag, v, err)
   932  		return false
   933  	}
   934  	return true
   935  }
   937  func _writejs(w http.ResponseWriter, r *http.Request, v any) (err error) {
   938  	w.Header().Set(cos.HdrContentType, cos.ContentJSONCharsetUTF)
   939  	if isBrowser(r.Header.Get(cos.HdrUserAgent)) {
   940  		var out []byte
   941  		if out, err = jsoniter.MarshalIndent(v, "", "    "); err == nil {
   942  			w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(out)))
   943  			_, err = w.Write(out)
   944  		}
   945  	} else { // previously: new-encoder(w).encode(v) (non-browser client)
   946  		j := cos.JSON.BorrowStream(nil)
   947  		j.WriteVal(v)
   948  		j.WriteRaw("\n")
   949  		if err = j.Error; err == nil {
   950  			b := j.Buffer()
   951  			w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(b)))
   952  			_, err = w.Write(b)
   954  			// NOTE: consider http.NewResponseController(w).Flush()
   955  		}
   956  		cos.JSON.ReturnStream(j)
   957  	}
   958  	return
   959  }
   // isBrowser reports whether the given User-Agent header value starts with the
   // "Mozilla/5.0" product token.
   func isBrowser(userAgent string) bool {
   	const browserUAPrefix = "Mozilla/5.0"
   	return strings.HasPrefix(userAgent, browserUAPrefix)
   }
   967  func (h *htrun) logerr(tag string, v any, err error) {
   968  	const maxl = 48
   969  	var efmt, msg string
   970  	if nlog.Stopping() {
   971  		return
   972  	}
   973  	if v != nil {
   974  		efmt = fmt.Sprintf("message: {%+v", v)
   975  		if len(efmt) > maxl {
   976  			efmt = efmt[:maxl] + "...}"
   977  		} else {
   978  			efmt += "}"
   979  		}
   980  	}
   981  	efmt = tag + " response error: %v, " + efmt + " at "
   982  	msg = fmt.Sprintf(efmt, err)
   983  	for i := 1; i < 4; i++ {
   984  		_, file, line, ok := runtime.Caller(i)
   985  		if !ok {
   986  			break
   987  		}
   988  		if i > 1 {
   989  			msg += " <- "
   990  		}
   991  		f := filepath.Base(file)
   992  		msg += fmt.Sprintf("%s:%d", f, line)
   993  	}
   994  	if cos.IsErrBrokenPipe(err) { // client went away
   995  		nlog.Infoln("Warning: " + msg)
   996  	} else {
   997  		nlog.Errorln(msg)
   998  	}
   999  	h.statsT.IncErr(stats.ErrHTTPWriteCount)
  1000  }
  // _parseNCopies converts a loosely typed 'copies' value into an integer.
  // Accepted inputs:
  //   - string:  base-10 integer that fits in 16 bits;
  //   - float64: (the type JSON numbers decode to) must be a whole number within the
  //     same 16-bit range; fractional, NaN and out-of-range values are rejected rather
  //     than silently truncated.
  //
  // Any other type yields an error.
  func _parseNCopies(value any) (copies int64, err error) {
  	const (
  		minCopies = -1 << 15  // same limits as ParseInt(..., 16) below
  		maxCopies = 1<<15 - 1 // ditto
  	)
  	switch v := value.(type) {
  	case string:
  		copies, err = strconv.ParseInt(v, 10, 16)
  	case float64:
  		// range check first: converting an out-of-range float to int64 is
  		// implementation-defined (NaN fails both comparisons and is caught below)
  		if v < minCopies || v > maxCopies {
  			return 0, fmt.Errorf("failed to parse 'copies' (%v) - value out of range", v)
  		}
  		copies = int64(v)
  		if float64(copies) != v { // fractional or NaN
  			return 0, fmt.Errorf("failed to parse 'copies' (%v) - not an integer", v)
  		}
  	default:
  		err = fmt.Errorf("failed to parse 'copies' (%v, %T) - unexpected type", value, value)
  	}
  	return copies, err
  }
  1014  func _checkAction(msg *apc.ActMsg, expectedActions ...string) (err error) {
  1015  	found := false
  1016  	for _, action := range expectedActions {
  1017  		found = found || msg.Action == action
  1018  	}
  1019  	if !found {
  1020  		err = fmt.Errorf(fmtErrInvaldAction, msg.Action, expectedActions)
  1021  	}
  1022  	return
  1023  }
  1025  //
  1026  // common cplane cont-d
  1027  //
// httpdaeget serves the GET /daemon requests that are handled here in htrun:
// it dispatches on the `what` query parameter (apc.QparamWhat), selects the
// corresponding value, and writes it as JSON via h.writeJSON.
//
// Exceptions:
//   - apc.WhatLog streams log content directly (see sendAllLogs, sendOneLog)
//     and returns without calling writeJSON;
//   - an unrecognized `what` results in an error response.
func (h *htrun) httpdaeget(w http.ResponseWriter, r *http.Request, query url.Values, htext htext) {
	var (
		body any
		what = query.Get(apc.QparamWhat)
	)
	switch what {
	case apc.WhatNodeConfig:
		var (
			c      cmn.Config
			config = cmn.GCO.Get()
		)
		// hide secret: respond with a copy of the config so that
		// masking the auth secret does not modify the global one
		c = *config
		c.Auth.Secret = "**********"
		body = &c
	case apc.WhatSmap:
		body = h.owner.smap.get()
	case apc.WhatBMD:
		body = h.owner.bmd.get()
	case apc.WhatSmapVote:
		var err error
		body, err = h.cluMeta(cmetaFillOpt{htext: htext, skipPrimeTime: true})
		if err != nil {
			// the error is only logged; whatever cluMeta returned is still written below
			nlog.Errorf("failed to fetch cluster config, err: %v", err)
		}
	case apc.WhatSnode:
		body =
	case apc.WhatLog:
		if cos.IsParseBool(query.Get(apc.QparamAllLogs)) {
			// all logs: archived into a temp directory that is removed once sent
			tempdir := h.sendAllLogs(w, r, query)
			if tempdir != "" {
				err := os.RemoveAll(tempdir)
				debug.AssertNoErr(err)
			}
		} else {
			h.sendOneLog(w, r, query)
		}
		return
	case apc.WhatNodeStats:
		statsNode := h.statsT.GetStats()
		statsNode.Snode =
		body = statsNode
	case apc.WhatNodeStatsV322:
		statsNode := h.statsT.GetStatsV322()
		statsNode.Snode =
		body = statsNode
	case apc.WhatMetricNames:
		body = h.statsT.GetMetricNames()
	case apc.WhatNodeStatsAndStatusV322:
		ds := h.statsAndStatusV322()
		daeStats := h.statsT.GetStatsV322()
		ds.Tracker = daeStats.Tracker
		body = ds
	default:
		h.writeErrf(w, r, "invalid GET /daemon request: unrecognized what=%s", what)
		return
	}
	h.writeJSON(w, r, body, "httpdaeget-"+what)
}
// statsAndStatus returns a new stats.NodeStatus populated with the current
// cluster map version, memory/CPU info, deployment type, build version and time,
// Kubernetes pod name (from the environment), and the node status computed by
// h._status for the current cluster map.
func (h *htrun) statsAndStatus() (ds *stats.NodeStatus) {
	smap := h.owner.smap.get()
	ds = &stats.NodeStatus{
		Node: stats.Node{
			Snode:,
		},
		SmapVersion:    smap.Version,
		MemCPUInfo:     apc.GetMemCPU(),
		DeploymentType: deploymentType(),
		Version:        daemon.version,
		BuildTime:      daemon.buildTime,
		K8sPodName:     os.Getenv(env.AIS.K8sPod),
		Status:         h._status(smap),
	}
	return ds
}
// [backward compatibility] v3.22 and prior
//
// statsAndStatusV322 is the stats.NodeStatusV322 counterpart of statsAndStatus:
// it fills in the same set of fields (cluster map version, memory/CPU info,
// deployment type, build version and time, K8s pod name, node status).
func (h *htrun) statsAndStatusV322() (ds *stats.NodeStatusV322) {
	smap := h.owner.smap.get()
	ds = &stats.NodeStatusV322{
		NodeV322: stats.NodeV322{
			Snode:,
		},
		SmapVersion:    smap.Version,
		MemCPUInfo:     apc.GetMemCPU(),
		DeploymentType: deploymentType(),
		Version:        daemon.version,
		BuildTime:      daemon.buildTime,
		K8sPodName:     os.Getenv(env.AIS.K8sPod),
		Status:         h._status(smap),
	}
	return ds
}
  1124  func (h *htrun) sendAllLogs(w http.ResponseWriter, r *http.Request, query url.Values) string {
  1125  	sev := query.Get(apc.QparamLogSev)
  1126  	tempdir, archname, err := h.targzLogs(sev)
  1127  	if err != nil {
  1128  		h.writeErr(w, r, err)
  1129  		return tempdir
  1130  	}
  1131  	fh, err := os.Open(archname)
  1132  	if err != nil {
  1133  		h.writeErr(w, r, err)
  1134  		return tempdir
  1135  	}
  1136  	buf, slab := h.gmm.Alloc()
  1137  	if written, err := io.CopyBuffer(w, fh, buf); err != nil {
  1138  		nlog.Errorf("failed to read %s: %v (written=%d)", archname, err, written)
  1139  	}
  1140  	cos.Close(fh)
  1141  	slab.Free(buf)
  1142  	return tempdir
  1143  }
  1145  func (h *htrun) sendOneLog(w http.ResponseWriter, r *http.Request, query url.Values) {
  1146  	sev := query.Get(apc.QparamLogSev)
  1147  	log, err := sev2Logname(sev)
  1148  	if err != nil {
  1149  		h.writeErr(w, r, err)
  1150  		return
  1151  	}
  1152  	fh, err := os.Open(log)
  1153  	if err != nil {
  1154  		ecode := http.StatusInternalServerError
  1155  		if os.IsNotExist(err) {
  1156  			ecode = http.StatusNotFound
  1157  		}
  1158  		h.writeErr(w, r, err, ecode)
  1159  		return
  1160  	}
  1161  	soff := query.Get(apc.QparamLogOff)
  1162  	if soff != "" {
  1163  		var (
  1164  			off   int64
  1165  			err   error
  1166  			finfo os.FileInfo
  1167  		)
  1168  		off, err = strconv.ParseInt(soff, 10, 64)
  1169  		if err == nil {
  1170  			finfo, err = os.Stat(log)
  1171  			if err == nil {
  1172  				if siz := finfo.Size(); off > siz {
  1173  					err = fmt.Errorf("log likely rotated (offset %d, size %d)", off, siz)
  1174  				}
  1175  			}
  1176  		}
  1177  		if err == nil {
  1178  			_, err = fh.Seek(off, io.SeekStart)
  1179  		}
  1180  		if err != nil {
  1181  			cos.Close(fh)
  1182  			h.writeErr(w, r, err)
  1183  			return
  1184  		}
  1185  	}
  1186  	buf, slab := h.gmm.Alloc()
  1187  	if written, err := io.CopyBuffer(w, fh, buf); err != nil {
  1188  		// at this point, http err must be already on its way
  1189  		nlog.Errorf("failed to read %s: %v (written=%d)", log, err, written)
  1190  	}
  1191  	cos.Close(fh)
  1192  	slab.Free(buf)
  1193  }
  1195  // see also: cli 'log get --all'
  1196  func (h *htrun) targzLogs(severity string) (tempdir, archname string, err error) {
  1197  	var (
  1198  		wfh      *os.File
  1199  		dentries []os.DirEntry
  1200  		logdir   = cmn.GCO.Get().LogDir
  1201  	)
  1202  	dentries, err = os.ReadDir(logdir)
  1203  	if err != nil {
  1204  		err = fmt.Errorf("read-dir %w", err)
  1205  		return
  1206  	}
  1207  	tempdir = filepath.Join(os.TempDir(), "aislogs-"+h.SID())
  1208  	err = cos.CreateDir(tempdir)
  1209  	if err != nil {
  1210  		err = fmt.Errorf("create-dir %w", err)
  1211  		return
  1212  	}
  1213  	wfh, err = os.CreateTemp(tempdir, "")
  1214  	if err != nil {
  1215  		err = fmt.Errorf("create-temp %w", err)
  1216  		return
  1217  	}
  1218  	archname = wfh.Name()
  1219  	aw := archive.NewWriter(archive.ExtTarGz, wfh, nil /*checksum*/, nil /*opts*/)
  1221  	defer func() {
  1222  		aw.Fini()
  1223  		wfh.Close()
  1224  	}()
  1226  	for _, dent := range dentries {
  1227  		if !dent.Type().IsRegular() {
  1228  			continue
  1229  		}
  1230  		finfo, errV := dent.Info()
  1231  		if errV != nil {
  1232  			continue
  1233  		}
  1234  		var (
  1235  			fullPath = filepath.Join(logdir, finfo.Name())
  1236  			rfh      *os.File
  1237  		)
  1238  		if !logname2Sev(fullPath, severity) {
  1239  			continue
  1240  		}
  1241  		rfh, err = os.Open(fullPath)
  1242  		if err != nil {
  1243  			if os.IsNotExist(err) {
  1244  				continue
  1245  			}
  1246  			return
  1247  		}
  1248  		oah := cos.SimpleOAH{Size: finfo.Size(), Atime: finfo.ModTime().UnixNano()}
  1249  		err = aw.Write(finfo.Name(), oah, rfh)
  1250  		rfh.Close()
  1251  		if err != nil {
  1252  			return
  1253  		}
  1254  	}
  1255  	return
  1256  }
  1258  func sev2Logname(severity string) (log string, err error) {
  1259  	var (
  1260  		dir = cmn.GCO.Get().LogDir
  1261  		sev = apc.LogInfo[0] // default
  1262  	)
  1263  	if severity != "" {
  1264  		sev = strings.ToLower(severity)[0]
  1265  	}
  1266  	switch sev {
  1267  	case apc.LogInfo[0]:
  1268  		log = filepath.Join(dir, nlog.InfoLogName())
  1269  	case apc.LogWarn[0], apc.LogErr[0]:
  1270  		log = filepath.Join(dir, nlog.ErrLogName())
  1271  	default:
  1272  		err = fmt.Errorf("unknown log severity %q", severity)
  1273  	}
  1274  	return
  1275  }
  1277  func logname2Sev(fname, severity string) bool {
  1278  	log, err := sev2Logname(severity)
  1279  	if err != nil {
  1280  		nlog.Warningln(err)
  1281  		return false
  1282  	}
  1283  	i := strings.LastIndexByte(log, '.')
  1284  	if i < 0 {
  1285  		nlog.Warningf("%q: unexpected log name format", log)
  1286  		return false
  1287  	}
  1288  	return strings.Contains(fname, log[i:])
  1289  }
  1291  //
  1292  // HTTP err + spec message + code + stats
  1293  //
// Silent is the value passed as the optional trailing argument that follows
// the error code in writeErr (see writeErrSilentf).
// NOTE(review): presumably makes cmn.WriteErr skip logging - confirm in cmn.
const Silent = 1
// writeErr writes the error as an HTTP response by delegating to cmn.WriteErr;
// the optional trailing arguments are passed through as is: [ecode[, silent]].
func (*htrun) writeErr(w http.ResponseWriter, r *http.Request, err error, ecode {
	cmn.WriteErr(w, r, err, ecode...) // [ecode[, silent]]
}
// writeErrMsg is the message (string) flavor of writeErr: it delegates to
// cmn.WriteErrMsg, passing the optional trailing arguments through as is:
// [ecode[, silent]].
func (*htrun) writeErrMsg(w http.ResponseWriter, r *http.Request, msg string, ecode {
	cmn.WriteErrMsg(w, r, msg, ecode...) // [ecode[, silent]]
}
  1305  func (h *htrun) writeErrSilentf(w http.ResponseWriter, r *http.Request, ecode int, format string, a ...any) {
  1306  	err := fmt.Errorf(format, a...)
  1307  	h.writeErr(w, r, err, ecode, Silent)
  1308  }
  1310  func (h *htrun) writeErrStatusf(w http.ResponseWriter, r *http.Request, ecode int, format string, a ...any) {
  1311  	err := fmt.Errorf(format, a...)
  1312  	h.writeErrMsg(w, r, err.Error(), ecode)
  1313  }
  1315  func (h *htrun) writeErrf(w http.ResponseWriter, r *http.Request, format string, a ...any) {
  1316  	err := fmt.Errorf(format, a...)
  1317  	if cos.IsNotExist(err, 0) {
  1318  		h.writeErrMsg(w, r, err.Error(), http.StatusNotFound)
  1319  	} else {
  1320  		h.writeErrMsg(w, r, err.Error())
  1321  	}
  1322  }
  1324  func (h *htrun) writeErrURL(w http.ResponseWriter, r *http.Request) {
  1325  	if r.URL.Scheme != "" {
  1326  		h.writeErrf(w, r, "request '%s %s://%s': invalid URL path", r.Method, r.URL.Scheme, r.URL.Path)
  1327  		return
  1328  	}
  1329  	// ignore GET /favicon.ico by Browsers
  1330  	if r.URL.Path == "/favicon.ico" || r.URL.Path == "favicon.ico" {
  1331  		return
  1332  	}
  1333  	h.writeErrf(w, r, "invalid request URI: '%s %s'", r.Method, r.RequestURI)
  1334  }
  1336  func (h *htrun) writeErrAct(w http.ResponseWriter, r *http.Request, action string) {
  1337  	err := cmn.InitErrHTTP(r, fmt.Errorf("invalid action %q", action), 0)
  1338  	h.writeErr(w, r, err)
  1339  	cmn.FreeHterr(err)
  1340  }
  1342  func (h *htrun) writeErrActf(w http.ResponseWriter, r *http.Request, action string,
  1343  	format string, a ...any) {
  1344  	detail := fmt.Sprintf(format, a...)
  1345  	err := cmn.InitErrHTTP(r, fmt.Errorf("invalid action %q: %s", action, detail), 0)
  1346  	h.writeErr(w, r, err)
  1347  	cmn.FreeHterr(err)
  1348  }
  1350  // also, validatePrefix
  1351  func (h *htrun) isValidObjname(w http.ResponseWriter, r *http.Request, name string) bool {
  1352  	if err := cmn.ValidateObjName(name); err != nil {
  1353  		h.writeErr(w, r, err)
  1354  		return false
  1355  	}
  1356  	return true
  1357  }
// health client
//
// reqHealth sends GET <health path> to the node `si` over its intra-control
// network, with the given query and timeout.
// Returns the response body, HTTP status, and error taken from the call result;
// both the call arguments and the call result are freed before returning.
func (h *htrun) reqHealth(si *meta.Snode, timeout time.Duration, query url.Values, smap *smapX) (b []byte, status int, err error) {
	var (
		path  = apc.URLPathHealth.S
		url   = si.URL(cmn.NetIntraControl)
		cargs = allocCargs()
	)
	{
		= si
		cargs.req = cmn.HreqArgs{Method: http.MethodGet, Base: url, Path: path, Query: query}
		cargs.timeout = timeout
	}
	res :=, smap)
	// copy the results out prior to returning `res` to its pool
	b, status, err = res.bytes, res.status, res.err
	freeCargs(cargs)
	freeCR(res)
	return
}
  1378  // - utilizes reqHealth (above) to discover a _better_ Smap, if exists
  1379  // - via
  1380  // - checkAll: query all nodes
  1381  // - consider adding max-ver BMD bit here as well (TODO)
  1382  func (h *htrun) bcastHealth(smap *smapX, checkAll bool) (*cifl.Info, int /*num confirmations*/) {
  1383  	if !smap.isValid() {
  1384  		nlog.Errorf("%s: cannot execute with invalid %s", h, smap)
  1385  		return nil, 0
  1386  	}
  1387  	c := getMaxCii{
  1388  		h:        h,
  1389  		maxCii:   &cifl.Info{},
  1390  		query:    url.Values{apc.QparamClusterInfo: []string{"true"}},
  1391  		timeout:  cmn.Rom.CplaneOperation(),
  1392  		checkAll: checkAll,
  1393  	}
  1394  	smap.fill(c.maxCii)
  1396  	h._bch(&c, smap, apc.Proxy)
  1397  	if checkAll || (c.cnt < maxVerConfirmations && smap.CountActiveTs() > 0) {
  1398  		h._bch(&c, smap, apc.Target)
  1399  	}
  1400  	nlog.Infoln(h.String()+":", c.maxCii.String())
  1401  	return c.maxCii, c.cnt
  1402  }
// _bch is the bcastHealth helper that queries nodes of one type: proxies
// (smap.Pmap) by default, targets (smap.Tmap) when nodeTy == apc.Target.
//
// Requests run concurrently under a limited wait group:
//   - checkAll: parallelism is capped at cmn.MaxParallelism();
//   - otherwise: capped at min(cmn.MaxParallelism(), 2 * maxVerConfirmations),
//     and the loop stops early once more than `count` requests have been
//     started and c.haveEnough() returns true.
//
// Nodes in maintenance or being decommissioned are skipped.
// Returns when all started requests have completed.
func (h *htrun) _bch(c *getMaxCii, smap *smapX, nodeTy string) {
	var (
		wg       cos.WG
		i, count int
		nodemap  = smap.Pmap
	)
	if nodeTy == apc.Target {
		nodemap = smap.Tmap
	}
	if c.checkAll {
		wg = cos.NewLimitedWaitGroup(cmn.MaxParallelism(), len(nodemap))
	} else {
		count = min(cmn.MaxParallelism(), maxVerConfirmations<<1)
		wg = cos.NewLimitedWaitGroup(count, len(nodemap) /*have*/)
	}
	for sid, si := range nodemap {
		if sid == {
			continue
		}
		if si.InMaintOrDecomm() {
			continue
		}
		// count == 0 when checkAll, in which case there's no early exit
		if count > 0 && count < len(nodemap) && i > count {
			if c.haveEnough() {
				break
			}
		}
		wg.Add(1)
		i++
		go, wg, smap)
	}
	wg.Wait()
}
  1438  //
  1439  // metasync Rx
  1440  //
  1442  func logmsync(ver int64, revs revs, msg *aisMsg, opts ...string) {
  1443  	const tag = "msync Rx:"
  1444  	var (
  1445  		what   string
  1446  		caller = opts[0]
  1447  		lv     = strconv.FormatInt(ver, 10)
  1448  	)
  1449  	if len(opts) == 1 {
  1450  		what = revs.String()
  1451  	} else {
  1452  		what = opts[1]
  1453  	}
  1454  	switch {
  1455  	case ver == revs.version():
  1456  		nlog.InfoDepth(1, tag, what, "(same v"+lv+",", msg.String(), "<--", caller+")")
  1457  	case ver > revs.version():
  1458  		nlog.InfoDepth(1, "Warning", tag, what, "(down from v"+lv+",", msg.String(), "<--", caller+")")
  1459  	default:
  1460  		nlog.InfoDepth(1, tag, "new", what, "(have v"+lv+",", msg.String(), "<--", caller+")")
  1461  	}
  1462  }
// extractConfig decodes the cluster config (and, when present, the accompanying
// action message) from the metasync payload.
//
// Returns:
//   - all nil when the payload has no config (revsConfTag) entry;
//   - an unmarshal error when decoding either the config or the message fails;
//   - newConfig == nil when the received version is not newer than the current
//     one; in addition, a downgrade error when it is strictly older.
func (h *htrun) extractConfig(payload msPayload, caller string) (newConfig *globalConfig, msg *aisMsg, err error) {
	if _, ok := payload[revsConfTag]; !ok {
		return
	}
	newConfig, msg = &globalConfig{}, &aisMsg{}
	confValue := payload[revsConfTag]
	reader := bytes.NewBuffer(confValue)
	if _, err1 := jsp.Decode(io.NopCloser(reader), newConfig, newConfig.JspOpts(), "extractConfig"); err1 != nil {
		err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "new Config", cos.BHead(confValue), err1)
		return
	}
	if msgValue, ok := payload[revsConfTag+revsActionTag]; ok {
		if err1 := jsoniter.Unmarshal(msgValue, msg); err1 != nil {
			err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "action message", cos.BHead(msgValue), err1)
			return
		}
	}
	config := cmn.GCO.Get()
	if cmn.Rom.FastV(4, cos.SmoduleAIS) {
		logmsync(config.Version, newConfig, msg, caller)
	}
	// same or older version: nothing to apply
	if newConfig.version() <= config.Version {
		if newConfig.version() < config.Version {
			err = newErrDowngrade(, config.String(), newConfig.String())
		}
		newConfig = nil
	}
	return
}
// extractEtlMD decodes ETL metadata (and, when present, the accompanying action
// message) from the metasync payload.
//
// Returns:
//   - all nil when the payload has no ETL MD (revsEtlMDTag) entry;
//   - an unmarshal error when decoding either the metadata or the message fails;
//   - newMD == nil when the received version is not newer than the current one;
//     in addition, a downgrade error when it is strictly older.
func (h *htrun) extractEtlMD(payload msPayload, caller string) (newMD *etlMD, msg *aisMsg, err error) {
	if _, ok := payload[revsEtlMDTag]; !ok {
		return
	}
	newMD, msg = newEtlMD(), &aisMsg{}
	etlMDValue := payload[revsEtlMDTag]
	reader := bytes.NewBuffer(etlMDValue)
	if _, err1 := jsp.Decode(io.NopCloser(reader), newMD, newMD.JspOpts(), "extractEtlMD"); err1 != nil {
		err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "new EtlMD", cos.BHead(etlMDValue), err1)
		return
	}
	if msgValue, ok := payload[revsEtlMDTag+revsActionTag]; ok {
		if err1 := jsoniter.Unmarshal(msgValue, msg); err1 != nil {
			err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "action message", cos.BHead(msgValue), err1)
			return
		}
	}
	etlMD := h.owner.etl.get()
	if cmn.Rom.FastV(4, cos.SmoduleAIS) {
		logmsync(etlMD.Version, newMD, msg, caller)
	}
	// same or older version: nothing to apply
	if newMD.version() <= etlMD.version() {
		if newMD.version() < etlMD.version() {
			err = newErrDowngrade(, etlMD.String(), newMD.String())
		}
		newMD = nil
	}
	return
}
// extractSmap decodes the cluster map (and, when present, the accompanying
// action message) from the metasync payload and - unless skipValidation is
// set - validates it against the current cluster map.
//
// Returns:
//   - all nil when the payload has no Smap (revsSmapTag) entry;
//   - an unmarshal error when decoding either the Smap or the message fails;
//   - the decoded Smap and message, with no further checks, when skipValidation;
//   - newSmap == nil when the received version equals the current one and the
//     message is not a rebalance action with a non-nil value;
//   - an error when the received Smap is invalid, does not pass the presence
//     check, or fails UUID validation (cluster integrity);
//   - for an older version: a downgrade error if the maps differ, or
//     newSmap == nil (with a warning) if smap.Compare reports them equal.
func (h *htrun) extractSmap(payload msPayload, caller string, skipValidation bool) (newSmap *smapX, msg *aisMsg, err error) {
	if _, ok := payload[revsSmapTag]; !ok {
		return
	}
	newSmap, msg = &smapX{}, &aisMsg{}
	smapValue := payload[revsSmapTag]
	reader := bytes.NewBuffer(smapValue)
	if _, err1 := jsp.Decode(io.NopCloser(reader), newSmap, newSmap.JspOpts(), "extractSmap"); err1 != nil {
		err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "new Smap", cos.BHead(smapValue), err1)
		return
	}
	if msgValue, ok := payload[revsSmapTag+revsActionTag]; ok {
		if err1 := jsoniter.Unmarshal(msgValue, msg); err1 != nil {
			err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "action message", cos.BHead(msgValue), err1)
			return
		}
	}
	if skipValidation {
		return
	}

	var (
		smap        = h.owner.smap.get()
		curVer      = smap.version()
		isManualReb = msg.Action == apc.ActRebalance && msg.Value != nil
	)
	// same version is a no-op unless the rebalance action carries a value
	if newSmap.version() == curVer && !isManualReb {
		newSmap = nil
		return
	}
	if !newSmap.isValid() {
		err = cmn.NewErrFailedTo(h, "extract", newSmap, newSmap.validate())
		return
	}
	if !newSmap.isPresent( {
		err = fmt.Errorf("%s: not finding ourselves in %s", h, newSmap)
		return
	}
	if err = smap.validateUUID(, newSmap, caller, 50 /* ciError */); err != nil {
		return // FATAL: cluster integrity error
	}
	if cmn.Rom.FastV(4, cos.SmoduleAIS) {
		logmsync(smap.Version, newSmap, msg, caller)
	}
	_, sameOrigin, _, eq := smap.Compare(&newSmap.Smap)
	debug.Assert(sameOrigin)
	if newSmap.version() < curVer {
		if !eq {
			err = newErrDowngrade(, smap.StringEx(), newSmap.StringEx())
			return
		}
		// older version but otherwise equal content: warn and ignore
		nlog.Warningf("%s: %s and %s are otherwise identical",, newSmap.StringEx(), smap.StringEx())
		newSmap = nil
	}
	return
}
// extractRMD decodes rebalance metadata (and, when present, the accompanying
// action message) from the metasync payload.
//
// Returns:
//   - all nil when the payload has no RMD (revsRMDTag) entry;
//   - an unmarshal error when decoding either the RMD or the message fails;
//   - newRMD == nil when the received version is not newer than the current one;
//     in addition, a downgrade error when it is strictly older.
//
// When both the received and the current RMD have non-empty but different
// cluster IDs, the resulting cluster integrity error is passed to cos.ExitLog
// (marked FATAL below).
func (h *htrun) extractRMD(payload msPayload, caller string) (newRMD *rebMD, msg *aisMsg, err error) {
	if _, ok := payload[revsRMDTag]; !ok {
		return
	}
	newRMD, msg = &rebMD{}, &aisMsg{}
	rmdValue := payload[revsRMDTag]
	if err1 := jsoniter.Unmarshal(rmdValue, newRMD); err1 != nil {
		err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "new RMD", cos.BHead(rmdValue), err1)
		return
	}
	if msgValue, ok := payload[revsRMDTag+revsActionTag]; ok {
		if err1 := jsoniter.Unmarshal(msgValue, msg); err1 != nil {
			err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "action message", cos.BHead(msgValue), err1)
			return
		}
	}

	rmd := h.owner.rmd.get()
	if newRMD.CluID != "" && newRMD.CluID != rmd.CluID && rmd.CluID != "" {
		logmsync(rmd.Version, newRMD, msg, caller)
		err = h.owner.rmd.newClusterIntegrityErr(h.String(), newRMD.CluID, rmd.CluID, rmd.Version)
		cos.ExitLog(err) // FATAL
	}

	if cmn.Rom.FastV(4, cos.SmoduleAIS) {
		logmsync(rmd.Version, newRMD, msg, caller)
	}
	// same or older version: nothing to apply
	if newRMD.version() <= rmd.version() {
		if newRMD.version() < rmd.version() {
			err = newErrDowngrade(, rmd.String(), newRMD.String())
		}
		newRMD = nil
	}
	return
}
// extractBMD decodes bucket metadata (and, when present, the accompanying
// action message) from the metasync payload.
//
// Returns:
//   - all nil when the payload has no BMD (revsBMDTag) entry;
//   - an unmarshal error when decoding either the BMD or the message fails;
//   - the decoded BMD without a version check when the (partially visible)
//     condition below that involves a non-empty msg.UUID holds -
//     see t.receiveBMD();
//   - newBMD == nil when the received version is not newer than the current one;
//     in addition, a downgrade error when it is strictly older.
func (h *htrun) extractBMD(payload msPayload, caller string) (newBMD *bucketMD, msg *aisMsg, err error) {
	if _, ok := payload[revsBMDTag]; !ok {
		return
	}
	newBMD, msg = &bucketMD{}, &aisMsg{}
	bmdValue := payload[revsBMDTag]
	reader := bytes.NewBuffer(bmdValue)
	if _, err1 := jsp.Decode(io.NopCloser(reader), newBMD, newBMD.JspOpts(), "extractBMD"); err1 != nil {
		err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "new BMD", cos.BHead(bmdValue), err1)
		return
	}
	if msgValue, ok := payload[revsBMDTag+revsActionTag]; ok {
		if err1 := jsoniter.Unmarshal(msgValue, msg); err1 != nil {
			err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "action message", cos.BHead(msgValue), err1)
			return
		}
	}
	bmd := h.owner.bmd.get()
	if cmn.Rom.FastV(4, cos.SmoduleAIS) {
		logmsync(bmd.Version, newBMD, msg, caller)
	}
	// skip older iff not transactional - see t.receiveBMD()
	if && msg.UUID != "" {
		return
	}
	if newBMD.version() <= bmd.version() {
		if newBMD.version() < bmd.version() {
			err = newErrDowngrade(, bmd.StringEx(), newBMD.StringEx())
		}
		newBMD = nil
	}
	return
}
// receiveSmap applies a received cluster map:
//   - nil newSmap is a no-op;
//   - the reception is always logged (see logmsync);
//   - returns an error if the presence check on the new map fails;
//   - otherwise, returns the result of h.owner.smap.synchronize, which also
//     receives the payload and the `cb` callback.
func (h *htrun) receiveSmap(newSmap *smapX, msg *aisMsg, payload msPayload, caller string, cb smapUpdatedCB) error {
	if newSmap == nil {
		return nil
	}
	smap := h.owner.smap.get()
	logmsync(smap.Version, newSmap, msg, caller, newSmap.StringEx())

	if !newSmap.isPresent( {
		return fmt.Errorf("%s: not finding self in the new %s", h, newSmap)
	}
	return h.owner.smap.synchronize(, newSmap, payload, cb)
}
// receiveEtlMD applies received ETL metadata:
//   - nil newEtlMD is a no-op;
//   - under the ETL owner's lock, the current metadata is re-read and compared:
//     same version returns nil, older version returns a downgrade error;
//   - otherwise, the new metadata is stored and persisted, and - after the lock
//     is released - the optional callback is invoked with (new, previous).
func (h *htrun) receiveEtlMD(newEtlMD *etlMD, msg *aisMsg, payload msPayload, caller string, cb func(ne, oe *etlMD)) (err error) {
	if newEtlMD == nil {
		return
	}
	etlMD := h.owner.etl.get()
	logmsync(etlMD.Version, newEtlMD, msg, caller)

	h.owner.etl.Lock()
	// re-read under lock: the version might have changed in between
	etlMD = h.owner.etl.get()
	if newEtlMD.version() <= etlMD.version() {
		h.owner.etl.Unlock()
		if newEtlMD.version() < etlMD.version() {
			err = newErrDowngrade(, etlMD.String(), newEtlMD.String())
		}
		return
	}
	err = h.owner.etl.putPersist(newEtlMD, payload)
	h.owner.etl.Unlock()
	debug.AssertNoErr(err)

	if cb != nil {
		cb(newEtlMD, etlMD)
	}
	return
}
// under lock
//
// _recvCfg applies a received cluster config:
//   - same version: no-op (returns nil);
//   - older version: returns a downgrade error;
//   - newer version: persists the new config and then updates the global one
//     (cmn.GCO), returning the first error encountered.
func (h *htrun) _recvCfg(newConfig *globalConfig, payload msPayload) (err error) {
	config := cmn.GCO.Get()
	if newConfig.version() <= config.Version {
		if newConfig.version() == config.Version {
			return
		}
		return newErrDowngrade(, config.String(), newConfig.String())
	}
	if err = h.owner.config.persist(newConfig, payload); err != nil {
		return
	}
	if err = cmn.GCO.Update(&newConfig.ClusterConfig); err != nil {
		return
	}
	return
}
  1708  func (h *htrun) extractRevokedTokenList(payload msPayload, caller string) (*tokenList, error) {
  1709  	var (
  1710  		msg       aisMsg
  1711  		bytes, ok = payload[revsTokenTag]
  1712  	)
  1713  	if !ok {
  1714  		return nil, nil
  1715  	}
  1716  	if msgValue, ok := payload[revsTokenTag+revsActionTag]; ok {
  1717  		if err := jsoniter.Unmarshal(msgValue, &msg); err != nil {
  1718  			err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "action message", cos.BHead(msgValue), err)
  1719  			return nil, err
  1720  		}
  1721  	}
  1722  	tokenList := &tokenList{}
  1723  	if err := jsoniter.Unmarshal(bytes, tokenList); err != nil {
  1724  		err = fmt.Errorf(cmn.FmtErrUnmarshal, h, "blocked token list", cos.BHead(bytes), err)
  1725  		return nil, err
  1726  	}
  1727  	nlog.Infof("extract token list from %q (count: %d, action: %q, uuid: %q)", caller,
  1728  		len(tokenList.Tokens), msg.Action, msg.UUID)
  1729  	return tokenList, nil
  1730  }
// ================================== Background =========================================
//
// Generally, AIStore clusters can be deployed with an arbitrary number of proxies.
// Each proxy/gateway provides full access to the clustered objects and collaborates with
// all other proxies to perform majority-voted HA failovers.
//
// Not all proxies are equal though.
//
// Two out of all proxies can be designated via configuration as "original" and
// "discovery." The "original" (located at the configurable "original_url") is expected
// to be the primary at cluster (initial) deployment time.
//
// Later on, when and if some HA event triggers an automated failover, the role of the
// primary may be (automatically) assumed by a different proxy/gateway, with the
// corresponding update getting synchronized across all running nodes.
// A new node, however, could potentially experience a problem when trying to join the
// cluster simply because its configuration would still be referring to the old primary.
// The added "discovery_url" is precisely intended to address this scenario.
//
// Here's how a node joins an AIStore cluster:
//   - first, there's the primary proxy/gateway referenced by the current cluster map
//     or - during the cluster deployment time - by the configured "primary_url"
//     (see /deploy/dev/local/)
//   - if that one fails, the new node goes ahead and tries the alternatives:
//   - config.Proxy.PrimaryURL   ("primary_url")
//   - config.Proxy.DiscoveryURL ("discovery_url")
//   - config.Proxy.OriginalURL  ("original_url")
//   - if these fail, we try the candidates provided by the caller.
//
// ================================== Background =========================================
//
// join makes up to 4 passes over the list of candidate URLs (sleeping between
// the passes), and returns the first successful registration result.
// If all attempts fail and the local cluster map is valid, it queries other
// nodes (bcastHealth) and tries the primary URL they report.
// Returns nil if the node is stopping or if there is nothing else to try;
// otherwise, the result of the last registration attempt.
func (h *htrun) join(query url.Values, htext htext, contactURLs ...string) (res *callResult) {
	var (
		config                   = cmn.GCO.Get()
		candidates               = make([]string, 0, 4+len(contactURLs))
		selfPublicURL, pubValid  = cos.ParseURL(
		selfIntraURL, intraValid = cos.ParseURL(
		resPrev                  *callResult
	)
	debug.Assert(pubValid && intraValid)

	// env goes first
	if daemon.EP != "" {
		candidates = _addCan(daemon.EP, selfPublicURL.Host, selfIntraURL.Host, candidates)
	}
	primaryURL, psi := h.getPrimaryURLAndSI(nil, config)
	candidates = _addCan(primaryURL, selfPublicURL.Host, selfIntraURL.Host, candidates)
	if psi != nil {
		candidates = _addCan(psi.URL(cmn.NetPublic), selfPublicURL.Host, selfIntraURL.Host, candidates)
	}
	candidates = _addCan(config.Proxy.PrimaryURL, selfPublicURL.Host, selfIntraURL.Host, candidates)
	candidates = _addCan(config.Proxy.DiscoveryURL, selfPublicURL.Host, selfIntraURL.Host, candidates)
	candidates = _addCan(config.Proxy.OriginalURL, selfPublicURL.Host, selfIntraURL.Host, candidates)
	for _, u := range contactURLs {
		candidates = _addCan(u, selfPublicURL.Host, selfIntraURL.Host, candidates)
	}

	sleep := max(2*time.Second, cmn.Rom.MaxKeepalive())
	for range 4 { // retry
		for _, candidateURL := range candidates {
			if nlog.Stopping() {
				return
			}
			// free the result of the previous (failed) attempt
			if resPrev != nil {
				freeCR(resPrev)
				resPrev = nil //nolint:ineffassign // readability
			}
			res = h.regTo(candidateURL, nil, apc.DefaultTimeout, query, htext, false /*keepalive*/)
			if res.err == nil {
				nlog.Infoln(h.String()+": primary responded Ok via", candidateURL)
				return // ok
			}
			resPrev = res
		}
		time.Sleep(sleep)
	}
	if resPrev != nil {
		freeCR(resPrev)
	}

	smap := h.owner.smap.get()
	if smap.validate() != nil {
		return
	}

	// Failed to join cluster using config, try getting primary URL using existing smap.
	cii, _ := h.bcastHealth(smap, false /*checkAll*/)
	if cii == nil || cii.Smap.Version < smap.version() {
		return
	}
	primaryURL = cii.Smap.Primary.PubURL

	// Daemon is stopping - skip registering
	if nlog.Stopping() {
		return
	}
	res = h.regTo(primaryURL, nil, apc.DefaultTimeout, query, htext, false /*keepalive*/)
	if res.err == nil {
		nlog.Infoln(h.String()+": joined cluster via", primaryURL)
	}
	return
}
  1834  func _addCan(url, selfPub, selfCtrl string, candidates []string) []string {
  1835  	if u, valid := cos.ParseURL(url); !valid || u.Host == selfPub || u.Host == selfCtrl {
  1836  		return candidates
  1837  	}
  1838  	if cos.StringInSlice(url, candidates) {
  1839  		return candidates
  1840  	}
  1841  	return append(candidates, url)
  1842  }
// regTo POSTs this node's cluster metadata (see cluMeta) to the given URL:
// to the keepalive path when `keepalive` is true, and to the auto-registration
// path otherwise.
//
//   - psi: destination node, if known (may be nil - see join);
//   - tout: request timeout;
//   - q: request query;
//   - keepalive: selects the path and reduces the metadata included in the
//     request body (RMD, config, and EtlMD are skipped).
//
// Always returns a non-nil call result; when cluMeta fails, the result carries
// that error and no request is made. The result is not freed here (callers in
// this file free it via freeCR).
func (h *htrun) regTo(url string, psi *meta.Snode, tout time.Duration, q url.Values, htext htext, keepalive bool) *callResult {
	var (
		path          string
		skipPrxKalive = || keepalive
		opts          = cmetaFillOpt{
			htext:         htext,
			skipSmap:      skipPrxKalive,
			skipBMD:       skipPrxKalive,
			skipRMD:       keepalive,
			skipConfig:    keepalive,
			skipEtlMD:     keepalive,
			fillRebMarker: !keepalive,
			skipPrimeTime: true,
		}
	)
	cm, err := h.cluMeta(opts)
	if err != nil {
		res := allocCR()
		res.err = err
		return res
	}

	if keepalive {
		path = apc.URLPathCluKalive.S
	} else {
		path = apc.URLPathCluAutoReg.S
	}
	cargs := allocCargs()
	{
		= psi
		cargs.req = cmn.HreqArgs{Method: http.MethodPost, Base: url, Path: path, Query: q, Body: cos.MustMarshal(cm)}
		cargs.timeout = tout
	}
	// the cluster map may have been skipped (above) - fall back to the current one
	smap := cm.Smap
	if smap == nil {
		smap = h.owner.smap.get()
	}
	res :=, smap)
	freeCargs(cargs)
	return res
}
// sendKalive sends a keepalive to the primary (as per getPrimaryURLAndSI).
//
//   - fast path (`fast`): a body-less POST to the keepalive path joined with
//     this node's ID; only the error is returned, `status` stays zero;
//   - slow path: regTo with keepalive=true; on failure, returns the resulting
//     status and error. An error that contains ciePrefix (cluster integrity
//     error) is passed to cos.ExitLog (marked FATAL below).
//
// Returns the primary's ID in all cases except when the node is stopping,
// in which case only the error is set.
func (h *htrun) sendKalive(smap *smapX, htext htext, timeout time.Duration, fast bool) (pid string, status int, err error) {
	if nlog.Stopping() {
		err = errors.New(h.String() + " is stopping")
		return
	}
	primaryURL, psi := h.getPrimaryURLAndSI(smap, nil)
	pid = psi.ID()

	if fast {
		// fast path
		debug.Assert(h.ClusterStarted())
		path := apc.URLPathCluKalive.Join(h.SID())
		cargs := allocCargs()
		{
			= psi
			cargs.req = cmn.HreqArgs{Method: http.MethodPost, Base: primaryURL, Path: path}
			cargs.timeout = timeout
		}
		res :=, smap)
		freeCargs(cargs)
		err = res.err
		freeCR(res)
		return
	}

	// slow path
	res := h.regTo(primaryURL, psi, timeout, nil, htext, true /*keepalive*/)
	if res.err != nil {
		if strings.Contains(res.err.Error(), ciePrefix) {
			cos.ExitLog(res.err) // FATAL: cluster integrity error (cie)
		}
		status, err = res.status, res.err
		freeCR(res)
		return
	}
	freeCR(res)
	return
}
  1925  func (h *htrun) getPrimaryURLAndSI(smap *smapX, config *cmn.Config) (string, *meta.Snode) {
  1926  	if smap == nil {
  1927  		smap = h.owner.smap.get()
  1928  	}
  1929  	if smap.validate() != nil {
  1930  		if config == nil {
  1931  			config = cmn.GCO.Get()
  1932  		}
  1933  		return config.Proxy.PrimaryURL, nil
  1934  	}
  1935  	return smap.Primary.URL(cmn.NetIntraControl), smap.Primary
  1936  }
  1938  func (h *htrun) pollClusterStarted(config *cmn.Config, psi *meta.Snode) (maxCii *cifl.Info) {
  1939  	var (
  1940  		sleep, total, rediscover time.Duration
  1941  		healthTimeout            = config.Timeout.CplaneOperation.D()
  1942  		query                    = url.Values{apc.QparamAskPrimary: []string{"true"}}
  1943  	)
  1944  	for {
  1945  		sleep = min(cmn.Rom.MaxKeepalive(), sleep+time.Second)
  1946  		time.Sleep(sleep)
  1947  		total += sleep
  1948  		rediscover += sleep
  1949  		if nlog.Stopping() {
  1950  			return
  1951  		}
  1952  		smap := h.owner.smap.get()
  1953  		if smap.validate() != nil {
  1954  			continue
  1955  		}
  1956  		if && smap.isPrimary( { // TODO: unlikely - see httpRequestNewPrimary
  1957  			nlog.Warningln(h.String(), "started as a non-primary and got _elected_ during startup")
  1958  			return
  1959  		}
  1960  		if _, _, err := h.reqHealth(smap.Primary, healthTimeout, query /*ask primary*/, smap); err == nil {
  1961  			// log
  1962  			s := fmt.Sprintf("%s via primary health: cluster startup Ok, %s",, smap.StringEx())
  1963  			if self := smap.GetNode(; self == nil {
  1964  				nlog.Warningln(s + "; NOTE: not present in the cluster map")
  1965  			} else if self.Flags.IsSet(meta.SnodeMaint) {
  1966 = self.Flags
  1967  				nlog.Warningln(s + "; NOTE: starting in maintenance mode")
  1968  			} else if rmd := h.owner.rmd.get(); rmd != nil && rmd.version() > 0 {
  1969  				if smap.UUID != rmd.CluID {
  1970  					if rmd.CluID != "" {
  1971  						err = h.owner.rmd.newClusterIntegrityErr(h.String(), smap.UUID, rmd.CluID, rmd.version())
  1972  						cos.ExitLog(err) // FATAL
  1973  					}
  1975  					nlog.Warningf("local copy of RMD v%d does not have cluster ID (expecting %q)",
  1976  						rmd.version(), smap.UUID)
  1977  					nlog.Infoln(s)
  1978  				} else {
  1979  					nlog.Infoln(s+",", rmd.String())
  1980  				}
  1981  			} else {
  1982  				nlog.Infoln(s)
  1983  			}
  1984  			return
  1985  		}
  1987  		if rediscover >= config.Timeout.Startup.D()/2 {
  1988  			rediscover = 0
  1989  			if cii, cnt := h.bcastHealth(smap, true /*checkAll*/); cii != nil && cii.Smap.Version > smap.version() {
  1990  				var pid string
  1991  				if psi != nil {
  1992  					pid = psi.ID()
  1993  				}
  1994  				if cii.Smap.Primary.ID != pid && cnt >= maxVerConfirmations {
  1995  					nlog.Warningf("%s: change of primary %s => %s - must rejoin",, pid, cii.Smap.Primary.ID)
  1996  					maxCii = cii
  1997  					return
  1998  				}
  1999  			}
  2000  		}
  2001  		if total > config.Timeout.Startup.D() {
  2002  			nlog.Errorln(h.String() + ": " + cmn.StartupMayTimeout)
  2003  		}
  2004  	}
  2005  }
  2007  func (h *htrun) unregisterSelf(ignoreErr bool) (err error) {
  2008  	var status int
  2009  	smap := h.owner.smap.get()
  2010  	if smap == nil || smap.validate() != nil {
  2011  		return
  2012  	}
  2013  	cargs := allocCargs()
  2014  	{
  2015 = smap.Primary
  2016  		cargs.req = cmn.HreqArgs{Method: http.MethodDelete, Path: apc.URLPathCluDaemon.Join(}
  2017  		cargs.timeout = apc.DefaultTimeout
  2018  	}
  2019  	res :=, smap)
  2020  	status, err = res.status, res.err
  2021  	if err != nil {
  2022  		f := nlog.Errorf
  2023  		if ignoreErr {
  2024  			f = nlog.Infof
  2025  		}
  2026  		f("%s: failed to unreg self, err: %v(%d)",, err, status)
  2027  	}
  2028  	freeCargs(cargs)
  2029  	freeCR(res)
  2030  	return
  2031  }
  2033  // via /health handler
  2034  func (h *htrun) externalWD(w http.ResponseWriter, r *http.Request) (responded bool) {
  2035  	callerID := r.Header.Get(apc.HdrCallerID)
  2036  	caller := r.Header.Get(apc.HdrCallerName)
  2037  	// external call
  2038  	if callerID == "" && caller == "" {
  2039  		readiness := cos.IsParseBool(r.URL.Query().Get(apc.QparamHealthReadiness))
  2040  		if cmn.Rom.FastV(5, cos.SmoduleAIS) {
  2041  			nlog.Infof("%s: external health-ping from %s (readiness=%t)",, r.RemoteAddr, readiness)
  2042  		}
  2043  		// respond with 503 as per
  2044  		// see also:
  2045  		// *
  2046  		if !readiness && !h.ClusterStarted() {
  2047  			w.WriteHeader(http.StatusServiceUnavailable)
  2048  		}
  2049  		// NOTE: for "readiness" check always return true; otherwise, true if cluster started
  2050  		return true
  2051  	}
  2052  	// intra-cluster health ping
  2053  	if !h.ensureIntraControl(w, r, false /* from primary */) {
  2054  		responded = true
  2055  	}
  2056  	return
  2057  }
  2059  //
  2060  // intra-cluster request validations and helpers
  2061  //
  2063  func (h *htrun) isIntraCall(hdr http.Header, fromPrimary bool) (err error) {
  2064  	debug.Assert(hdr != nil)
  2065  	var (
  2066  		smap       = h.owner.smap.get()
  2067  		callerID   = hdr.Get(apc.HdrCallerID)
  2068  		callerName = hdr.Get(apc.HdrCallerName)
  2069  		callerSver = hdr.Get(apc.HdrCallerSmapVer)
  2070  		callerVer  int64
  2071  		erP        error
  2072  	)
  2073  	if ok := callerID != "" && callerName != ""; !ok {
  2074  		return fmt.Errorf("%s: expected %s request", h, cmn.NetIntraControl)
  2075  	}
  2076  	if !smap.isValid() {
  2077  		return
  2078  	}
  2079  	caller := smap.GetNode(callerID)
  2080  	if ok := caller != nil && (!fromPrimary || smap.isPrimary(caller)); ok {
  2081  		return
  2082  	}
  2083  	if callerSver != smap.vstr && callerSver != "" {
  2084  		callerVer, erP = strconv.ParseInt(callerSver, 10, 64)
  2085  		if erP != nil {
  2086  			debug.AssertNoErr(erP)
  2087  			nlog.Errorln(erP)
  2088  			return
  2089  		}
  2090  		// we still trust the request when the sender's Smap is more current
  2091  		if callerVer > smap.version() {
  2092  			if h.ClusterStarted() {
  2093  				nlog.Errorf("%s: %s < Smap(v%s) from %s - proceeding anyway...", h, smap, callerSver, callerName)
  2094  			}
  2095  			runtime.Gosched()
  2096  			return
  2097  		}
  2098  	}
  2099  	if caller == nil {
  2100  		if !fromPrimary {
  2101  			// assume request from a newly joined node and proceed
  2102  			return nil
  2103  		}
  2104  		return fmt.Errorf("%s: expected %s from a valid node, %s", h, cmn.NetIntraControl, smap)
  2105  	}
  2106  	return fmt.Errorf("%s: expected %s from primary (and not %s), %s", h, cmn.NetIntraControl, caller, smap)
  2107  }
  2109  func (h *htrun) ensureIntraControl(w http.ResponseWriter, r *http.Request, onlyPrimary bool) (isIntra bool) {
  2110  	err := h.isIntraCall(r.Header, onlyPrimary)
  2111  	if err != nil {
  2112  		h.writeErr(w, r, err)
  2113  		return
  2114  	}
  2115  	if !cmn.GCO.Get().HostNet.UseIntraControl {
  2116  		return true // intra-control == pub
  2117  	}
  2118  	// NOTE: not checking r.RemoteAddr
  2119  	intraAddr :=
  2120  	srvAddr := r.Context().Value(http.ServerContextKey).(*http.Server).Addr
  2121  	if srvAddr == intraAddr {
  2122  		return true
  2123  	}
  2124  	h.writeErrf(w, r, "%s: expected %s request", h, cmn.NetIntraControl)
  2125  	return
  2126  }
  2128  func (h *htrun) uptime2hdr(hdr http.Header) {
  2129  	now := mono.NanoTime()
  2130  	hdr.Set(apc.HdrNodeUptime, strconv.FormatInt(now-h.startup.node.Load(), 10))
  2131  	hdr.Set(apc.HdrClusterUptime, strconv.FormatInt(now-h.startup.cluster.Load(), 10))
  2132  }
  2134  // NOTE: not checking vs Smap (yet)
  2135  func isT2TPut(hdr http.Header) bool { return hdr != nil && hdr.Get(apc.HdrT2TPutterID) != "" }
  2137  func isRedirect(q url.Values) (ptime string) {
  2138  	if len(q) == 0 || q.Get(apc.QparamProxyID) == "" {
  2139  		return
  2140  	}
  2141  	return q.Get(apc.QparamUnixTime)
  2142  }
  2144  func ptLatency(tts int64, ptime, isPrimary string) (dur int64) {
  2145  	pts, err := cos.S2UnixNano(ptime)
  2146  	if err != nil {
  2147  		debug.AssertNoErr(err)
  2148  		return
  2149  	}
  2150  	if ok, _ := cos.ParseBool(isPrimary); ok {
  2151  		xreg.PrimeTime.Store(pts)
  2152  		xreg.MyTime.Store(tts)
  2153  	}
  2154  	dur = tts - pts
  2155  	if dur < 0 && -dur < int64(clusterClockDrift) {
  2156  		dur = 0
  2157  	}
  2158  	return
  2159  }
  2161  //
  2162  // aisMsg reader & constructors
  2163  //
  2165  func (*htrun) readAisMsg(w http.ResponseWriter, r *http.Request) (msg *aisMsg, err error) {
  2166  	msg = &aisMsg{}
  2167  	err = cmn.ReadJSON(w, r, msg)
  2168  	return
  2169  }
  2171  func (msg *aisMsg) String() string {
  2172  	s := "aism[" + msg.Action
  2173  	if msg.UUID != "" {
  2174  		s += "[" + msg.UUID + "]"
  2175  	}
  2176  	if msg.Name != "" {
  2177  		s += ", name=" + msg.Name
  2178  	}
  2179  	return s + "]"
  2180  }
  2182  func (msg *aisMsg) StringEx() (s string) {
  2183  	s = msg.String()
  2184  	vs, err := jsoniter.Marshal(msg.Value)
  2185  	debug.AssertNoErr(err)
  2186  	s += ",(" + strings.ReplaceAll(string(vs), ",", ", ") + ")"
  2187  	return
  2188  }
  2190  func (h *htrun) newAmsgStr(msgStr string, bmd *bucketMD) *aisMsg {
  2191  	return h.newAmsg(&apc.ActMsg{Value: msgStr}, bmd)
  2192  }
  2194  func (h *htrun) newAmsgActVal(act string, val any) *aisMsg {
  2195  	return h.newAmsg(&apc.ActMsg{Action: act, Value: val}, nil)
  2196  }
  2198  func (h *htrun) newAmsg(actionMsg *apc.ActMsg, bmd *bucketMD, uuid ...string) *aisMsg {
  2199  	msg := &aisMsg{ActMsg: *actionMsg}
  2200  	if bmd != nil {
  2201  		msg.BMDVersion = bmd.Version
  2202  	} else {
  2203  		msg.BMDVersion = h.owner.bmd.Get().Version
  2204  	}
  2205  	if len(uuid) > 0 {
  2206  		msg.UUID = uuid[0]
  2207  	}
  2208  	return msg
  2209  }
  2211  // apc.ActMsg c-tor and reader
  2212  func (*htrun) readActionMsg(w http.ResponseWriter, r *http.Request) (msg *apc.ActMsg, err error) {
  2213  	msg = &apc.ActMsg{}
  2214  	err = cmn.ReadJSON(w, r, msg)
  2215  	return
  2216  }
  2218  // cmn.ReadJSON with the only difference: EOF is ok
  2219  func readJSON(w http.ResponseWriter, r *http.Request, out any) (err error) {
  2220  	err = jsoniter.NewDecoder(r.Body).Decode(out)
  2221  	cos.Close(r.Body)
  2222  	if err == nil || err == io.EOF {
  2223  		return nil
  2224  	}
  2225  	return cmn.WriteErrJSON(w, r, out, err)
  2226  }
  2228  // (via apc.WhatNodeStatsAndStatus)
  2229  func (h *htrun) _status(smap *smapX) (daeStatus string) {
  2230  	self := smap.GetNode( // updated flags
  2231  	switch {
  2232  	case self.Flags.IsSet(meta.SnodeMaint):
  2233  		daeStatus = apc.NodeMaintenance
  2234  	case self.Flags.IsSet(meta.SnodeDecomm):
  2235  		daeStatus = apc.NodeDecommission
  2236  	}
  2237  	return
  2238  }
  2240  ////////////////
  2241  // callResult //
  2242  ////////////////
  2244  // error helpers for intra-cluster calls
  2246  func (res *callResult) unwrap() (err error) {
  2247  	err = errors.Unwrap(res.err)
  2248  	if err == nil {
  2249  		err = res.err
  2250  	}
  2251  	return
  2252  }
  2254  func (res *callResult) toErr() error {
  2255  	if res.err == nil {
  2256  		return nil
  2257  	}
  2258  	// is cmn.ErrHTTP
  2259  	if herr := cmn.Err2HTTPErr(res.err); herr != nil {
  2260  		// add status, details
  2261  		if res.status >= http.StatusBadRequest {
  2262  			herr.Status = res.status
  2263  		}
  2264  		if herr.Message == "" {
  2265  			herr.Message = res.details
  2266  		}
  2267  		return herr
  2268  	}
  2269  	// res => cmn.ErrHTTP
  2270  	if res.status >= http.StatusBadRequest {
  2271  		var detail string
  2272  		if res.details != "" {
  2273  			detail = "[" + res.details + "]"
  2274  		}
  2275  		return res.herr(nil, fmt.Sprintf("%v%s", res.err, detail))
  2276  	}
  2277  	if res.details == "" {
  2278  		return res.err
  2279  	}
  2280  	return cmn.NewErrFailedTo(nil, "call ", res.details, res.err)
  2281  }
  2283  func (res *callResult) herr(r *http.Request, msg string) *cmn.ErrHTTP {
  2284  	orig := &cmn.ErrHTTP{}
  2285  	if e := jsoniter.Unmarshal([]byte(msg), orig); e == nil {
  2286  		return orig
  2287  	}
  2288  	nherr := cmn.NewErrHTTP(r, errors.New(msg), res.status)
  2289  	if != nil {
  2290  		nherr.Node =
  2291  	}
  2292  	return nherr
  2293  }
  2295  func (res *callResult) errorf(format string, a ...any) error {
  2296  	debug.Assert(res.err != nil)
  2297  	// add formatted
  2298  	msg := fmt.Sprintf(format, a...)
  2299  	if herr := cmn.Err2HTTPErr(res.err); herr != nil {
  2300  		herr.Message = msg + ": " + herr.Message
  2301  		res.err = herr
  2302  	} else {
  2303  		res.err = errors.New(msg + ": " + res.err.Error())
  2304  	}
  2305  	return res.toErr()
  2306  }