github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/prxetl.go (about)

     1  // Package ais provides core functionality for the AIStore object storage.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package ais
     6  
     7  import (
     8  	"io"
     9  	"net/http"
    10  	"net/url"
    11  	"reflect"
    12  	"sort"
    13  	"strconv"
    14  
    15  	"github.com/NVIDIA/aistore/api/apc"
    16  	"github.com/NVIDIA/aistore/cmn"
    17  	"github.com/NVIDIA/aistore/cmn/cos"
    18  	"github.com/NVIDIA/aistore/cmn/debug"
    19  	"github.com/NVIDIA/aistore/cmn/k8s"
    20  	"github.com/NVIDIA/aistore/cmn/nlog"
    21  	"github.com/NVIDIA/aistore/ext/etl"
    22  )
    23  
    24  // TODO: support start/stop/list using `xid`
    25  
    26  // [METHOD] /v1/etl
    27  func (p *proxy) etlHandler(w http.ResponseWriter, r *http.Request) {
    28  	if !p.cluStartedWithRetry() {
    29  		w.WriteHeader(http.StatusServiceUnavailable)
    30  		return
    31  	}
    32  	switch {
    33  	case r.Method == http.MethodPut:
    34  		// require Admin access (a no-op if AuthN is not used, here and elsewhere)
    35  		if err := p.checkAccess(w, r, nil, apc.AceAdmin); err != nil {
    36  			return
    37  		}
    38  		p.handleETLPut(w, r)
    39  	case r.Method == http.MethodPost:
    40  		p.handleETLPost(w, r)
    41  	case r.Method == http.MethodGet:
    42  		p.handleETLGet(w, r)
    43  	case r.Method == http.MethodDelete:
    44  		// ditto
    45  		if err := p.checkAccess(w, r, nil, apc.AceAdmin); err != nil {
    46  			return
    47  		}
    48  		p.handleETLDelete(w, r)
    49  	default:
    50  		cmn.WriteErr405(w, r, http.MethodDelete, http.MethodGet, http.MethodPost)
    51  	}
    52  }
    53  
    54  // GET /v1/etl
    55  func (p *proxy) handleETLGet(w http.ResponseWriter, r *http.Request) {
    56  	apiItems, err := p.parseURL(w, r, apc.URLPathETL.L, 0, true)
    57  	if err != nil {
    58  		return
    59  	}
    60  
    61  	if len(apiItems) == 0 {
    62  		p.listETL(w, r)
    63  		return
    64  	}
    65  
    66  	// /v1/etl/<etl-name>
    67  	if len(apiItems) == 1 {
    68  		p.infoETL(w, r, apiItems[0])
    69  		return
    70  	}
    71  
    72  	switch apiItems[1] {
    73  	case apc.ETLLogs:
    74  		// /v1/etl/<etl-name>/logs[/<target-id>]
    75  		p.logsETL(w, r, apiItems[0], apiItems[2:]...)
    76  	case apc.ETLHealth:
    77  		// /v1/etl/<etl-name>/health
    78  		p.healthETL(w, r)
    79  	case apc.ETLMetrics:
    80  		// /v1/etl/<etl-name>/metrics
    81  		p.metricsETL(w, r)
    82  	default:
    83  		p.writeErrURL(w, r)
    84  	}
    85  }
    86  
    87  // PUT /v1/etl
    88  // Validate and start a new ETL instance:
    89  //   - validate user-provided code/pod specification.
    90  //   - broadcast `etl.InitMsg` to all targets.
    91  //   - (as usual) if any target fails to start ETL stop it on all (targets).
    92  //     otherwise:
    93  //   - add the new ETL instance (represented by the user-specified `etl.InitMsg`) to cluster MD
    94  //   - return ETL UUID to the user.
    95  func (p *proxy) handleETLPut(w http.ResponseWriter, r *http.Request) {
    96  	if _, err := p.parseURL(w, r, apc.URLPathETL.L, 0, false); err != nil {
    97  		return
    98  	}
    99  	if p.forwardCP(w, r, nil, "init ETL") {
   100  		return
   101  	}
   102  
   103  	b, err := io.ReadAll(r.Body)
   104  	if err != nil {
   105  		p.writeErr(w, r, err)
   106  		return
   107  	}
   108  	r.Body.Close()
   109  
   110  	initMsg, err := etl.UnmarshalInitMsg(b)
   111  	if err != nil {
   112  		p.writeErr(w, r, err)
   113  		return
   114  	}
   115  	if err := initMsg.Validate(); err != nil {
   116  		p.writeErr(w, r, err)
   117  		return
   118  	}
   119  
   120  	// must be new
   121  	etlMD := p.owner.etl.get()
   122  	if etlMD.get(initMsg.Name()) != nil {
   123  		p.writeErrf(w, r, "%s: etl[%s] already exists", p, initMsg.Name())
   124  		return
   125  	}
   126  
   127  	// add to cluster MD and start running
   128  	if err := p.startETL(w, initMsg, true /*add to etlMD*/); err != nil {
   129  		p.writeErr(w, r, err)
   130  		return
   131  	}
   132  	if cmn.Rom.FastV(4, cos.SmoduleETL) {
   133  		nlog.Infoln(p.String() + ": " + initMsg.String())
   134  	}
   135  }
   136  
   137  // POST /v1/etl/<etl-name>/stop (or) /v1/etl/<etl-name>/start
   138  // start/stop ETL pods
   139  func (p *proxy) handleETLPost(w http.ResponseWriter, r *http.Request) {
   140  	apiItems, err := p.parseURL(w, r, apc.URLPathETL.L, 2, true)
   141  	if err != nil {
   142  		return
   143  	}
   144  	etlName := apiItems[0]
   145  	if err := k8s.ValidateEtlName(etlName); err != nil {
   146  		p.writeErr(w, r, err)
   147  		return
   148  	}
   149  	etlMD := p.owner.etl.get()
   150  	etlMsg := etlMD.get(etlName)
   151  	if etlMsg == nil {
   152  		p.writeErr(w, r, cos.NewErrNotFound(p, "etl job "+etlName))
   153  		return
   154  	}
   155  
   156  	switch op := apiItems[1]; op {
   157  	case apc.ETLStop:
   158  		p.stopETL(w, r)
   159  	case apc.ETLStart:
   160  		p.startETL(w, etlMsg, false /*add to etlMD*/)
   161  	default:
   162  		debug.Assert(false, "invalid operation: "+op)
   163  		p.writeErrURL(w, r)
   164  	}
   165  }
   166  
   167  // DELETE /v1/etl/<etl-name>
   168  func (p *proxy) handleETLDelete(w http.ResponseWriter, r *http.Request) {
   169  	apiItems, err := p.parseURL(w, r, apc.URLPathETL.L, 1, true)
   170  	if err != nil {
   171  		return
   172  	}
   173  
   174  	if p.forwardCP(w, r, nil, "delete ETL") {
   175  		return
   176  	}
   177  
   178  	etlName := apiItems[0]
   179  	if err := k8s.ValidateEtlName(etlName); err != nil {
   180  		p.writeErr(w, r, err)
   181  		return
   182  	}
   183  	ctx := &etlMDModifier{
   184  		pre:     p._deleteETLPre,
   185  		final:   p._syncEtlMDFinal,
   186  		etlName: etlName,
   187  	}
   188  	if _, err := p.owner.etl.modify(ctx); err != nil {
   189  		p.writeErr(w, r, err)
   190  	}
   191  }
   192  
   193  func (p *proxy) _deleteETLPre(ctx *etlMDModifier, clone *etlMD) (err error) {
   194  	debug.AssertNoErr(k8s.ValidateEtlName(ctx.etlName))
   195  	if exists := clone.del(ctx.etlName); !exists {
   196  		err = cos.NewErrNotFound(p, "etl job "+ctx.etlName)
   197  	}
   198  	return
   199  }
   200  
   201  // broadcast (start ETL) request to all targets
   202  func (p *proxy) startETL(w http.ResponseWriter, msg etl.InitMsg, addToMD bool) error {
   203  	var (
   204  		err  error
   205  		args = allocBcArgs()
   206  		xid  = etl.PrefixXactID + cos.GenUUID()
   207  	)
   208  	{
   209  		args.req = cmn.HreqArgs{
   210  			Method: http.MethodPut,
   211  			Path:   apc.URLPathETL.S,
   212  			Body:   cos.MustMarshal(msg),
   213  			Query:  url.Values{apc.QparamUUID: []string{xid}},
   214  		}
   215  		args.timeout = apc.LongTimeout
   216  	}
   217  	results := p.bcastGroup(args)
   218  	freeBcArgs(args)
   219  	for _, res := range results {
   220  		if res.err == nil {
   221  			continue
   222  		}
   223  		err = res.toErr()
   224  		nlog.Errorln(err)
   225  	}
   226  	freeBcastRes(results)
   227  
   228  	if err != nil {
   229  		// At least one target failed. Terminate all.
   230  		// (Termination calls may succeed for the targets that already succeeded in starting ETL,
   231  		//  or fail otherwise - ignore the failures).
   232  		argsTerm := allocBcArgs()
   233  		argsTerm.req = cmn.HreqArgs{Method: http.MethodPost, Path: apc.URLPathETL.Join(msg.Name(), apc.ETLStop)}
   234  		argsTerm.timeout = apc.LongTimeout
   235  		p.bcastGroup(argsTerm)
   236  		freeBcArgs(argsTerm)
   237  		return err
   238  	}
   239  
   240  	if addToMD {
   241  		ctx := &etlMDModifier{
   242  			pre:   _addETLPre,
   243  			final: p._syncEtlMDFinal,
   244  			msg:   msg,
   245  			wait:  true,
   246  		}
   247  		p.owner.etl.modify(ctx)
   248  	}
   249  	// All init calls succeeded - return running xaction
   250  	w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(xid)))
   251  	w.Write(cos.UnsafeB(xid))
   252  	return nil
   253  }
   254  
   255  func _addETLPre(ctx *etlMDModifier, clone *etlMD) (_ error) {
   256  	debug.Assert(ctx.msg != nil)
   257  	clone.add(ctx.msg)
   258  	return
   259  }
   260  
   261  func (p *proxy) _syncEtlMDFinal(ctx *etlMDModifier, clone *etlMD) {
   262  	wg := p.metasyncer.sync(revsPair{clone, p.newAmsgStr("etl-reg", nil)})
   263  	if ctx.wait {
   264  		wg.Wait()
   265  	}
   266  }
   267  
   268  // GET /v1/etl/<etl-name>
   269  func (p *proxy) infoETL(w http.ResponseWriter, r *http.Request, etlName string) {
   270  	if err := k8s.ValidateEtlName(etlName); err != nil {
   271  		p.writeErr(w, r, err)
   272  		return
   273  	}
   274  
   275  	etlMD := p.owner.etl.get()
   276  	initMsg := etlMD.get(etlName)
   277  	if initMsg == nil {
   278  		p.writeErr(w, r, cos.NewErrNotFound(p, "etl job "+etlName))
   279  		return
   280  	}
   281  	p.writeJSON(w, r, initMsg, "info-etl")
   282  }
   283  
   284  // GET /v1/etl
   285  func (p *proxy) listETL(w http.ResponseWriter, r *http.Request) {
   286  	var (
   287  		args = allocBcArgs()
   288  		etls *etl.InfoList
   289  	)
   290  	args.req = cmn.HreqArgs{Method: http.MethodGet, Path: apc.URLPathETL.S}
   291  	args.timeout = apc.DefaultTimeout
   292  	args.cresv = cresEI{} // -> etl.InfoList
   293  	results := p.bcastGroup(args)
   294  	freeBcArgs(args)
   295  
   296  	for _, res := range results {
   297  		if res.err != nil {
   298  			p.writeErr(w, r, res.toErr())
   299  			freeBcastRes(results)
   300  			return
   301  		}
   302  
   303  		if etls == nil {
   304  			etls = res.v.(*etl.InfoList)
   305  			sort.Sort(etls)
   306  		} else {
   307  			another := res.v.(*etl.InfoList)
   308  			sort.Sort(another)
   309  			if !reflect.DeepEqual(etls, another) {
   310  				// TODO: Should we return an error to a user?
   311  				// Or stop mismatching ETLs and return internal server error?
   312  				nlog.Warningf("Targets returned different ETLs: %v vs %v", etls, another)
   313  			}
   314  		}
   315  	}
   316  	freeBcastRes(results)
   317  	if etls == nil {
   318  		etls = &etl.InfoList{}
   319  	}
   320  	p.writeJSON(w, r, *etls, "list-etl")
   321  }
   322  
   323  // GET /v1/etl/<etl-name>/logs[/<target_id>]
   324  func (p *proxy) logsETL(w http.ResponseWriter, r *http.Request, etlName string, apiItems ...string) {
   325  	var (
   326  		results sliceResults
   327  		args    *bcastArgs
   328  	)
   329  	if len(apiItems) > 0 {
   330  		// specific target
   331  		var (
   332  			tid  = apiItems[0]
   333  			smap = p.owner.smap.get()
   334  			si   = smap.GetTarget(tid)
   335  		)
   336  		if si == nil {
   337  			p.writeErrf(w, r, "unknown target %q", tid)
   338  			return
   339  		}
   340  		results = make(sliceResults, 1)
   341  		cargs := allocCargs()
   342  		{
   343  			cargs.req = cmn.HreqArgs{Method: http.MethodGet, Path: apc.URLPathETL.Join(etlName, apc.ETLLogs)}
   344  			cargs.si = si
   345  			cargs.timeout = apc.DefaultTimeout
   346  			cargs.cresv = cresEL{} // -> etl.Logs
   347  		}
   348  		results[0] = p.call(cargs, smap)
   349  		freeCargs(cargs)
   350  	} else {
   351  		// all targets
   352  		args = allocBcArgs()
   353  		args.req = cmn.HreqArgs{Method: http.MethodGet, Path: r.URL.Path}
   354  		args.timeout = apc.DefaultTimeout
   355  		args.cresv = cresEL{} // -> etl.Logs
   356  		results = p.bcastGroup(args)
   357  		freeBcArgs(args)
   358  	}
   359  	logs := make(etl.LogsByTarget, 0, len(results))
   360  	for _, res := range results {
   361  		if res.err != nil {
   362  			p.writeErr(w, r, res.toErr())
   363  			freeBcastRes(results)
   364  			return
   365  		}
   366  		logs = append(logs, *res.v.(*etl.Logs))
   367  	}
   368  	freeBcastRes(results)
   369  	p.writeJSON(w, r, logs, "logs-etl")
   370  }
   371  
   372  // GET /v1/etl/<etl-name>/health
   373  func (p *proxy) healthETL(w http.ResponseWriter, r *http.Request) {
   374  	var (
   375  		results sliceResults
   376  		args    *bcastArgs
   377  	)
   378  	args = allocBcArgs()
   379  	args.req = cmn.HreqArgs{Method: http.MethodGet, Path: r.URL.Path}
   380  	results = p.bcastGroup(args)
   381  	defer freeBcastRes(results)
   382  	freeBcArgs(args)
   383  
   384  	healths := make(etl.HealthByTarget, 0, len(results))
   385  	for _, res := range results {
   386  		if res.err != nil {
   387  			p.writeErr(w, r, res.toErr(), res.status)
   388  			return
   389  		}
   390  		msg := etl.HealthStatus{
   391  			TargetID: res.si.ID(),
   392  			Status:   string(res.bytes),
   393  		}
   394  		healths = append(healths, &msg)
   395  	}
   396  	p.writeJSON(w, r, healths, "health-etl")
   397  }
   398  
   399  // GET /v1/etl/<etl-name>/metrics
   400  func (p *proxy) metricsETL(w http.ResponseWriter, r *http.Request) {
   401  	var (
   402  		results sliceResults
   403  		args    *bcastArgs
   404  	)
   405  	args = allocBcArgs()
   406  	args.req = cmn.HreqArgs{Method: http.MethodGet, Path: r.URL.Path}
   407  	args.timeout = apc.DefaultTimeout
   408  	args.cresv = cresEM{} // -> etl.CPUMemByTarget
   409  	results = p.bcastGroup(args)
   410  	defer freeBcastRes(results)
   411  	freeBcArgs(args)
   412  
   413  	metrics := make(etl.CPUMemByTarget, 0, len(results))
   414  	for _, res := range results {
   415  		if res.err != nil {
   416  			p.writeErr(w, r, res.toErr(), res.status)
   417  			return
   418  		}
   419  		metrics = append(metrics, res.v.(*etl.CPUMemUsed))
   420  	}
   421  	sort.SliceStable(metrics, func(i, j int) bool { return metrics[i].TargetID < metrics[j].TargetID })
   422  	p.writeJSON(w, r, metrics, "metrics-etl")
   423  }
   424  
   425  // POST /v1/etl/<etl-name>/stop
   426  func (p *proxy) stopETL(w http.ResponseWriter, r *http.Request) {
   427  	args := allocBcArgs()
   428  	args.req = cmn.HreqArgs{Method: http.MethodPost, Path: r.URL.Path}
   429  	args.timeout = apc.LongTimeout
   430  	results := p.bcastGroup(args)
   431  	freeBcArgs(args)
   432  	for _, res := range results {
   433  		if res.err == nil {
   434  			continue
   435  		}
   436  		p.writeErr(w, r, res.toErr())
   437  		break
   438  	}
   439  	freeBcastRes(results)
   440  }