github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/prxetl.go (about) 1 // Package ais provides core functionality for the AIStore object storage. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package ais 6 7 import ( 8 "io" 9 "net/http" 10 "net/url" 11 "reflect" 12 "sort" 13 "strconv" 14 15 "github.com/NVIDIA/aistore/api/apc" 16 "github.com/NVIDIA/aistore/cmn" 17 "github.com/NVIDIA/aistore/cmn/cos" 18 "github.com/NVIDIA/aistore/cmn/debug" 19 "github.com/NVIDIA/aistore/cmn/k8s" 20 "github.com/NVIDIA/aistore/cmn/nlog" 21 "github.com/NVIDIA/aistore/ext/etl" 22 ) 23 24 // TODO: support start/stop/list using `xid` 25 26 // [METHOD] /v1/etl 27 func (p *proxy) etlHandler(w http.ResponseWriter, r *http.Request) { 28 if !p.cluStartedWithRetry() { 29 w.WriteHeader(http.StatusServiceUnavailable) 30 return 31 } 32 switch { 33 case r.Method == http.MethodPut: 34 // require Admin access (a no-op if AuthN is not used, here and elsewhere) 35 if err := p.checkAccess(w, r, nil, apc.AceAdmin); err != nil { 36 return 37 } 38 p.handleETLPut(w, r) 39 case r.Method == http.MethodPost: 40 p.handleETLPost(w, r) 41 case r.Method == http.MethodGet: 42 p.handleETLGet(w, r) 43 case r.Method == http.MethodDelete: 44 // ditto 45 if err := p.checkAccess(w, r, nil, apc.AceAdmin); err != nil { 46 return 47 } 48 p.handleETLDelete(w, r) 49 default: 50 cmn.WriteErr405(w, r, http.MethodDelete, http.MethodGet, http.MethodPost) 51 } 52 } 53 54 // GET /v1/etl 55 func (p *proxy) handleETLGet(w http.ResponseWriter, r *http.Request) { 56 apiItems, err := p.parseURL(w, r, apc.URLPathETL.L, 0, true) 57 if err != nil { 58 return 59 } 60 61 if len(apiItems) == 0 { 62 p.listETL(w, r) 63 return 64 } 65 66 // /v1/etl/<etl-name> 67 if len(apiItems) == 1 { 68 p.infoETL(w, r, apiItems[0]) 69 return 70 } 71 72 switch apiItems[1] { 73 case apc.ETLLogs: 74 // /v1/etl/<etl-name>/logs[/<target-id>] 75 p.logsETL(w, r, apiItems[0], apiItems[2:]...) 76 case apc.ETLHealth: 77 // /v1/etl/<etl-name>/health 78 p.healthETL(w, r) 79 case apc.ETLMetrics: 80 // /v1/etl/<etl-name>/metrics 81 p.metricsETL(w, r) 82 default: 83 p.writeErrURL(w, r) 84 } 85 } 86 87 // PUT /v1/etl 88 // Validate and start a new ETL instance: 89 // - validate user-provided code/pod specification. 90 // - broadcast `etl.InitMsg` to all targets. 91 // - (as usual) if any target fails to start ETL stop it on all (targets). 92 // otherwise: 93 // - add the new ETL instance (represented by the user-specified `etl.InitMsg`) to cluster MD 94 // - return ETL UUID to the user. 95 func (p *proxy) handleETLPut(w http.ResponseWriter, r *http.Request) { 96 if _, err := p.parseURL(w, r, apc.URLPathETL.L, 0, false); err != nil { 97 return 98 } 99 if p.forwardCP(w, r, nil, "init ETL") { 100 return 101 } 102 103 b, err := io.ReadAll(r.Body) 104 if err != nil { 105 p.writeErr(w, r, err) 106 return 107 } 108 r.Body.Close() 109 110 initMsg, err := etl.UnmarshalInitMsg(b) 111 if err != nil { 112 p.writeErr(w, r, err) 113 return 114 } 115 if err := initMsg.Validate(); err != nil { 116 p.writeErr(w, r, err) 117 return 118 } 119 120 // must be new 121 etlMD := p.owner.etl.get() 122 if etlMD.get(initMsg.Name()) != nil { 123 p.writeErrf(w, r, "%s: etl[%s] already exists", p, initMsg.Name()) 124 return 125 } 126 127 // add to cluster MD and start running 128 if err := p.startETL(w, initMsg, true /*add to etlMD*/); err != nil { 129 p.writeErr(w, r, err) 130 return 131 } 132 if cmn.Rom.FastV(4, cos.SmoduleETL) { 133 nlog.Infoln(p.String() + ": " + initMsg.String()) 134 } 135 } 136 137 // POST /v1/etl/<etl-name>/stop (or) /v1/etl/<etl-name>/start 138 // start/stop ETL pods 139 func (p *proxy) handleETLPost(w http.ResponseWriter, r *http.Request) { 140 apiItems, err := p.parseURL(w, r, apc.URLPathETL.L, 2, true) 141 if err != nil { 142 return 143 } 144 etlName := apiItems[0] 145 if err := k8s.ValidateEtlName(etlName); err != nil { 146 p.writeErr(w, r, err) 147 return 148 } 149 etlMD := p.owner.etl.get() 150 etlMsg := etlMD.get(etlName) 151 if etlMsg == nil { 152 p.writeErr(w, r, cos.NewErrNotFound(p, "etl job "+etlName)) 153 return 154 } 155 156 switch op := apiItems[1]; op { 157 case apc.ETLStop: 158 p.stopETL(w, r) 159 case apc.ETLStart: 160 p.startETL(w, etlMsg, false /*add to etlMD*/) 161 default: 162 debug.Assert(false, "invalid operation: "+op) 163 p.writeErrURL(w, r) 164 } 165 } 166 167 // DELETE /v1/etl/<etl-name> 168 func (p *proxy) handleETLDelete(w http.ResponseWriter, r *http.Request) { 169 apiItems, err := p.parseURL(w, r, apc.URLPathETL.L, 1, true) 170 if err != nil { 171 return 172 } 173 174 if p.forwardCP(w, r, nil, "delete ETL") { 175 return 176 } 177 178 etlName := apiItems[0] 179 if err := k8s.ValidateEtlName(etlName); err != nil { 180 p.writeErr(w, r, err) 181 return 182 } 183 ctx := &etlMDModifier{ 184 pre: p._deleteETLPre, 185 final: p._syncEtlMDFinal, 186 etlName: etlName, 187 } 188 if _, err := p.owner.etl.modify(ctx); err != nil { 189 p.writeErr(w, r, err) 190 } 191 } 192 193 func (p *proxy) _deleteETLPre(ctx *etlMDModifier, clone *etlMD) (err error) { 194 debug.AssertNoErr(k8s.ValidateEtlName(ctx.etlName)) 195 if exists := clone.del(ctx.etlName); !exists { 196 err = cos.NewErrNotFound(p, "etl job "+ctx.etlName) 197 } 198 return 199 } 200 201 // broadcast (start ETL) request to all targets 202 func (p *proxy) startETL(w http.ResponseWriter, msg etl.InitMsg, addToMD bool) error { 203 var ( 204 err error 205 args = allocBcArgs() 206 xid = etl.PrefixXactID + cos.GenUUID() 207 ) 208 { 209 args.req = cmn.HreqArgs{ 210 Method: http.MethodPut, 211 Path: apc.URLPathETL.S, 212 Body: cos.MustMarshal(msg), 213 Query: url.Values{apc.QparamUUID: []string{xid}}, 214 } 215 args.timeout = apc.LongTimeout 216 } 217 results := p.bcastGroup(args) 218 freeBcArgs(args) 219 for _, res := range results { 220 if res.err == nil { 221 continue 222 } 223 err = res.toErr() 224 nlog.Errorln(err) 225 } 226 freeBcastRes(results) 227 228 if err != nil { 229 // At least one target failed. Terminate all. 230 // (Termination calls may succeed for the targets that already succeeded in starting ETL, 231 // or fail otherwise - ignore the failures). 232 argsTerm := allocBcArgs() 233 argsTerm.req = cmn.HreqArgs{Method: http.MethodPost, Path: apc.URLPathETL.Join(msg.Name(), apc.ETLStop)} 234 argsTerm.timeout = apc.LongTimeout 235 p.bcastGroup(argsTerm) 236 freeBcArgs(argsTerm) 237 return err 238 } 239 240 if addToMD { 241 ctx := &etlMDModifier{ 242 pre: _addETLPre, 243 final: p._syncEtlMDFinal, 244 msg: msg, 245 wait: true, 246 } 247 p.owner.etl.modify(ctx) 248 } 249 // All init calls succeeded - return running xaction 250 w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(xid))) 251 w.Write(cos.UnsafeB(xid)) 252 return nil 253 } 254 255 func _addETLPre(ctx *etlMDModifier, clone *etlMD) (_ error) { 256 debug.Assert(ctx.msg != nil) 257 clone.add(ctx.msg) 258 return 259 } 260 261 func (p *proxy) _syncEtlMDFinal(ctx *etlMDModifier, clone *etlMD) { 262 wg := p.metasyncer.sync(revsPair{clone, p.newAmsgStr("etl-reg", nil)}) 263 if ctx.wait { 264 wg.Wait() 265 } 266 } 267 268 // GET /v1/etl/<etl-name> 269 func (p *proxy) infoETL(w http.ResponseWriter, r *http.Request, etlName string) { 270 if err := k8s.ValidateEtlName(etlName); err != nil { 271 p.writeErr(w, r, err) 272 return 273 } 274 275 etlMD := p.owner.etl.get() 276 initMsg := etlMD.get(etlName) 277 if initMsg == nil { 278 p.writeErr(w, r, cos.NewErrNotFound(p, "etl job "+etlName)) 279 return 280 } 281 p.writeJSON(w, r, initMsg, "info-etl") 282 } 283 284 // GET /v1/etl 285 func (p *proxy) listETL(w http.ResponseWriter, r *http.Request) { 286 var ( 287 args = allocBcArgs() 288 etls *etl.InfoList 289 ) 290 args.req = cmn.HreqArgs{Method: http.MethodGet, Path: apc.URLPathETL.S} 291 args.timeout = apc.DefaultTimeout 292 args.cresv = cresEI{} // -> etl.InfoList 293 results := p.bcastGroup(args) 294 freeBcArgs(args) 295 296 for _, res := range results { 297 if res.err != nil { 298 p.writeErr(w, r, res.toErr()) 299 freeBcastRes(results) 300 return 301 } 302 303 if etls == nil { 304 etls = res.v.(*etl.InfoList) 305 sort.Sort(etls) 306 } else { 307 another := res.v.(*etl.InfoList) 308 sort.Sort(another) 309 if !reflect.DeepEqual(etls, another) { 310 // TODO: Should we return an error to a user? 311 // Or stop mismatching ETLs and return internal server error? 312 nlog.Warningf("Targets returned different ETLs: %v vs %v", etls, another) 313 } 314 } 315 } 316 freeBcastRes(results) 317 if etls == nil { 318 etls = &etl.InfoList{} 319 } 320 p.writeJSON(w, r, *etls, "list-etl") 321 } 322 323 // GET /v1/etl/<etl-name>/logs[/<target_id>] 324 func (p *proxy) logsETL(w http.ResponseWriter, r *http.Request, etlName string, apiItems ...string) { 325 var ( 326 results sliceResults 327 args *bcastArgs 328 ) 329 if len(apiItems) > 0 { 330 // specific target 331 var ( 332 tid = apiItems[0] 333 smap = p.owner.smap.get() 334 si = smap.GetTarget(tid) 335 ) 336 if si == nil { 337 p.writeErrf(w, r, "unknown target %q", tid) 338 return 339 } 340 results = make(sliceResults, 1) 341 cargs := allocCargs() 342 { 343 cargs.req = cmn.HreqArgs{Method: http.MethodGet, Path: apc.URLPathETL.Join(etlName, apc.ETLLogs)} 344 cargs.si = si 345 cargs.timeout = apc.DefaultTimeout 346 cargs.cresv = cresEL{} // -> etl.Logs 347 } 348 results[0] = p.call(cargs, smap) 349 freeCargs(cargs) 350 } else { 351 // all targets 352 args = allocBcArgs() 353 args.req = cmn.HreqArgs{Method: http.MethodGet, Path: r.URL.Path} 354 args.timeout = apc.DefaultTimeout 355 args.cresv = cresEL{} // -> etl.Logs 356 results = p.bcastGroup(args) 357 freeBcArgs(args) 358 } 359 logs := make(etl.LogsByTarget, 0, len(results)) 360 for _, res := range results { 361 if res.err != nil { 362 p.writeErr(w, r, res.toErr()) 363 freeBcastRes(results) 364 return 365 } 366 logs = append(logs, *res.v.(*etl.Logs)) 367 } 368 freeBcastRes(results) 369 p.writeJSON(w, r, logs, "logs-etl") 370 } 371 372 // GET /v1/etl/<etl-name>/health 373 func (p *proxy) healthETL(w http.ResponseWriter, r *http.Request) { 374 var ( 375 results sliceResults 376 args *bcastArgs 377 ) 378 args = allocBcArgs() 379 args.req = cmn.HreqArgs{Method: http.MethodGet, Path: r.URL.Path} 380 results = p.bcastGroup(args) 381 defer freeBcastRes(results) 382 freeBcArgs(args) 383 384 healths := make(etl.HealthByTarget, 0, len(results)) 385 for _, res := range results { 386 if res.err != nil { 387 p.writeErr(w, r, res.toErr(), res.status) 388 return 389 } 390 msg := etl.HealthStatus{ 391 TargetID: res.si.ID(), 392 Status: string(res.bytes), 393 } 394 healths = append(healths, &msg) 395 } 396 p.writeJSON(w, r, healths, "health-etl") 397 } 398 399 // GET /v1/etl/<etl-name>/metrics 400 func (p *proxy) metricsETL(w http.ResponseWriter, r *http.Request) { 401 var ( 402 results sliceResults 403 args *bcastArgs 404 ) 405 args = allocBcArgs() 406 args.req = cmn.HreqArgs{Method: http.MethodGet, Path: r.URL.Path} 407 args.timeout = apc.DefaultTimeout 408 args.cresv = cresEM{} // -> etl.CPUMemByTarget 409 results = p.bcastGroup(args) 410 defer freeBcastRes(results) 411 freeBcArgs(args) 412 413 metrics := make(etl.CPUMemByTarget, 0, len(results)) 414 for _, res := range results { 415 if res.err != nil { 416 p.writeErr(w, r, res.toErr(), res.status) 417 return 418 } 419 metrics = append(metrics, res.v.(*etl.CPUMemUsed)) 420 } 421 sort.SliceStable(metrics, func(i, j int) bool { return metrics[i].TargetID < metrics[j].TargetID }) 422 p.writeJSON(w, r, metrics, "metrics-etl") 423 } 424 425 // POST /v1/etl/<etl-name>/stop 426 func (p *proxy) stopETL(w http.ResponseWriter, r *http.Request) { 427 args := allocBcArgs() 428 args.req = cmn.HreqArgs{Method: http.MethodPost, Path: r.URL.Path} 429 args.timeout = apc.LongTimeout 430 results := p.bcastGroup(args) 431 freeBcArgs(args) 432 for _, res := range results { 433 if res.err == nil { 434 continue 435 } 436 p.writeErr(w, r, res.toErr()) 437 break 438 } 439 freeBcastRes(results) 440 }