github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ec/getxaction.go (about) 1 // Package ec provides erasure coding (EC) based data protection for AIStore. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package ec 6 7 import ( 8 "fmt" 9 "io" 10 "net/http" 11 "sync" 12 "time" 13 14 "github.com/NVIDIA/aistore/api/apc" 15 "github.com/NVIDIA/aistore/cmn" 16 "github.com/NVIDIA/aistore/cmn/cos" 17 "github.com/NVIDIA/aistore/cmn/debug" 18 "github.com/NVIDIA/aistore/cmn/nlog" 19 "github.com/NVIDIA/aistore/core" 20 "github.com/NVIDIA/aistore/core/meta" 21 "github.com/NVIDIA/aistore/fs" 22 "github.com/NVIDIA/aistore/transport" 23 "github.com/NVIDIA/aistore/xact" 24 "github.com/NVIDIA/aistore/xact/xreg" 25 ) 26 27 type ( 28 getFactory struct { 29 xreg.RenewBase 30 xctn *XactGet 31 } 32 33 // Erasure coding runner: accepts requests and dispatches them to 34 // a correct mountpath runner. Runner uses dedicated to EC memory manager 35 // inherited by dependent mountpath runners 36 XactGet struct { 37 xactECBase 38 xactReqBase 39 getJoggers map[string]*getJogger // mountpath joggers for GET 40 } 41 42 // extended x-ec-get statistics 43 ExtECGetStats struct { 44 AvgTime cos.Duration `json:"ec.decode.ns"` 45 ErrCount int64 `json:"ec.decode.err.n,string"` 46 AvgObjTime cos.Duration `json:"ec.obj.process.ns"` 47 AvgQueueLen float64 `json:"ec.queue.len.f"` 48 IsIdle bool `json:"is_idle"` 49 } 50 ) 51 52 // interface guard 53 var ( 54 _ xact.Demand = (*XactGet)(nil) 55 _ xreg.Renewable = (*getFactory)(nil) 56 ) 57 58 //////////////// 59 // getFactory // 60 //////////////// 61 62 func (*getFactory) New(_ xreg.Args, bck *meta.Bck) xreg.Renewable { 63 p := &getFactory{RenewBase: xreg.RenewBase{Bck: bck}} 64 return p 65 } 66 67 func (p *getFactory) Start() error { 68 xec := ECM.NewGetXact(p.Bck.Bucket()) 69 xec.DemandBase.Init(cos.GenUUID(), p.Kind(), p.Bck, 0 /*use default*/) 70 p.xctn = xec 71 go xec.Run(nil) 72 return nil 73 } 74 func (*getFactory) Kind() string { return apc.ActECGet } 75 func (p *getFactory) Get() core.Xact { return p.xctn } 76 77 func (p *getFactory) WhenPrevIsRunning(xprev xreg.Renewable) (xreg.WPR, error) { 78 debug.Assertf(false, "%s vs %s", p.Str(p.Kind()), xprev) // xreg.usePrev() must've returned true 79 return xreg.WprUse, nil 80 } 81 82 ///////////// 83 // XactGet // 84 ///////////// 85 86 func newGetXact(bck *cmn.Bck, mgr *Manager) *XactGet { 87 var ( 88 avail, disabled = fs.Get() 89 totalPaths = len(avail) + len(disabled) 90 config = cmn.GCO.Get() 91 xctn = &XactGet{ 92 getJoggers: make(map[string]*getJogger, totalPaths), 93 } 94 ) 95 xctn.xactECBase.init(config, bck, mgr) 96 xctn.xactReqBase.init() 97 98 // create all runners but do not start them until Run is called 99 for mpath := range avail { 100 getJog := xctn.newGetJogger(mpath) 101 xctn.getJoggers[mpath] = getJog 102 } 103 for mpath := range disabled { 104 getJog := xctn.newGetJogger(mpath) 105 xctn.getJoggers[mpath] = getJog 106 } 107 return xctn 108 } 109 110 func (r *XactGet) DispatchResp(iReq intraReq, hdr *transport.ObjHdr, bck *meta.Bck, reader io.Reader) { 111 objName, objAttrs := hdr.ObjName, hdr.ObjAttrs 112 uname := unique(hdr.SID, bck, objName) 113 switch hdr.Opcode { 114 // It is response to slice/replica request by an object 115 // restoration process. In this case, there should exists 116 // a slice "waiting" for the data to arrive (registered with `regWriter`. 117 // Read the data into the slice writer and notify the slice when 118 // the transfer is complete 119 case respPut: 120 if cmn.Rom.FastV(4, cos.SmoduleEC) { 121 nlog.Infof("Response from %s, %s", hdr.SID, uname) 122 } 123 r.dOwner.mtx.Lock() 124 writer, ok := r.dOwner.slices[uname] 125 r.dOwner.mtx.Unlock() 126 127 if !ok { 128 err := fmt.Errorf("%s: no slice writer for %s (uname %s)", core.T, bck.Cname(objName), uname) 129 r.AddErr(err, 0) 130 return 131 } 132 if err := _writerReceive(writer, iReq.exists, objAttrs, reader); err != nil { 133 err = fmt.Errorf("%s: failed to read %s replica: %w (uname %s)", core.T, bck.Cname(objName), err, uname) 134 r.AddErr(err, 0) 135 } 136 default: 137 debug.Assert(false, "opcode", hdr.Opcode) 138 nlog.Errorf("Invalid request: %d", hdr.Opcode) 139 } 140 } 141 142 func (r *XactGet) newGetJogger(mpath string) *getJogger { 143 var ( 144 client *http.Client 145 cargs = cmn.TransportArgs{Timeout: r.config.Client.Timeout.D()} 146 ) 147 if r.config.Net.HTTP.UseHTTPS { 148 client = cmn.NewIntraClientTLS(cargs, r.config) 149 } else { 150 client = cmn.NewClient(cargs) 151 } 152 j := &getJogger{ 153 parent: r, 154 mpath: mpath, 155 client: client, 156 workCh: make(chan *request, requestBufSizeFS), 157 } 158 j.stopCh.Init() 159 return j 160 } 161 162 func (r *XactGet) dispatchRequest(req *request, lom *core.LOM) error { 163 if !r.ecRequestsEnabled() { 164 if req.ErrCh != nil { 165 req.ErrCh <- ErrorECDisabled 166 close(req.ErrCh) 167 } 168 return ErrorECDisabled 169 } 170 171 debug.Assert(req.Action == ActRestore) 172 173 jogger, ok := r.getJoggers[lom.Mountpath().Path] 174 if !ok { 175 debug.Assert(false, "invalid "+lom.Mountpath().String()) 176 } 177 r.stats.updateQueue(len(jogger.workCh)) 178 jogger.workCh <- req 179 return nil 180 } 181 182 func (r *XactGet) Run(*sync.WaitGroup) { 183 nlog.Infoln(r.Name()) 184 for _, jog := range r.getJoggers { 185 go jog.run() 186 } 187 188 ticker := time.NewTicker(r.config.Periodic.StatsTime.D()) 189 defer ticker.Stop() 190 191 // as of now all requests are equal. Some may get throttling later 192 for { 193 select { 194 case <-ticker.C: 195 if cmn.Rom.FastV(4, cos.SmoduleEC) { 196 if s := r.ECStats().String(); s != "" { 197 nlog.Infoln(s) 198 } 199 } 200 case mpathRequest := <-r.mpathReqCh: 201 switch mpathRequest.action { 202 case apc.ActMountpathAttach: 203 r.addMpath(mpathRequest.mpath) 204 case apc.ActMountpathDetach: 205 r.removeMpath(mpathRequest.mpath) 206 } 207 case <-r.IdleTimer(): 208 // It's OK not to notify ecmanager, it'll just have stopped xctn in a map. 209 r.stop() 210 return 211 case msg := <-r.controlCh: 212 if msg.Action == ActEnableRequests { 213 r.setEcRequestsEnabled() 214 break 215 } 216 debug.Assert(msg.Action == ActClearRequests) 217 218 r.setEcRequestsDisabled() 219 r.stop() 220 return 221 case <-r.ChanAbort(): 222 r.stop() 223 return 224 } 225 } 226 } 227 228 func (r *XactGet) Stop(err error) { r.Abort(err) } 229 230 func (r *XactGet) stop() { 231 r.DemandBase.Stop() 232 for _, jog := range r.getJoggers { 233 jog.stop() 234 } 235 236 // Don't close bundles, they are shared between bucket's EC actions 237 r.Finish() 238 } 239 240 // Decode schedules an object to be restored from existing slices. 241 // A caller should wait for the main object restoration is completed. When 242 // ecrunner finishes main object restoration process it puts into request.ErrCh 243 // channel the error or nil. The caller may read the object after receiving 244 // a nil value from channel but ecrunner keeps working - it reuploads all missing 245 // slices or copies 246 func (r *XactGet) decode(req *request, lom *core.LOM) { 247 debug.Assert(req.Action == ActRestore, "invalid action for restore: "+req.Action) 248 r.stats.updateDecode() 249 req.putTime = time.Now() 250 req.tm = time.Now() 251 252 if err := r.dispatchRequest(req, lom); err != nil { 253 nlog.Errorf("Failed to restore %s: %v", lom, err) 254 freeReq(req) 255 } 256 } 257 258 // ClearRequests disables receiving new EC requests, they will be terminated with error 259 // Then it starts draining a channel from pending EC requests 260 // It does not enable receiving new EC requests, it has to be done explicitly, when EC is enabled again 261 func (r *XactGet) ClearRequests() { 262 msg := RequestsControlMsg{ 263 Action: ActClearRequests, 264 } 265 266 r.controlCh <- msg 267 } 268 269 func (r *XactGet) EnableRequests() { 270 msg := RequestsControlMsg{ 271 Action: ActEnableRequests, 272 } 273 274 r.controlCh <- msg 275 } 276 277 // 278 // fsprunner methods 279 // 280 281 func (r *XactGet) addMpath(mpath string) { 282 jogger, ok := r.getJoggers[mpath] 283 if ok && jogger != nil { 284 nlog.Warningf("Attempted to add already existing mountpath: %s", mpath) 285 return 286 } 287 getJog := r.newGetJogger(mpath) 288 r.getJoggers[mpath] = getJog 289 go getJog.run() 290 } 291 292 func (r *XactGet) removeMpath(mpath string) { 293 getJog, ok := r.getJoggers[mpath] 294 if !ok { 295 debug.Assert(false, "invalid mountpath: "+mpath) 296 } 297 getJog.stop() 298 delete(r.getJoggers, mpath) 299 } 300 301 func (r *XactGet) Snap() (snap *core.Snap) { 302 snap = r.baseSnap() 303 st := r.stats.stats() 304 snap.Ext = &ExtECGetStats{ 305 AvgTime: cos.Duration(st.DecodeTime), 306 ErrCount: st.DecodeErr, 307 AvgObjTime: cos.Duration(st.ObjTime), 308 AvgQueueLen: st.QueueLen, 309 IsIdle: r.Pending() == 0, 310 } 311 snap.Stats.Objs = st.GetReq 312 return 313 }