github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ec/xaction.go (about) 1 // Package ec provides erasure coding (EC) based data protection for AIStore. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package ec 6 7 import ( 8 "fmt" 9 "io" 10 "os" 11 "sync" 12 13 "github.com/NVIDIA/aistore/cmn" 14 "github.com/NVIDIA/aistore/cmn/atomic" 15 "github.com/NVIDIA/aistore/cmn/cos" 16 "github.com/NVIDIA/aistore/cmn/debug" 17 "github.com/NVIDIA/aistore/cmn/nlog" 18 "github.com/NVIDIA/aistore/core" 19 "github.com/NVIDIA/aistore/core/meta" 20 "github.com/NVIDIA/aistore/transport" 21 "github.com/NVIDIA/aistore/xact" 22 ) 23 24 const ( 25 requestBufSizeFS = 70 26 requestBufSizeEncode = 16 27 ) 28 29 type ( 30 xactECBase struct { 31 xact.DemandBase 32 config *cmn.Config // config 33 stats stats // EC statistics 34 bck cmn.Bck // which bucket xctn belongs to 35 36 dOwner *dataOwner // data slice manager 37 mgr *Manager // EC manager 38 } 39 40 xactReqBase struct { 41 mpathReqCh chan mpathReq // notify about mountpath changes 42 controlCh chan RequestsControlMsg 43 44 rejectReq atomic.Bool // marker if EC requests should be rejected 45 } 46 47 mpathReq struct { 48 action string 49 mpath string 50 } 51 52 // Manages SGL objects that are waiting for a data from a remote target 53 dataOwner struct { 54 mtx sync.Mutex 55 slices map[string]*slice 56 } 57 ) 58 59 func (r *xactECBase) init(config *cmn.Config, bck *cmn.Bck, mgr *Manager) { 60 r.stats = stats{bck: *bck} 61 r.config = config 62 r.bck = *bck 63 r.dOwner = &dataOwner{slices: make(map[string]*slice, 10)} 64 r.mgr = mgr 65 } 66 67 ///////////////// 68 // xactReqBase // 69 ///////////////// 70 71 func (r *xactReqBase) init() { 72 r.mpathReqCh = make(chan mpathReq, 1) 73 r.controlCh = make(chan RequestsControlMsg, 8) 74 } 75 76 // ClearRequests disables receiving new EC requests, they will be terminated with error 77 // Then it starts draining a channel from pending EC requests 78 // It does not enable receiving new EC requests, it has to be done explicitly, when EC is enabled again 79 func (r *xactReqBase) ClearRequests() { 80 msg := RequestsControlMsg{ 81 Action: ActClearRequests, 82 } 83 84 r.controlCh <- msg 85 } 86 87 func (r *xactReqBase) EnableRequests() { 88 msg := RequestsControlMsg{ 89 Action: ActEnableRequests, 90 } 91 92 r.controlCh <- msg 93 } 94 95 func (r *xactReqBase) setEcRequestsDisabled() { 96 r.rejectReq.Store(true) 97 } 98 99 func (r *xactReqBase) setEcRequestsEnabled() { 100 r.rejectReq.Store(false) 101 } 102 103 func (r *xactReqBase) ecRequestsEnabled() bool { 104 return !r.rejectReq.Load() 105 } 106 107 //////////////// 108 // xactECBase // 109 //////////////// 110 111 func newSliceResponse(md *Metadata, attrs *cmn.ObjAttrs, fqn string) (reader cos.ReadOpenCloser, err error) { 112 attrs.Ver = md.ObjVersion 113 attrs.Cksum = cos.NewCksum(md.CksumType, md.CksumValue) 114 115 stat, err := os.Stat(fqn) 116 if err != nil { 117 return nil, err 118 } 119 attrs.Size = stat.Size() 120 reader, err = cos.NewFileHandle(fqn) 121 if err != nil { 122 nlog.Warningf("Failed to read file stats: %s", err) 123 return nil, err 124 } 125 return reader, nil 126 } 127 128 // replica/full object request 129 func newReplicaResponse(attrs *cmn.ObjAttrs, bck *meta.Bck, objName string) (reader cos.ReadOpenCloser, err error) { 130 lom := core.AllocLOM(objName) 131 defer core.FreeLOM(lom) 132 if err = lom.InitBck(bck.Bucket()); err != nil { 133 return nil, err 134 } 135 if err = lom.Load(true /*cache it*/, false /*locked*/); err != nil { 136 nlog.Warningln(err) 137 return nil, err 138 } 139 reader, err = cos.NewFileHandle(lom.FQN) 140 if err != nil { 141 return nil, err 142 } 143 if lom.SizeBytes() == 0 { 144 return nil, nil 145 } 146 attrs.Size = lom.SizeBytes() 147 attrs.Ver = lom.Version() 148 attrs.Atime = lom.AtimeUnix() 149 attrs.Cksum = lom.Checksum() 150 return reader, nil 151 } 152 153 // Sends the replica/meta/slice data: either to copy replicas/slices after 154 // encoding or to send requested "object" to a client. In the latter case 155 // if the local object does not exist, it sends an empty body and sets 156 // exists=false in response header 157 func (r *xactECBase) dataResponse(act intraReqType, hdr *transport.ObjHdr, fqn string, bck *meta.Bck, objName string, 158 md *Metadata) (err error) { 159 var ( 160 reader cos.ReadOpenCloser 161 objAttrs cmn.ObjAttrs 162 ) 163 ireq := newIntraReq(act, nil, bck) 164 if md != nil && md.SliceID != 0 { 165 // slice request 166 reader, err = newSliceResponse(md, &objAttrs, fqn) 167 ireq.exists = err == nil 168 } else { 169 // replica/full object request 170 reader, err = newReplicaResponse(&objAttrs, bck, objName) 171 ireq.exists = err == nil 172 } 173 debug.Assert((objAttrs.Size == 0 && reader == nil) || (objAttrs.Size != 0 && reader != nil)) 174 175 rHdr := transport.ObjHdr{ObjName: objName, ObjAttrs: objAttrs, Opcode: act} 176 rHdr.Bck.Copy(bck.Bucket()) 177 rHdr.Opaque = ireq.NewPack(g.smm) 178 179 o := transport.AllocSend() 180 o.Hdr, o.Callback = rHdr, r.sendCb 181 182 r.ObjsAdd(1, objAttrs.Size) 183 r.IncPending() 184 return r.sendByDaemonID([]string{hdr.SID}, o, reader, false) 185 } 186 187 func (r *xactECBase) sendCb(hdr *transport.ObjHdr, _ io.ReadCloser, _ any, err error) { 188 g.smm.Free(hdr.Opaque) 189 if err != nil { 190 err = cmn.NewErrFailedTo(core.T, "ec-send", hdr.Cname(), err) 191 r.AddErr(err, 0) 192 } 193 r.DecPending() 194 } 195 196 // Send a data or request to one or few targets by their DaemonIDs. Most of the time 197 // only DaemonID is known - that is why the function gets DaemonID and internally 198 // transforms it into meta.Snode. 199 // * daemonIDs - a list of targets 200 // * hdr - transport header 201 // * reader - a data to send 202 // * cb - optional callback to be called when the transfer completes 203 // * isRequest - defines the type of request: 204 // - true - send lightweight request to all targets (usually reader is nil 205 // in this case) 206 // - false - send a slice/replica/metadata to targets 207 func (r *xactECBase) sendByDaemonID(daemonIDs []string, o *transport.Obj, reader cos.ReadOpenCloser, isRequest bool) error { 208 var ( 209 err error 210 nodes = meta.AllocNodes(len(daemonIDs)) 211 smap = core.T.Sowner().Get() 212 ) 213 for _, id := range daemonIDs { 214 si, ok := smap.Tmap[id] 215 if !ok { 216 nlog.Errorf("t[%s] not found", id) 217 continue 218 } 219 nodes = append(nodes, si) 220 } 221 if isRequest { 222 err = r.mgr.req().Send(o, reader, nodes...) 223 } else { 224 err = r.mgr.resp().Send(o, reader, nodes...) 225 } 226 meta.FreeNodes(nodes) 227 return err 228 } 229 230 // send request to a target, wait for its response, read the data into writer. 231 // - daemonID - target to send a request 232 // - bucket/objName - what to request 233 // - uname - unique name for the operation: the name is built from daemonID, 234 // bucket and object names. HTTP data receiving handler generates a name 235 // when receiving data and if it finds a writer registered with the same 236 // name, it puts the data to its writer and notifies when download is done 237 // - request - request to send 238 // - writer - an opened writer that will receive the replica/slice/meta 239 func (r *xactECBase) readRemote(lom *core.LOM, daemonID, uname string, request []byte, writer io.Writer) (int64, error) { 240 hdr := transport.ObjHdr{ObjName: lom.ObjName, Opaque: request, Opcode: reqGet} 241 hdr.Bck.Copy(lom.Bucket()) 242 243 o := transport.AllocSend() 244 o.Hdr = hdr 245 246 sw := &slice{writer: writer, twg: cos.NewTimeoutGroup(), lom: lom} 247 sw.twg.Add(1) 248 r.regWriter(uname, sw) 249 250 if cmn.Rom.FastV(4, cos.SmoduleEC) { 251 nlog.Infof("Requesting object %s from %s", lom, daemonID) 252 } 253 if err := r.sendByDaemonID([]string{daemonID}, o, nil, true); err != nil { 254 r.unregWriter(uname) 255 r.AddErr(err) 256 return 0, err 257 } 258 if sw.twg.WaitTimeout(r.config.Timeout.SendFile.D()) { 259 r.unregWriter(uname) 260 err := fmt.Errorf("read-remote(%s): timeout %v", uname, r.config.Timeout.SendFile.D()) 261 r.AddErr(err) 262 return 0, err 263 } 264 r.unregWriter(uname) 265 266 if cmn.Rom.FastV(4, cos.SmoduleEC) { 267 nlog.Infof("Received object %s from %s", lom, daemonID) 268 } 269 if sw.version != "" { 270 lom.SetVersion(sw.version) 271 } 272 lom.SetCksum(sw.cksum) 273 lom.Uncache() 274 return sw.n, nil 275 } 276 277 // Registers a new slice that will wait for the data to come from 278 // a remote target 279 func (r *xactECBase) regWriter(uname string, writer *slice) bool { 280 r.dOwner.mtx.Lock() 281 _, ok := r.dOwner.slices[uname] 282 if ok { 283 nlog.Errorf("Writer for %s is already registered", uname) 284 } else { 285 r.dOwner.slices[uname] = writer 286 } 287 r.dOwner.mtx.Unlock() 288 289 return !ok 290 } 291 292 // Unregisters a slice that has been waiting for the data to come from 293 // a remote target 294 func (r *xactECBase) unregWriter(uname string) { 295 r.dOwner.mtx.Lock() 296 delete(r.dOwner.slices, uname) 297 r.dOwner.mtx.Unlock() 298 } 299 300 // Used to copy replicas/slices after the object is encoded after PUT/restored 301 // after GET, or to respond to meta/slice/replica request. 302 // - daemonIDs - receivers of the data 303 // - bucket/objName - object path 304 // - reader - object/slice/meta data 305 // - src - extra information about the data to send 306 // - cb - a caller may set its own callback to execute when the transfer is done. 307 // A special case: 308 // if a caller does not define its own callback, and it sets the `obj` in 309 // `src` it means that the caller wants to automatically free the memory 310 // allocated for the `obj` SGL after the object is transferred. The caller 311 // may set optional counter in `obj` - the default callback decreases the 312 // counter each time the callback is called and when the value drops below 1, 313 // `writeRemote` callback frees the SGL 314 // The counter is used for sending slices of one big SGL to a few nodes. In 315 // this case every slice must be sent to only one target, and transport bundle 316 // cannot help to track automatically when SGL should be freed. 317 func (r *xactECBase) writeRemote(daemonIDs []string, lom *core.LOM, src *dataSource, cb transport.ObjSentCB) error { 318 if src.metadata != nil && src.metadata.ObjVersion == "" { 319 src.metadata.ObjVersion = lom.Version() 320 } 321 req := newIntraReq(src.reqType, src.metadata, lom.Bck()) 322 req.isSlice = src.isSlice 323 324 putData := req.NewPack(g.smm) 325 objAttrs := cmn.ObjAttrs{ 326 Size: src.size, 327 Ver: lom.Version(), 328 Atime: lom.AtimeUnix(), 329 } 330 if src.metadata != nil && src.metadata.SliceID != 0 { 331 // for a slice read everything from slice's metadata 332 if src.metadata.ObjVersion != "" { 333 objAttrs.Ver = src.metadata.ObjVersion 334 } 335 objAttrs.Cksum = cos.NewCksum(src.metadata.CksumType, src.metadata.CksumValue) 336 } else { 337 objAttrs.Cksum = lom.Checksum() 338 } 339 hdr := transport.ObjHdr{ 340 ObjName: lom.ObjName, 341 ObjAttrs: objAttrs, 342 Opaque: putData, 343 Opcode: src.reqType, 344 } 345 hdr.Bck.Copy(lom.Bucket()) 346 oldCallback := cb 347 cb = func(hdr *transport.ObjHdr, reader io.ReadCloser, arg any, err error) { 348 g.smm.Free(hdr.Opaque) 349 if oldCallback != nil { 350 oldCallback(hdr, reader, arg, err) 351 } 352 r.DecPending() 353 } 354 355 o := transport.AllocSend() 356 o.Hdr, o.Callback = hdr, cb 357 358 r.IncPending() 359 return r.sendByDaemonID(daemonIDs, o, src.reader, false) 360 } 361 362 // Save data from a target response to SGL or file. When exists is false it 363 // just drains the response body and returns - because it does not contain 364 // any data. On completion the function must call writer.wg.Done to notify 365 // the caller that the data read is completed. 366 // * writer - where to save the slice/meta/replica data 367 // * exists - if the remote target had the requested object 368 // * reader - response body 369 func _writerReceive(writer *slice, exists bool, objAttrs cmn.ObjAttrs, reader io.Reader) (err error) { 370 if !exists { 371 writer.twg.Done() 372 return ErrorNotFound 373 } 374 375 buf, slab := g.pmm.Alloc() 376 writer.n, err = io.CopyBuffer(writer.writer, reader, buf) 377 writer.cksum = objAttrs.Cksum 378 if writer.version == "" && objAttrs.Ver != "" { 379 writer.version = objAttrs.Ver 380 } 381 382 writer.twg.Done() 383 slab.Free(buf) 384 return err 385 } 386 387 func (r *xactECBase) ECStats() *Stats { return r.stats.stats() } 388 389 func (r *xactECBase) baseSnap() (snap *core.Snap) { 390 snap = &core.Snap{} 391 r.ToSnap(snap) 392 393 snap.IdleX = r.IsIdle() 394 return 395 }