github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ec/putjogger.go

// Package ec provides erasure coding (EC) based data protection for AIStore.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package ec

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"os"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/mono"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/memsys"
	"github.com/NVIDIA/aistore/transport"
	"github.com/klauspost/reedsolomon"
)

type (
	encodeCtx struct {
		lom          *core.LOM        // replica
		meta         *Metadata        //
		fh           *cos.FileHandle  // file handle for the replica
		sliceSize    int64            // calculated slice size
		padSize      int64            // zero tail of the last object's data slice
		dataSlices   int              // the number of data slices
		paritySlices int              // the number of parity slices
		cksums       []*cos.CksumHash // checksums of parity slices (filled by reed-solomon)
		slices       []*slice         // all EC slices (in the order of slice IDs)
		targets      []*meta.Snode    // target list (in the order of slice IDs: targets[i] receives slices[i])
	}

	// a mountpath putJogger: processes PUT/DEL requests to one mountpath
	putJogger struct {
		parent *XactPut
		slab   *memsys.Slab
		buffer []byte
		mpath  string

		putCh  chan *request // top priority operation (object PUT)
		xactCh chan *request // low priority operation (ec-encode)
		stopCh cos.StopCh    // jogger management channel: to stop it

		toDisk bool // use files or SGL
	}
)

var (
	encCtxPool         sync.Pool
	emptyCtx           encodeCtx
	errSliceSendFailed = errors.New("failed to send slice")
)

func allocCtx() (ctx *encodeCtx) {
	if v := encCtxPool.Get(); v != nil {
		ctx = v.(*encodeCtx)
	} else {
		ctx = &encodeCtx{}
	}
	return
}

func (ctx *encodeCtx) freeReplica() {
	freeObject(ctx.fh)
}

///////////////
// putJogger //
///////////////

func (*putJogger) newCtx(lom *core.LOM, meta *Metadata) (ctx *encodeCtx, err error) {
	ctx = allocCtx()
	ctx.lom = lom
	ctx.dataSlices = lom.Bprops().EC.DataSlices
	ctx.paritySlices = lom.Bprops().EC.ParitySlices
	ctx.meta = meta

	totalCnt := ctx.paritySlices + ctx.dataSlices
	ctx.sliceSize = SliceSize(ctx.lom.SizeBytes(), ctx.dataSlices)
	ctx.slices = make([]*slice, totalCnt)
	ctx.padSize = ctx.sliceSize*int64(ctx.dataSlices) - ctx.lom.SizeBytes()

	ctx.fh, err = cos.NewFileHandle(lom.FQN)
	return ctx, err
}

func (*putJogger) freeCtx(ctx *encodeCtx) {
	*ctx = emptyCtx
	encCtxPool.Put(ctx)
}

func (c *putJogger) freeResources() {
	c.slab.Free(c.buffer)
	c.buffer = nil
	c.slab = nil
}

func (c *putJogger) processRequest(req *request) {
	lom, err := req.LIF.LOM()
	if err != nil {
		return
	}

	c.parent.IncPending()
	defer func() {
		if req.Callback != nil {
			req.Callback(lom, err)
		}
		core.FreeLOM(lom)
		c.parent.DecPending()
	}()

	if req.Action == ActSplit {
		if err = lom.Load(false /*cache it*/, false /*locked*/); err != nil {
			return
		}
		ecConf := lom.Bprops().EC
		memRequired := lom.SizeBytes() * int64(ecConf.DataSlices+ecConf.ParitySlices) / int64(ecConf.ParitySlices)
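		// Estimated memory footprint of the encode. Based on it (and the xaction
		// config), useDisk decides whether parity slices are built in memory as
		// SGLs (generateSlicesToMemory) or spilled to workfiles on disk
		// (generateSlicesToDisk) - see sendSlices below.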
		c.toDisk = useDisk(memRequired, c.parent.config)
	}

	c.parent.stats.updateWaitTime(time.Since(req.tm))
	req.tm = time.Now()
	if err = c.ec(req, lom); err != nil {
		err = cmn.NewErrFailedTo(core.T, req.Action, lom.Cname(), err)
		c.parent.AddErr(err, 0)
	}
}

func (c *putJogger) run(wg *sync.WaitGroup) {
	nlog.Infof("Started EC for mountpath: %s, bucket %s", c.mpath, c.parent.bck)
	defer wg.Done()
	c.buffer, c.slab = g.pmm.Alloc()
	for {
		select {
		case req := <-c.putCh:
			c.processRequest(req)
			freeReq(req)
		case req := <-c.xactCh:
			c.processRequest(req)
			freeReq(req)
		case <-c.stopCh.Listen():
			c.freeResources()
			return
		}
	}
}

func (c *putJogger) stop() {
	nlog.Infof("Stopping EC for mountpath: %s, bucket %s", c.mpath, c.parent.bck)
	c.stopCh.Close()
}

func (c *putJogger) ec(req *request, lom *core.LOM) (err error) {
	switch req.Action {
	case ActSplit:
		if err = c.encode(req, lom); err != nil {
			ctMeta := core.NewCTFromLOM(lom, fs.ECMetaType)
			errRm := cos.RemoveFile(ctMeta.FQN())
			debug.AssertNoErr(errRm)
		}
		c.parent.stats.updateEncodeTime(time.Since(req.tm), err != nil)
	case ActDelete:
		err = c.cleanup(lom)
		c.parent.stats.updateDeleteTime(time.Since(req.tm), err != nil)
	default:
		err = fmt.Errorf("invalid EC action for putJogger: %v", req.Action)
	}

	if err == nil {
		c.parent.stats.updateObjTime(time.Since(req.putTime))
	}
	return err
}

func (c *putJogger) replicate(ctx *encodeCtx) error {
	err := c.createCopies(ctx)
	if err != nil {
		ctx.freeReplica()
		c.cleanup(ctx.lom)
	}
	return err
}

func (c *putJogger) splitAndDistribute(ctx *encodeCtx) error {
	err := initializeSlices(ctx)
	if err == nil {
		err = c.sendSlices(ctx)
	}
	if err != nil {
		ctx.freeReplica()
		if err != errSliceSendFailed {
			freeSlices(ctx.slices)
		}
		c.cleanup(ctx.lom)
	}
	return err
}

// calculates and stores data and parity slices
func (c *putJogger) encode(req *request, lom *core.LOM) error {
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Encoding %q...", lom)
	}
	var (
		ecConf     = lom.Bprops().EC
		reqTargets = ecConf.ParitySlices + 1
		smap       = core.T.Sowner().Get()
	)
	if !req.IsCopy {
		reqTargets += ecConf.DataSlices
	}
	targetCnt := smap.CountActiveTs()
	if targetCnt < reqTargets {
		return fmt.Errorf("%v: given EC config (d=%d, p=%d), %d targets required to encode %s (have %d, %s)",
			cmn.ErrNotEnoughTargets, ecConf.DataSlices, ecConf.ParitySlices, reqTargets, lom, targetCnt, smap.StringEx())
	}

	var (
		ctMeta                = core.NewCTFromLOM(lom, fs.ECMetaType)
		generation            = mono.NanoTime()
		cksumType, cksumValue = lom.Checksum().Get()
	)
	meta := &Metadata{
		MDVersion:   MDVersionLast,
		Generation:  generation,
		Size:        lom.SizeBytes(),
		Data:        ecConf.DataSlices,
		Parity:      ecConf.ParitySlices,
		IsCopy:      req.IsCopy,
		ObjCksum:    cksumValue,
		CksumType:   cksumType,
		FullReplica: core.T.SID(),
		Daemons:     make(cos.MapStrUint16, reqTargets),
	}

	c.parent.LomAdd(lom)

	ctx, err := c.newCtx(lom, meta)
	defer c.freeCtx(ctx)
	if err != nil {
		return err
	}
	targets, err := smap.HrwTargetList(ctx.lom.Uname(), reqTargets)
	if err != nil {
		return err
	}
	ctx.targets = targets[1:]
	meta.Daemons[targets[0].ID()] = 0 // main or full replica always on the first target
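	// Daemons maps a target ID to the ID of the slice it will store: 0 stands for
	// a full replica; otherwise, the i-th remaining target gets slice i+1 (for
	// replicated objects, i.e. IsCopy, every target stores a full replica).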
	for i, tgt := range ctx.targets {
		sliceID := uint16(i + 1)
		if meta.IsCopy {
			sliceID = 0
		}
		meta.Daemons[tgt.ID()] = sliceID
	}

	if meta.IsCopy {
		err = c.replicate(ctx)
	} else {
		err = c.splitAndDistribute(ctx)
	}
	if err != nil {
		return err
	}
	metaBuf := bytes.NewReader(meta.NewPack())
	if err := ctMeta.Write(metaBuf, -1); err != nil {
		return err
	}
	if _, exists := core.T.Bowner().Get().Get(ctMeta.Bck()); !exists {
		if errRm := cos.RemoveFile(ctMeta.FQN()); errRm != nil {
			nlog.Errorf("nested error: encode -> remove metafile: %v", errRm)
		}
		return fmt.Errorf("%s metafile saved while bucket %s was being destroyed", ctMeta.ObjectName(), ctMeta.Bucket())
	}
	return nil
}

func (c *putJogger) ctSendCallback(hdr *transport.ObjHdr, _ io.ReadCloser, _ any, err error) {
	g.smm.Free(hdr.Opaque)
	if err != nil {
		nlog.Errorf("failed to send o[%s]: %v", hdr.Cname(), err)
	}
	c.parent.DecPending()
}

// Remove slices and replicas across the cluster: remove the local metafile
// if it exists, and broadcast the request to the other targets
func (c *putJogger) cleanup(lom *core.LOM) error {
	ctMeta := core.NewCTFromLOM(lom, fs.ECMetaType)
	md, err := LoadMetadata(ctMeta.FQN())
	if err != nil {
		if os.IsNotExist(err) {
			// Metafile does not exist = nothing to clean up
			err = nil
		}
		return err
	}
	nodes := md.RemoteTargets()
	if err := cos.RemoveFile(ctMeta.FQN()); err != nil {
		return err
	}

	request := newIntraReq(reqDel, nil, lom.Bck()).NewPack(g.smm)
	o := transport.AllocSend()
	o.Hdr = transport.ObjHdr{ObjName: lom.ObjName, Opaque: request, Opcode: reqDel}
	o.Hdr.Bck.Copy(lom.Bucket())
	o.Callback = c.ctSendCallback
	c.parent.IncPending()
	return c.parent.mgr.req().Send(o, nil, nodes...)
}

// Sends object replicas to the targets that must have replicas after the client
// uploads the main replica
func (c *putJogger) createCopies(ctx *encodeCtx) error {
	// generate the list of targets to send the replica to (all except this one)
	nodes := make([]string, 0, len(ctx.targets))
	for _, tgt := range ctx.targets {
		nodes = append(nodes, tgt.ID())
	}

	// broadcast the replica to the targets
	src := &dataSource{
		reader:   ctx.fh,
		size:     ctx.lom.SizeBytes(),
		metadata: ctx.meta,
		reqType:  reqPut,
	}
	return c.parent.writeRemote(nodes, ctx.lom, src, nil)
}

func checksumDataSlices(ctx *encodeCtx, cksmReaders []io.Reader, cksumType string) error {
	debug.Assert(cksumType != "") // caller checks for 'none'
	for i, reader := range cksmReaders {
		_, cksum, err := cos.CopyAndChecksum(io.Discard, reader, nil, cksumType)
		if err != nil {
			return err
		}
		ctx.slices[i].cksum = cksum.Clone()
	}
	return nil
}

// generateSlicesToMemory gets the FQN of the original file and encodes it into EC slices;
// writers are the slices created by the EC encoding process (memory is allocated)
func generateSlicesToMemory(ctx *encodeCtx) error {
	var (
		cksumType    = ctx.lom.CksumType()
		initSize     = min(ctx.sliceSize, cos.MiB)
		sliceWriters = make([]io.Writer, ctx.paritySlices)
	)
	for i := range ctx.paritySlices {
		writer := g.pmm.NewSGL(initSize)
		ctx.slices[i+ctx.dataSlices] = &slice{obj: writer}
		if cksumType == cos.ChecksumNone {
			sliceWriters[i] = writer
		} else {
			ctx.cksums[i] = cos.NewCksumHash(cksumType)
			sliceWriters[i] = cos.NewWriterMulti(writer, ctx.cksums[i].H)
		}
	}

	return finalizeSlices(ctx, sliceWriters)
}

func initializeSlices(ctx *encodeCtx) (err error) {
	// readers are sections of the original object (no memory is allocated)
	cksmReaders := make([]io.Reader, ctx.dataSlices)
	sizeLeft := ctx.lom.SizeBytes()
	for i := range ctx.dataSlices {
		var (
			reader     cos.ReadOpenCloser
			cksmReader cos.ReadOpenCloser
			offset     = int64(i) * ctx.sliceSize
		)
		if sizeLeft < ctx.sliceSize {
			reader = cos.NewSectionHandle(ctx.fh, offset, sizeLeft, ctx.padSize)
			cksmReader = cos.NewSectionHandle(ctx.fh, offset, sizeLeft, ctx.padSize)
		} else {
			reader = cos.NewSectionHandle(ctx.fh, offset, ctx.sliceSize, 0)
			cksmReader = cos.NewSectionHandle(ctx.fh, offset, ctx.sliceSize, 0)
		}
		ctx.slices[i] = &slice{obj: ctx.fh, reader: reader}
		cksmReaders[i] = cksmReader
		sizeLeft -= ctx.sliceSize
	}

	// The data-slice readers are established, so we can compute their checksums
	// now, before the parity slices and their checksums are calculated
	if cksumType := ctx.lom.CksumType(); cksumType != cos.ChecksumNone {
		ctx.cksums = make([]*cos.CksumHash, ctx.paritySlices)
		err = checksumDataSlices(ctx, cksmReaders, cksumType)
	}
	return
}

func finalizeSlices(ctx *encodeCtx, writers []io.Writer) error {
	stream, err := reedsolomon.NewStreamC(ctx.dataSlices, ctx.paritySlices, true, true)
	if err != nil {
		return err
	}

	// Calculate parity slices and their checksums
	readers := make([]io.Reader, ctx.dataSlices)
	for i := range ctx.dataSlices {
		readers[i] = ctx.slices[i].reader
	}
	if err := stream.Encode(readers, writers); err != nil {
		return err
	}

	if cksumType := ctx.lom.CksumType(); cksumType != cos.ChecksumNone {
		for i := range ctx.cksums {
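			// Finalize each parity checksum and attach it to the corresponding
			// parity slice (data-slice checksums were already computed by
			// checksumDataSlices).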
			ctx.cksums[i].Finalize()
			ctx.slices[i+ctx.dataSlices].cksum = ctx.cksums[i].Clone()
		}
	}
	return nil
}

// generateSlicesToDisk gets the FQN of the original file and encodes it into EC slices
func generateSlicesToDisk(ctx *encodeCtx) error {
	writers := make([]io.Writer, ctx.paritySlices)
	sliceWriters := make([]io.Writer, ctx.paritySlices)

	defer func() {
		for _, wr := range writers {
			if wr == nil {
				continue
			}
			// the writer can only be an *os.File within this function
			f := wr.(*os.File)
			cos.Close(f)
		}
	}()

	cksumType := ctx.lom.CksumType()
	for i := range ctx.paritySlices {
		workFQN := fs.CSM.Gen(ctx.lom, fs.WorkfileType, fmt.Sprintf("ec-write-%d", i))
		writer, err := ctx.lom.CreateFile(workFQN)
		if err != nil {
			return err
		}
		ctx.slices[i+ctx.dataSlices] = &slice{writer: writer, workFQN: workFQN}
		writers[i] = writer
		if cksumType == cos.ChecksumNone {
			sliceWriters[i] = writer
		} else {
			ctx.cksums[i] = cos.NewCksumHash(cksumType)
			sliceWriters[i] = cos.NewWriterMulti(writer, ctx.cksums[i].H)
		}
	}

	return finalizeSlices(ctx, sliceWriters)
}

func (c *putJogger) sendSlice(ctx *encodeCtx, data *slice, node *meta.Snode, idx int) error {
	// Reopen the slice's reader because it was read to the end by the erasure
	// encoder while calculating the parity slices.
	reader, err := ctx.slices[idx].reopenReader()
	if err != nil {
		data.release()
		return err
	}

	mcopy := &Metadata{}
	cos.CopyStruct(mcopy, ctx.meta)
	mcopy.SliceID = idx + 1
	mcopy.ObjVersion = ctx.lom.Version()
	if ctx.slices[idx].cksum != nil {
		mcopy.CksumType, mcopy.CksumValue = ctx.slices[idx].cksum.Get()
	}

	src := &dataSource{
		reader:   reader,
		size:     ctx.sliceSize,
		obj:      data,
		metadata: mcopy,
		isSlice:  true,
		reqType:  reqPut,
	}
	sentCB := func(hdr *transport.ObjHdr, _ io.ReadCloser, _ any, err error) {
		if data != nil {
			data.release()
		}
		if err != nil {
			nlog.Errorln("Failed to send", hdr.Cname()+": ", err)
		}
	}

	return c.parent.writeRemote([]string{node.ID()}, ctx.lom, src, sentCB)
}

// Copies the constructed EC slices to remote targets.
func (c *putJogger) sendSlices(ctx *encodeCtx) (err error) {
	// load the data slices from the original object and construct the parity ones
	if c.toDisk {
		err = generateSlicesToDisk(ctx)
	} else {
		err = generateSlicesToMemory(ctx)
	}

	if err != nil {
		return err
	}

	dataSlice := &slice{refCnt: *atomic.NewInt32(int32(ctx.dataSlices)), obj: ctx.fh}
	// If the slice is a data one, no immediate cleanup is required because the
	// slice is just a section reader of the entire file.
	var copyErr error
	for i, tgt := range ctx.targets {
		var sl *slice
		// Each data slice is a section reader of the replica, so the memory is
		// freed only after the last data slice is sent. Parity slices allocate memory,
		// so their counter is set to 1, to free it immediately after the send.
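		// (dataSlice wraps the shared file handle ctx.fh, while each parity slice
		// wraps the SGL or workfile produced above.)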
		if i < ctx.dataSlices {
			sl = dataSlice
		} else {
			sl = &slice{refCnt: *atomic.NewInt32(1), obj: ctx.slices[i].obj, workFQN: ctx.slices[i].workFQN}
		}
		if err := c.sendSlice(ctx, sl, tgt, i); err != nil {
			copyErr = err
		}
	}

	if copyErr != nil {
		nlog.Errorf("Error while copying (data=%d, parity=%d) for %q: %v",
			ctx.dataSlices, ctx.paritySlices, ctx.lom.ObjName, copyErr)
		err = errSliceSendFailed
	} else if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("EC created (data=%d, parity=%d) for %q",
			ctx.dataSlices, ctx.paritySlices, ctx.lom.ObjName)
	}

	return err
}
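// Sizing sketch (illustrative only; it mirrors the arithmetic in newCtx above and
// assumes SliceSize rounds up so that dataSlices*sliceSize covers the entire object,
// which is what the non-negative padSize implies):
//
//	sliceSize := SliceSize(lom.SizeBytes(), dataSlices)      // length of every data/parity slice
//	padSize := sliceSize*int64(dataSlices) - lom.SizeBytes() // zero tail of the last data slice
//
// For example, a 10*cos.MiB object with d=4, p=2 yields sliceSize = 2,621,440 and
// padSize = 0; at 10*cos.MiB + 1 bytes, sliceSize grows to 2,621,441 and padSize = 3,
// i.e. the three zero bytes that the last cos.NewSectionHandle reader in
// initializeSlices appends via its padding argument.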