github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ec/ec.go

// Package ec provides erasure coding (EC) based data protection for AIStore.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package ec

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/memsys"
	"github.com/NVIDIA/aistore/transport"
	"github.com/NVIDIA/aistore/xact/xreg"
)

// The EC module provides data protection on a per-bucket basis. By default,
// data protection is off. To enable it, set the bucket EC configuration:
//	ECConf:
//		Enabled: true|false   # enables or disables protection
//		DataSlices: [1-32]    # the number of data slices
//		ParitySlices: [1-32]  # the number of parity slices
//		ObjSizeLimit: 0       # replication versus erasure coding
//
// NOTE: replicating a small object is cheaper than erasure-coding it.
// The ObjSizeLimit option sets the corresponding threshold. Set it to the
// size (in bytes), or 0 (zero) to use the AIStore default of 256KiB.
//
// NOTE: ParitySlices defines the maximum number of storage targets a cluster
// can lose while still being able to restore the original object.
//
// NOTE: since small objects are always replicated, they always have exactly one
// data slice and #ParitySlices replicas.
//
// NOTE: all slices and replicas must reside on different targets. The target
// list is calculated by HrwTargetList. The first target in the list is the
// "main" target that keeps the full object; the others keep only slices/replicas.
//
// NOTE: all slices must be of the same size, so the last slice may be padded
// with zeros. In most cases, padding makes the total size of the data
// slices slightly bigger than the size of the original object.
//
// NOTE: every slice and replica must have a corresponding metadata file
// located in the same mountpath as the slice/replica itself.
//
// EC local storage directories inside mountpaths:
//	/%ob/ - for the main object and its replicas
//	/%ec/ - for object data and parity slices
//	/%mt/ - for metadata files
//
// How protection works.
//
// Object PUT:
// 1. The main target - the target responsible for keeping the full object
//    data and for restoring the object if damaged - is selected by
//    HrwTarget. A proxy delegates the object PUT request to it.
// 2. The main target calculates all other targets that will keep slices/replicas.
//    For small objects it is #ParitySlices targets, for big ones it is
//    #DataSlices+#ParitySlices targets.
// 3. If the object is small, the main target broadcasts the replicas.
//    Otherwise, the target calculates data and parity slices, then sends them.
//
// Object GET:
// 1. The main target - the target responsible for keeping the full object
//    data and for restoring the object if it becomes damaged - is determined
//    by the HrwTarget algorithm. A proxy delegates the object GET request to it.
// 2. If the main target has the original object, it sends the data back.
//    Otherwise, it tries to look the object up in other mountpaths (if resilver
//    is running) or on remote targets (if rebalance is running).
// 3. If everything fails and EC is enabled for the bucket, the main target
//    initiates the object restoration process:
//    - First, the main target requests the object's metafile from all targets
//      in the cluster. If no target responds with a valid metafile, the object
//      is considered missing.
//    - Otherwise, the main target tries to download and restore the original data:
//      Replica case:
//      The main target requests the targets that have a valid metafile for a
//      replica one by one. When a target sends a valid object, the main target
//      saves the object to local storage and re-uploads its replicas to the targets.
//      EC case:
//      The main target requests the targets that have a valid metafile for slices
//      in parallel. When all the targets respond, the main target starts
//      restoring the object, and, in case of success, saves the restored object
//      to local storage and sends recalculated data and parity slices to the
//      targets that must have a slice but are 'empty' at this moment.
//
// NOTE: the slices are stored on targets in random order, except for the first
// PUT, when the main target stores the slices in the order that the HrwTargetList
// algorithm returns them.
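// The function below is an illustrative sketch, not part of the EC data path:
// given a bucket's EC configuration, it shows how the rules above translate
// into the number of stored copies/slices. It assumes cmn.ECConf exposes the
// Enabled, DataSlices, and ParitySlices fields under these exact names.
func exampleProtectionMode(objSize int64, conf *cmn.ECConf) string {
	if !conf.Enabled {
		return "EC disabled: a single copy, no extra protection"
	}
	if IsECCopy(objSize, conf) {
		// small object: full replicas on 1 (main) + ParitySlices targets
		return fmt.Sprintf("replicate: %d full copies", 1+conf.ParitySlices)
	}
	// large object: erasure-coded into DataSlices data + ParitySlices parity slices
	return fmt.Sprintf("encode: %d data + %d parity slices", conf.DataSlices, conf.ParitySlices)
}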
const (
	ActSplit   = "split"
	ActRestore = "restore"
	ActDelete  = "delete"

	RespStreamName = "ec-resp"
	ReqStreamName  = "ec-req"

	ActClearRequests  = "clear-requests"
	ActEnableRequests = "enable-requests"

	URLCT   = "ct"   // for use in the URL path - requests for slices/replicas
	URLMeta = "meta" // ditto - metadata requests

	// EC switches from SGL to disk when memory pressure is high and the amount of
	// memory required to encode an object exceeds this limit
	objSizeHighMem = 50 * cos.MiB
)

type (
	// request - structure to request an object to be EC'ed or restored
	request struct {
		LIF      core.LIF   // object info
		Action   string     // what to do with the object (see Act* consts)
		ErrCh    chan error // for the final EC result (used only in restore)
		Callback core.OnFinishObj

		putTime time.Time // time when the object is put into the main queue
		tm      time.Time // to measure different steps
		IsCopy  bool      // replicate or use erasure coding
		rebuild bool      // true - internal request to re-encode, e.g., from the ec-encode xaction
	}

	RequestsControlMsg struct {
		Action string
	}

	WriteArgs struct {
		MD         []byte     // CT's metafile content
		Reader     io.Reader  // CT content
		BID        uint64     // bucket ID
		Cksum      *cos.Cksum // object checksum
		Generation int64      // EC generation
		Xact       core.Xact  // xaction that drives it
	}

	// temporarily keeps a slice of object data until it is sent to a remote node
	slice struct {
		obj     cos.ReadOpenCloser // the whole object or its replica
		reader  cos.ReadOpenCloser // used in encoding - a slice of `obj`
		writer  io.Writer          // for parity slices and for downloading slices from other targets when restoring
		twg     *cos.TimeoutGroup  // for synchronous download (when restoring from slices)
		lom     *core.LOM          // for xattrs
		n       int64              // number of bytes sent/received
		refCnt  atomic.Int32       // number of references
		workFQN string             // FQN for the temporary slice/replica
		cksum   *cos.Cksum         // checksum of the slice
		version string             // version of the remote object
	}

	// a source for a data response: the data to send to the caller.
	// If obj is not nil then, after the reader is sent to the remote target,
	// the obj's counter is decreased; when its value drops to zero, the
	// allocated SGL is freed. This logic is required to send a set of
	// sliceReaders that point to the same SGL (broadcasting data slices).
	// An illustrative sketch of this pattern follows the slice methods below.
	dataSource struct {
		reader   cos.ReadOpenCloser // a reader to send to a remote target
		size     int64              // size of the data
		obj      *slice             // internal info about the SGL slice
		metadata *Metadata          // object's metadata
		isSlice  bool               // is it a slice or a replica
		reqType  intraReqType       // request's type: slice/meta request/response
	}
)

type global struct {
	reqPool  sync.Pool
	pmm      *memsys.MMSA // memory manager slab/SGL allocator (pages)
	smm      *memsys.MMSA // ditto, bytes
	emptyReq request
}

var g global

var (
	ErrorECDisabled = errors.New("EC is disabled for bucket")
	ErrorNoMetafile = errors.New("no metafile")
	ErrorNotFound   = errors.New("not found")
)

func Init() {
	g.pmm = core.T.PageMM()
	g.smm = core.T.ByteMM()

	fs.CSM.Reg(fs.ECSliceType, &fs.ECSliceContentResolver{})
	fs.CSM.Reg(fs.ECMetaType, &fs.ECMetaContentResolver{})

	xreg.RegBckXact(&getFactory{})
	xreg.RegBckXact(&putFactory{})
	xreg.RegBckXact(&rspFactory{})
	xreg.RegBckXact(&encFactory{})

	if err := initManager(); err != nil {
		cos.ExitLogf("Failed to init manager: %v", err)
	}
}
///////////
// slice //
///////////

// Frees the allocated memory and removes the slice's temporary file
func (s *slice) free() {
	freeObject(s.obj)
	s.obj = nil
	if s.reader != nil {
		cos.Close(s.reader)
	}
	if s.writer != nil {
		switch w := s.writer.(type) {
		case *os.File:
			cos.Close(w)
		case *memsys.SGL:
			w.Free()
		default:
			debug.FailTypeCast(s.writer)
		}
	}
	if s.workFQN != "" {
		if err := os.Remove(s.workFQN); err != nil && !os.IsNotExist(err) {
			nlog.Errorln(err)
		}
	}
}

// Decreases the number of links to the object (the initial number is set
// at slice creation time). If the number drops to zero, the allocated
// memory/temporary file is cleaned up
func (s *slice) release() {
	if s.obj != nil || s.workFQN != "" {
		refCnt := s.refCnt.Dec()
		if refCnt < 1 {
			s.free()
		}
	}
}

func (s *slice) reopenReader() (reader cos.ReadOpenCloser, err error) {
	if s.reader != nil {
		var rc io.ReadCloser
		reader = s.reader
		switch r := reader.(type) {
		case *memsys.Reader:
			_, err = r.Seek(0, io.SeekStart)
		case *cos.SectionHandle:
			rc, err = r.Open()
			if err == nil {
				reader = rc.(cos.ReadOpenCloser)
			}
		default:
			debug.FailTypeCast(s.reader)
			err = fmt.Errorf("unsupported reader type: %T", s.reader)
		}
		return reader, err
	}

	if sgl, ok := s.obj.(*memsys.SGL); ok {
		reader = memsys.NewReader(sgl)
	} else if s.workFQN != "" {
		reader, err = cos.NewFileHandle(s.workFQN)
	} else {
		debug.FailTypeCast(s.obj)
		err = fmt.Errorf("unsupported obj type: %T", s.obj)
	}
	return reader, err
}
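// Illustrative sketch (not called by the package): the reference-counting
// pattern described for dataSource above - one in-memory object shared by
// several concurrent senders; each sender drops its reference via release(),
// and the SGL is freed when the last reference is gone. The Store method on
// atomic.Int32 is assumed to exist under this name; a real sender would
// transmit memsys.NewReader(sgl) instead of the placeholder comment below.
func exampleShareAmongSenders(sgl *memsys.SGL, numReceivers int) {
	s := &slice{obj: sgl}
	s.refCnt.Store(int32(numReceivers)) // one reference per receiver
	var wg sync.WaitGroup
	for i := 0; i < numReceivers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			defer s.release() // frees the SGL when the last reference is dropped
			// ... send memsys.NewReader(sgl) to a remote target here ...
		}()
	}
	wg.Wait()
}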
//
// misc. utils
//

func allocateReq(action string, lif core.LIF) (req *request) {
	if v := g.reqPool.Get(); v != nil {
		req = v.(*request)
	} else {
		req = &request{}
	}
	req.Action = action
	req.LIF = lif
	return
}

func freeReq(req *request) {
	*req = g.emptyReq
	g.reqPool.Put(req)
}

// SliceSize returns the size of one slice that EC will create for the object
func SliceSize(fileSize int64, slices int) int64 {
	return (fileSize + int64(slices) - 1) / int64(slices)
}

// Monitoring the background transfer of replicas and slices requires
// a unique ID for each of them. Because all replicas/slices of an object have
// the same name, cluster.Uname is not enough to generate a unique ID. Adding an
// extra prefix - an identifier of the destination - solves the issue
func unique(prefix string, bck *meta.Bck, objName string) string {
	return prefix + cos.PathSeparator + bck.MakeUname(objName)
}

func IsECCopy(size int64, ecConf *cmn.ECConf) bool {
	return size < ecConf.ObjSizeLimit || ecConf.ObjSizeLimit == cmn.ObjSizeToAlwaysReplicate
}

// returns whether EC must use disk instead of keeping everything in memory.
// Depends on the available free memory and the size of the object to process
func useDisk(objSize int64, config *cmn.Config) bool {
	if config.EC.DiskOnly {
		return true
	}
	memPressure := g.pmm.Pressure()
	switch memPressure {
	case memsys.OOM, memsys.PressureExtreme:
		return true
	case memsys.PressureHigh:
		return objSize > objSizeHighMem
	default:
		return false
	}
}

// Frees the allocated memory if it is an SGL, or closes the file handle if it is a regular file
func freeObject(r any) {
	if r == nil {
		return
	}
	switch handle := r.(type) {
	case *memsys.SGL:
		if handle != nil {
			handle.Free()
		}
	case *cos.FileHandle:
		if handle != nil {
			// a few slices may share the same handle; on error, all of them release everything
			_ = handle.Close()
		}
	case *os.File:
		if handle != nil {
			cos.Close(handle)
		}
	default:
		debug.FailTypeCast(r)
	}
}

// removes all temporary slices in case of an erasure coding failure
func freeSlices(slices []*slice) {
	for _, s := range slices {
		if s != nil {
			s.free()
		}
	}
}
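// Illustrative sketch for the helpers above (not used by the EC data path):
// for a given object size and EC configuration, estimate the per-slice size
// (with the zero padding noted in the module comment) and the approximate
// cluster-wide capacity the protected object occupies. The footprint formula
// is an inference from the module comment (full object on the main target
// plus slices/replicas elsewhere); metafiles are not counted.
func exampleStorageFootprint(objSize int64, conf *cmn.ECConf) (sliceSize, total int64) {
	if IsECCopy(objSize, conf) {
		// replication: the main copy plus ParitySlices full replicas
		return objSize, objSize * int64(1+conf.ParitySlices)
	}
	// erasure coding: DataSlices+ParitySlices equally sized slices (the last
	// data slice padded with zeros up to sliceSize), plus the full object
	// kept by the main target
	sliceSize = SliceSize(objSize, conf.DataSlices)
	total = objSize + sliceSize*int64(conf.DataSlices+conf.ParitySlices)
	return sliceSize, total
}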
// RequestECMeta returns the EC metadata for an object found on a remote target.
func RequestECMeta(bck *cmn.Bck, objName string, si *meta.Snode, client *http.Client) (*Metadata, error) {
	path := apc.URLPathEC.Join(URLMeta, bck.Name, objName)
	query := url.Values{}
	query = bck.AddToQuery(query)
	url := si.URL(cmn.NetIntraData) + path
	rq, err := http.NewRequest(http.MethodGet, url, http.NoBody)
	if err != nil {
		return nil, err
	}
	rq.URL.RawQuery = query.Encode()
	resp, err := client.Do(rq) //nolint:bodyclose // closed inside cos.Close
	if err != nil {
		return nil, err
	}

	defer cos.Close(resp.Body)
	if resp.StatusCode == http.StatusNotFound {
		return nil, cos.NewErrNotFound(core.T, bck.Cname(objName))
	}
	if resp.StatusCode != http.StatusOK {
		return nil, cmn.NewErrFailedTo(core.T, "request ec md", bck.Cname(objName), err)
	}
	return MetaFromReader(resp.Body)
}
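// Illustrative sketch (not part of the restore path): step one of the
// restoration flow described at the top of this file - ask every target for
// the object's EC metafile and keep whatever valid metadata comes back.
// Snode.ID() is assumed to return the target's daemon ID; error handling is
// reduced to skipping the target.
func exampleCollectMetafiles(bck *cmn.Bck, objName string, targets []*meta.Snode, client *http.Client) map[string]*Metadata {
	found := make(map[string]*Metadata, len(targets))
	for _, si := range targets {
		md, err := RequestECMeta(bck, objName, si, client)
		if err != nil {
			continue // this target has no valid metafile for the object
		}
		found[si.ID()] = md
	}
	return found
}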
// Saves the main replica to local drives
func writeObject(lom *core.LOM, reader io.Reader, size int64, xctn core.Xact) error {
	if size > 0 {
		reader = io.LimitReader(reader, size)
	}
	readCloser := io.NopCloser(reader)
	params := core.AllocPutParams()
	{
		params.WorkTag = "ec"
		params.Reader = readCloser
		params.SkipEC = true
		params.Atime = time.Now()
		params.Size = size
		params.Xact = xctn
		params.OWT = cmn.OwtRebalance
	}
	err := core.T.PutObject(lom, params)
	core.FreePutParams(params)
	return err
}

func validateBckBID(bck *cmn.Bck, bid uint64) error {
	if bid == 0 {
		return nil
	}
	newBck := meta.CloneBck(bck)
	err := newBck.Init(core.T.Bowner())
	if err == nil && newBck.Props.BID != bid {
		err = fmt.Errorf("bucket ID mismatch: local %d, sender %d", newBck.Props.BID, bid)
	}
	return err
}

// WriteSliceAndMeta saves a slice and its metafile
func WriteSliceAndMeta(hdr *transport.ObjHdr, args *WriteArgs) error {
	ct, err := core.NewCTFromBO(&hdr.Bck, hdr.ObjName, core.T.Bowner(), fs.ECSliceType)
	if err != nil {
		return err
	}
	ct.Lock(true)
	ctMeta := ct.Clone(fs.ECMetaType)
	defer func() {
		ct.Unlock(true)
		if err == nil {
			return
		}
		if rmErr := cos.RemoveFile(ct.FQN()); rmErr != nil {
			nlog.Errorf("nested error: save replica -> remove replica: %v", rmErr)
		}
		if rmErr := cos.RemoveFile(ctMeta.FQN()); rmErr != nil {
			nlog.Errorf("nested error: save replica -> remove metafile: %v", rmErr)
		}
	}()
	if args.Generation != 0 {
		if oldMeta, oldErr := LoadMetadata(ctMeta.FQN()); oldErr == nil && oldMeta.Generation > args.Generation {
			return nil
		}
	}
	tmpFQN := ct.Make(fs.WorkfileType)
	if err := ct.Write(args.Reader, hdr.ObjAttrs.Size, tmpFQN); err != nil {
		return err
	}
	if err := ctMeta.Write(bytes.NewReader(args.MD), -1); err != nil {
		return err
	}
	if _, exists := core.T.Bowner().Get().Get(ctMeta.Bck()); !exists {
		err = fmt.Errorf("slice-and-meta: %s metafile saved while bucket %s was being destroyed",
			ctMeta.ObjectName(), ctMeta.Bucket())
		return err
	}
	err = validateBckBID(&hdr.Bck, args.BID)
	return err
}

// WriteReplicaAndMeta saves a replica and its metafile
func WriteReplicaAndMeta(lom *core.LOM, args *WriteArgs) (err error) {
	lom.Lock(false)
	if args.Generation != 0 {
		ctMeta := core.NewCTFromLOM(lom, fs.ECMetaType)
		if oldMeta, oldErr := LoadMetadata(ctMeta.FQN()); oldErr == nil && oldMeta.Generation > args.Generation {
			lom.Unlock(false)
			return nil
		}
	}
	lom.Unlock(false)

	if err = writeObject(lom, args.Reader, lom.SizeBytes(true), args.Xact); err != nil {
		return
	}
	if !args.Cksum.IsEmpty() && args.Cksum.Value() != "" { // NOTE: empty value
		if !lom.EqCksum(args.Cksum) {
			err = cos.NewErrDataCksum(args.Cksum, lom.Checksum(), lom.Cname())
			return
		}
	}
	ctMeta := core.NewCTFromLOM(lom, fs.ECMetaType)
	ctMeta.Lock(true)

	defer func() {
		ctMeta.Unlock(true)
		if err == nil {
			return
		}
		if rmErr := cos.RemoveFile(lom.FQN); rmErr != nil {
			nlog.Errorf("nested error: save replica -> remove replica: %v", rmErr)
		}
		if rmErr := cos.RemoveFile(ctMeta.FQN()); rmErr != nil {
			nlog.Errorf("nested error: save replica -> remove metafile: %v", rmErr)
		}
	}()
	if err = ctMeta.Write(bytes.NewReader(args.MD), -1); err != nil {
		return
	}
	if _, exists := core.T.Bowner().Get().Get(ctMeta.Bck()); !exists {
		err = fmt.Errorf("replica-and-meta: %s metafile saved while bucket %s was being destroyed",
			ctMeta.ObjectName(), ctMeta.Bucket())
		return
	}
	err = validateBckBID(lom.Bucket(), args.BID)
	return
}
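// Illustrative sketch (not wired into the data path): how a receiving target
// could persist an incoming CT with the two helpers above, choosing between a
// full replica and a slice. The isSlice flag, the already-marshaled metadata,
// and the separately supplied bucket ID are assumptions of this sketch; Cksum
// and Generation are left at their zero values for brevity.
func exampleStoreCT(lom *core.LOM, hdr *transport.ObjHdr, payload io.Reader, md []byte, bid uint64, isSlice bool, xctn core.Xact) error {
	args := &WriteArgs{MD: md, Reader: payload, BID: bid, Xact: xctn}
	if isSlice {
		return WriteSliceAndMeta(hdr, args)
	}
	return WriteReplicaAndMeta(lom, args)
}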