// Package shard provides Extract(shard), Create(shard), and associated methods
// across all supported archival formats (see cmn/archive/mime.go)
/*
 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
 */
package shard

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"strings"
	"sync"

	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/ext/dsort/ct"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/memsys"
	"github.com/pkg/errors"
)

const (
	// Extract methods: bit flags selecting where an extracted record's content goes.
	// ExtractToWriter may be combined with the other two (see RecordWithBuffer).
	ExtractToMem cos.Bits = 1 << iota
	ExtractToDisk
	ExtractToWriter
)

// recSepa separates the shard name from the record name in a record's unique name
// (see genRecordUname / parseRecordUname).
const recSepa = "|"

// interface guard
var _ RecordExtractor = (*RecordManager)(nil)

type (
	extractRecordArgs struct {
		shardName     string        // name of the original shard
		fileType      string        // fs content type where the shard is located
		recordName    string        // name of the current record
		r             cos.ReadSizer // body of the record
		w             io.Writer     // required when method is set to ExtractToWriter
		metadata      []byte        // metadata of the record
		extractMethod cos.Bits      // method which needs to be used to extract a record
		offset        int64         // offset of the body in the shard
		buf           []byte        // helper buffer for `CopyBuffer` methods
	}

	// loads content from local or remote target
	ContentLoader interface {
		Load(w io.Writer, rec *Record, obj *RecordObj) (int64, error)
	}

	RecordExtractor interface {
		RecordWithBuffer(args *extractRecordArgs) (int64, error)
	}

	// RecordManager tracks all records extracted on this target: their metadata
	// (Records), their in-memory contents (SGLs in `contents`), and the on-disk
	// files they were extracted to (`extractionPaths`).
	RecordManager struct {
		Records             *Records
		bck                 cmn.Bck
		onDuplicatedRecords func(string) error // reaction to a duplicated record (abort/ignore/warn)

		extractCreator  RW
		keyExtractor    KeyExtractor
		contents        *sync.Map // fullContentPath => *memsys.SGL (ExtractToMem contents)
		extractionPaths *sync.Map // Keys correspond to all paths to record contents on disk.

		enqueued struct {
			mu      sync.Mutex
			records []*Records // records received from other targets which are waiting to be merged
		}
	}
)

/////////////////// 
// RecordManager //
///////////////////

// NewRecordManager constructs a RecordManager for the given bucket with the
// provided extract-creator, key extractor, and duplicated-record callback.
func NewRecordManager(bck cmn.Bck, extractCreator RW, keyExtractor KeyExtractor, onDupRecs func(string) error) *RecordManager {
	return &RecordManager{
		Records:             NewRecords(1000),
		bck:                 bck,
		onDuplicatedRecords: onDupRecs,
		extractCreator:      extractCreator,
		keyExtractor:        keyExtractor,
		contents:            &sync.Map{},
		extractionPaths:     &sync.Map{},
	}
}

// RecordWithBuffer extracts a single record according to args.extractMethod:
//   - ExtractToMem:  metadata+body are copied into a newly allocated SGL and
//     remembered in recm.contents;
//   - ExtractToDisk + SupportsOffset(): only the (shard-name, offset) location
//     is recorded — the body may still be read through to compute the sort key;
//   - ExtractToDisk: metadata+body are written to a new file under the dsort
//     content type, and the path is remembered in recm.extractionPaths.
//
// ExtractToWriter additionally tees the body into args.w.
// On success the record (with its sorting key) is inserted into recm.Records.
// Returns the number of body bytes copied.
func (recm *RecordManager) RecordWithBuffer(args *extractRecordArgs) (size int64, err error) {
	var (
		storeType        string
		contentPath      string
		fullContentPath  string
		mdSize           int64
		ext              = cosExt(args.recordName)
		recordUniqueName = genRecordUname(args.shardName, args.recordName)
	)

	// handle record duplications (see m.react)
	if recm.Records.Exists(recordUniqueName, ext) {
		msg := fmt.Sprintf("record %q is duplicated", args.recordName)
		recm.Records.DeleteDup(recordUniqueName, ext)

		err = recm.onDuplicatedRecords(msg)
		if err != nil {
			return 0, err // react: abort
		}
		// react: ignore or warn
	}

	debug.Assert(!args.extractMethod.Has(ExtractToWriter) || args.w != nil)

	r, ske, needRead := recm.keyExtractor.PrepareExtractor(args.recordName, args.r, ext)
	switch {
	case args.extractMethod.Has(ExtractToMem):
		mdSize = int64(len(args.metadata))
		storeType = SGLStoreType
		contentPath, fullContentPath = recm.encodeRecordName(storeType, args.shardName, args.recordName)

		sgl := core.T.PageMM().NewSGL(r.Size() + int64(len(args.metadata)))
		// No need for `io.CopyBuffer` since SGL implements `io.ReaderFrom`.
		if _, err = io.Copy(sgl, bytes.NewReader(args.metadata)); err != nil {
			sgl.Free()
			return 0, errors.WithStack(err)
		}

		var dst io.Writer = sgl
		if args.extractMethod.Has(ExtractToWriter) {
			dst = io.MultiWriter(sgl, args.w)
		}
		if size, err = io.CopyBuffer(dst, r, args.buf); err != nil {
			sgl.Free()
			return size, errors.WithStack(err)
		}
		recm.contents.Store(fullContentPath, sgl)
	case args.extractMethod.Has(ExtractToDisk) && recm.extractCreator.SupportsOffset():
		mdSize, size = recm.extractCreator.MetadataSize(), r.Size()
		storeType = OffsetStoreType
		contentPath, _ = recm.encodeRecordName(storeType, args.shardName, args.recordName)

		// If extractor was initialized we need to read the content, since it
		// may contain information about the sorting/shuffling key.
		if needRead || args.w != nil {
			dst := io.Discard
			if args.w != nil {
				dst = args.w
			}
			if _, err := io.CopyBuffer(dst, r, args.buf); err != nil {
				return 0, errors.WithStack(err)
			}
		}
	case args.extractMethod.Has(ExtractToDisk):
		mdSize = int64(len(args.metadata))
		storeType = DiskStoreType
		contentPath, fullContentPath = recm.encodeRecordName(storeType, args.shardName, args.recordName)

		var f *os.File
		if f, err = cos.CreateFile(fullContentPath); err != nil {
			return size, errors.WithStack(err)
		}
		if size, err = copyMetadataAndData(f, r, args.metadata, args.buf); err != nil {
			cos.Close(f)
			return size, errors.WithStack(err)
		}
		cos.Close(f)
		recm.extractionPaths.Store(fullContentPath, struct{}{})
	default:
		debug.Assertf(false, "%d %d", args.extractMethod, args.extractMethod&ExtractToDisk)
	}

	var key any
	if key, err = recm.keyExtractor.ExtractKey(ske); err != nil {
		return size, errors.WithStack(err)
	}

	if contentPath == "" || storeType == "" {
		debug.Assertf(false, "shardName: %q, recordName: %q, storeType: %q", args.shardName, args.recordName, storeType)
	}
	recm.Records.Insert(&Record{
		Key:      key,
		Name:     recordUniqueName,
		DaemonID: core.T.SID(),
		Objects: []*RecordObj{{
			ContentPath:    contentPath,
			ObjectFileType: args.fileType,
			StoreType:      storeType,
			Offset:         args.offset,
			MetadataSize:   mdSize,
			Size:           size,
			Extension:      ext,
		}},
	})
	return size, nil
}

// EnqueueRecords queues records (received from another target) for a later
// merge via MergeEnqueuedRecords.
func (recm *RecordManager) EnqueueRecords(records *Records) {
	recm.enqueued.mu.Lock()
	recm.enqueued.records = append(recm.enqueued.records, records)
	recm.enqueued.mu.Unlock()
}

// MergeEnqueuedRecords drains the queue, merging each batch into recm.Records.
// Batches are popped one at a time (LIFO) so the lock is not held during merge.
func (recm *RecordManager) MergeEnqueuedRecords() {
	for {
		recm.enqueued.mu.Lock()
		lastIdx := len(recm.enqueued.records) - 1
		if lastIdx < 0 {
			recm.enqueued.mu.Unlock()
			break
		}
		records := recm.enqueued.records[lastIdx]
		recm.enqueued.records[lastIdx] = nil // allow the merged batch to be GC-ed
		recm.enqueued.records = recm.enqueued.records[:lastIdx]
		recm.enqueued.mu.Unlock()

		recm.Records.merge(records)
	}
	cos.FreeMemToOS(false /*force*/)
}

// encodeRecordName maps (storeType, shardName, recordName) to the record's
// contentPath (stored in RecordObj) and, where applicable, its fullContentPath.
func (recm *RecordManager) encodeRecordName(storeType, shardName, recordName string) (contentPath, fullContentPath string) {
	switch storeType {
	case OffsetStoreType:
		// For offset:
		//  * contentPath = shard name (eg. shard_1.tar)
		//  * fullContentPath = not used
		return shardName, ""
	case SGLStoreType:
		// For sgl:
		//  * contentPath = recordUniqueName with extension (eg. shard_1-record_name.cls)
		//  * fullContentPath = recordUniqueName with extension (eg. shard_1-record_name.cls)
		recordExt := cosExt(recordName)
		contentPath := genRecordUname(shardName, recordName) + recordExt
		return contentPath, contentPath // unique key for record
	case DiskStoreType:
		// For disk:
		//  * contentPath = recordUniqueName with extension (eg. shard_1-record_name.cls)
		//  * fullContentPath = fqn to recordUniqueName with extension (eg. <bucket_fqn>/shard_1-record_name.cls)
		recordExt := cosExt(recordName)
		contentPath := genRecordUname(shardName, recordName) + recordExt
		c, err := core.NewCTFromBO(&recm.bck, contentPath, nil)
		debug.AssertNoErr(err)
		return contentPath, c.Make(ct.DsortFileType)
	default:
		debug.Assert(false, storeType)
		return "", ""
	}
}

// FullContentPath resolves a RecordObj's contentPath to a full path (FQN for
// offset/disk store types; the contentPath itself for SGLs).
func (recm *RecordManager) FullContentPath(obj *RecordObj) string {
	switch obj.StoreType {
	case OffsetStoreType:
		// To convert contentPath to fullContentPath we need to make shard name
		// full FQN.
		ct, err := core.NewCTFromBO(&recm.bck, obj.ContentPath, nil)
		debug.AssertNoErr(err)
		return ct.Make(obj.ObjectFileType)
	case SGLStoreType:
		// For SGLs the contentPath already is the unique key (see encodeRecordName).
		return obj.ContentPath
	case DiskStoreType:
		// To convert contentPath to fullContentPath we need to make record
		// unique name full FQN.
		contentPath := obj.ContentPath
		c, err := core.NewCTFromBO(&recm.bck, contentPath, nil)
		debug.AssertNoErr(err)
		return c.Make(ct.DsortFileType)
	default:
		debug.Assert(false, obj.StoreType)
		return ""
	}
}

// FreeMem converts an SGL-stored record object (identified by fullContentPath,
// with `value` being the SGL) to newStoreType — either OffsetStoreType (just
// re-point at the original shard) or DiskStoreType (persist the SGL to disk) —
// and frees the SGL. Returns the number of bytes released, or 0 when the
// record/object no longer exists or persisting fails.
func (recm *RecordManager) FreeMem(fullContentPath, newStoreType string, value any, buf []byte) (n int64) {
	sgl, ok := value.(*memsys.SGL)
	debug.Assert(ok)

	recordObjExt := cosExt(fullContentPath)
	contentPath := strings.TrimSuffix(fullContentPath, recordObjExt)

	recm.Records.Lock()
	defer recm.Records.Unlock()

	// In SGLs `contentPath == recordUniqueName`
	record, exists := recm.Records.Find(contentPath)
	if !exists {
		// Generally should not happen but it is not proven that it cannot.
		// There is nothing wrong with just returning here though.
		nlog.Errorln("failed to find", fullContentPath, recordObjExt, contentPath) // TODO: FastV
		return
	}

	idx := record.find(recordObjExt)
	if idx == -1 {
		// Duplicated records are removed so we cannot assert here.
		return
	}
	obj := record.Objects[idx]

	debug.Assert(obj.StoreType == SGLStoreType, obj.StoreType+" vs "+SGLStoreType) // only SGLs are supported

	switch newStoreType {
	case OffsetStoreType:
		shardName, _ := parseRecordUname(record.Name)
		obj.ContentPath = shardName
		obj.MetadataSize = recm.extractCreator.MetadataSize()
	case DiskStoreType:
		diskPath := recm.FullContentPath(obj)
		// No matter what the outcome we should store `path` in
		// `extractionPaths` to make sure that all files, even incomplete ones,
		// are deleted (if the file will not exist this is not much of a
		// problem).
		recm.extractionPaths.Store(diskPath, struct{}{})

		if _, err := cos.SaveReader(diskPath, sgl, buf, cos.ChecksumNone, -1); err != nil {
			nlog.Errorln(err)
			return
		}
	default:
		debug.Assert(false, newStoreType)
	}

	obj.StoreType = newStoreType
	recm.contents.Delete(fullContentPath)
	n = sgl.Size()
	sgl.Free()
	return
}

// RecordContents returns the map of in-memory (SGL) record contents.
func (recm *RecordManager) RecordContents() *sync.Map {
	return recm.contents
}

// ExtractionPaths returns the map of on-disk record content paths.
func (recm *RecordManager) ExtractionPaths() *sync.Map {
	return recm.extractionPaths
}

// Cleanup drains all records, removes every extraction path from disk, frees
// all in-memory (SGL) contents, and releases page-allocator memory to the OS.
func (recm *RecordManager) Cleanup() {
	recm.Records.Drain()
	recm.extractionPaths.Range(func(k, _ any) bool {
		if err := fs.RemoveAll(k.(string)); err != nil {
			nlog.Errorf("could not remove extraction path (%v) from previous run, err: %v", k, err)
		}
		recm.extractionPaths.Delete(k)
		return true
	})
	recm.extractionPaths = nil
	recm.contents.Range(func(k, v any) bool {
		if sgl, ok := v.(*memsys.SGL); ok {
			sgl.Free()
		}
		recm.contents.Delete(k)
		return true
	})
	recm.contents = nil

	// NOTE: may call cos.FreeMemToOS
	core.T.PageMM().FreeSpec(memsys.FreeSpec{
		Totally: true,
		ToOS:    true,
		MinSize: 1, // force toGC to free all (even small) memory to system
	})
}

// copyMetadataAndData writes `metadata` followed by the body read from `src`
// into `dst`, returning the number of body bytes copied.
func copyMetadataAndData(dst io.Writer, src io.Reader, metadata, buf []byte) (int64, error) {
	// Save metadata to dst
	if _, err := io.CopyBuffer(dst, bytes.NewReader(metadata), buf); err != nil {
		return 0, err
	}
	// Save data to dst
	return io.CopyBuffer(dst, src, buf)
}

// slightly altered cos.Ext to handle an additional "stop": record uname separator
// (that is, in addition to conventional path separator)
func cosExt(path string) (ext string) {
	for i := len(path) - 1; i >= 0 && !os.IsPathSeparator(path[i]) && path[i] != recSepa[0]; i-- {
		if path[i] == '.' {
			ext = path[i:]
		}
	}
	return
}

// genRecordUname builds a record's unique name: "<shardName>|<recordName-without-ext>".
func genRecordUname(shardName, recordName string) string {
	recordWithoutExt := strings.TrimSuffix(recordName, cosExt(recordName))
	return shardName + recSepa + recordWithoutExt
}

// parseRecordUname is the inverse of genRecordUname; it assumes the separator
// is present (panics otherwise).
func parseRecordUname(recordUniqueName string) (shardName, recordName string) {
	splits := strings.SplitN(recordUniqueName, recSepa, 2)
	return splits[0], splits[1]
}