github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/shard/recm.go (about)

     1  // Package shard provides Extract(shard), Create(shard), and associated methods
// across all supported archival formats (see cmn/archive/mime.go)
     3  /*
     4   * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
     5   */
     6  package shard
     7  
     8  import (
     9  	"bytes"
    10  	"fmt"
    11  	"io"
    12  	"os"
    13  	"strings"
    14  	"sync"
    15  
    16  	"github.com/NVIDIA/aistore/cmn"
    17  	"github.com/NVIDIA/aistore/cmn/cos"
    18  	"github.com/NVIDIA/aistore/cmn/debug"
    19  	"github.com/NVIDIA/aistore/cmn/nlog"
    20  	"github.com/NVIDIA/aistore/core"
    21  	"github.com/NVIDIA/aistore/ext/dsort/ct"
    22  	"github.com/NVIDIA/aistore/fs"
    23  	"github.com/NVIDIA/aistore/memsys"
    24  	"github.com/pkg/errors"
    25  )
    26  
const (
	// Extract methods: bit flags selecting where an extracted record's
	// content is stored; ExtractToWriter can be combined with the others
	// to additionally stream the content to a caller-provided writer.
	ExtractToMem cos.Bits = 1 << iota
	ExtractToDisk
	ExtractToWriter
)

// recSepa separates the shard name from the record name within a record's
// unique name (see genRecordUname / parseRecordUname).
const recSepa = "|"

// interface guard
var _ RecordExtractor = (*RecordManager)(nil)
    38  
type (
	// extractRecordArgs bundles all inputs needed to extract a single record
	// from a shard (see RecordManager.RecordWithBuffer).
	extractRecordArgs struct {
		shardName     string        // name of the original shard
		fileType      string        // fs content type where the shard is located
		recordName    string        // name of the current record
		r             cos.ReadSizer // body of the record
		w             io.Writer     // required when method is set to ExtractToWriter
		metadata      []byte        // metadata of the record
		extractMethod cos.Bits      // method which needs to be used to extract a record
		offset        int64         // offset of the body in the shard
		buf           []byte        // helper buffer for `CopyBuffer` methods
	}

	// loads content from local or remote target
	ContentLoader interface {
		Load(w io.Writer, rec *Record, obj *RecordObj) (int64, error)
	}

	// RecordExtractor extracts a record's content and registers the record.
	RecordExtractor interface {
		RecordWithBuffer(args *extractRecordArgs) (int64, error)
	}

	// RecordManager tracks all extracted records and where their contents
	// live: in memory (SGL), on disk, or at an offset in the original shard.
	RecordManager struct {
		Records             *Records
		bck                 cmn.Bck
		onDuplicatedRecords func(string) error // callback invoked on duplicated record (react: abort/ignore/warn)

		extractCreator  RW           // shard-format reader/writer
		keyExtractor    KeyExtractor // extracts the sorting/shuffling key from a record
		contents        *sync.Map    // fullContentPath -> *memsys.SGL (in-memory record contents)
		extractionPaths *sync.Map    // Keys correspond to all paths to record contents on disk.

		enqueued struct {
			mu      sync.Mutex
			records []*Records // records received from other targets which are waiting to be merged
		}
	}
)
    77  
    78  ///////////////////
    79  // RecordManager //
    80  ///////////////////
    81  
    82  func NewRecordManager(bck cmn.Bck, extractCreator RW, keyExtractor KeyExtractor, onDupRecs func(string) error) *RecordManager {
    83  	return &RecordManager{
    84  		Records:             NewRecords(1000),
    85  		bck:                 bck,
    86  		onDuplicatedRecords: onDupRecs,
    87  		extractCreator:      extractCreator,
    88  		keyExtractor:        keyExtractor,
    89  		contents:            &sync.Map{},
    90  		extractionPaths:     &sync.Map{},
    91  	}
    92  }
    93  
// RecordWithBuffer extracts a single record from a shard and stores its
// content according to args.extractMethod: in memory (SGL), on disk, or - when
// the shard format supports offsets - as an (offset, size) reference into the
// original shard. It also extracts the record's sorting/shuffling key and
// registers the record (with a single RecordObj) in recm.Records.
// Returns the number of content bytes extracted (metadata not included).
func (recm *RecordManager) RecordWithBuffer(args *extractRecordArgs) (size int64, err error) {
	var (
		storeType        string
		contentPath      string
		fullContentPath  string
		mdSize           int64
		ext              = cosExt(args.recordName)
		recordUniqueName = genRecordUname(args.shardName, args.recordName)
	)

	// handle record duplications (see m.react)
	if recm.Records.Exists(recordUniqueName, ext) {
		msg := fmt.Sprintf("record %q is duplicated", args.recordName)
		recm.Records.DeleteDup(recordUniqueName, ext)

		err = recm.onDuplicatedRecords(msg)
		if err != nil {
			return 0, err // react: abort
		}
		// react: ignore or warn
	}

	debug.Assert(!args.extractMethod.Has(ExtractToWriter) || args.w != nil)

	r, ske, needRead := recm.keyExtractor.PrepareExtractor(args.recordName, args.r, ext)
	switch {
	case args.extractMethod.Has(ExtractToMem):
		// store metadata followed by content in a scatter-gather list (SGL)
		mdSize = int64(len(args.metadata))
		storeType = SGLStoreType
		contentPath, fullContentPath = recm.encodeRecordName(storeType, args.shardName, args.recordName)

		sgl := core.T.PageMM().NewSGL(r.Size() + int64(len(args.metadata)))
		// No need for `io.CopyBuffer` since SGL implements `io.ReaderFrom`.
		if _, err = io.Copy(sgl, bytes.NewReader(args.metadata)); err != nil {
			sgl.Free()
			return 0, errors.WithStack(err)
		}

		var dst io.Writer = sgl
		if args.extractMethod.Has(ExtractToWriter) {
			// tee the content to the caller-provided writer as well
			dst = io.MultiWriter(sgl, args.w)
		}
		if size, err = io.CopyBuffer(dst, r, args.buf); err != nil {
			sgl.Free()
			return size, errors.WithStack(err)
		}
		recm.contents.Store(fullContentPath, sgl)
	case args.extractMethod.Has(ExtractToDisk) && recm.extractCreator.SupportsOffset():
		// the shard format supports offsets: reference the content at
		// args.offset inside the original shard instead of copying it
		mdSize, size = recm.extractCreator.MetadataSize(), r.Size()
		storeType = OffsetStoreType
		contentPath, _ = recm.encodeRecordName(storeType, args.shardName, args.recordName)

		// If extractor was initialized we need to read the content, since it
		// may contain information about the sorting/shuffling key.
		if needRead || args.w != nil {
			dst := io.Discard
			if args.w != nil {
				dst = args.w
			}
			if _, err := io.CopyBuffer(dst, r, args.buf); err != nil {
				return 0, errors.WithStack(err)
			}
		}
	case args.extractMethod.Has(ExtractToDisk):
		// persist metadata followed by content in a separate file on disk
		mdSize = int64(len(args.metadata))
		storeType = DiskStoreType
		contentPath, fullContentPath = recm.encodeRecordName(storeType, args.shardName, args.recordName)

		var f *os.File
		if f, err = cos.CreateFile(fullContentPath); err != nil {
			return size, errors.WithStack(err)
		}
		if size, err = copyMetadataAndData(f, r, args.metadata, args.buf); err != nil {
			cos.Close(f)
			return size, errors.WithStack(err)
		}
		cos.Close(f)
		// remember the path so Cleanup can remove the file later
		recm.extractionPaths.Store(fullContentPath, struct{}{})
	default:
		debug.Assertf(false, "%d %d", args.extractMethod, args.extractMethod&ExtractToDisk)
	}

	var key any
	if key, err = recm.keyExtractor.ExtractKey(ske); err != nil {
		return size, errors.WithStack(err)
	}

	if contentPath == "" || storeType == "" {
		debug.Assertf(false, "shardName: %q, recordName: %q, storeType: %q", args.shardName, args.recordName, storeType)
	}
	recm.Records.Insert(&Record{
		Key:      key,
		Name:     recordUniqueName,
		DaemonID: core.T.SID(),
		Objects: []*RecordObj{{
			ContentPath:    contentPath,
			ObjectFileType: args.fileType,
			StoreType:      storeType,
			Offset:         args.offset,
			MetadataSize:   mdSize,
			Size:           size,
			Extension:      ext,
		}},
	})
	return size, nil
}
   200  
   201  func (recm *RecordManager) EnqueueRecords(records *Records) {
   202  	recm.enqueued.mu.Lock()
   203  	recm.enqueued.records = append(recm.enqueued.records, records)
   204  	recm.enqueued.mu.Unlock()
   205  }
   206  
   207  func (recm *RecordManager) MergeEnqueuedRecords() {
   208  	for {
   209  		recm.enqueued.mu.Lock()
   210  		lastIdx := len(recm.enqueued.records) - 1
   211  		if lastIdx < 0 {
   212  			recm.enqueued.mu.Unlock()
   213  			break
   214  		}
   215  		records := recm.enqueued.records[lastIdx]
   216  		recm.enqueued.records[lastIdx] = nil
   217  		recm.enqueued.records = recm.enqueued.records[:lastIdx]
   218  		recm.enqueued.mu.Unlock()
   219  
   220  		recm.Records.merge(records)
   221  	}
   222  	cos.FreeMemToOS(false /*force*/)
   223  }
   224  
   225  func (recm *RecordManager) encodeRecordName(storeType, shardName, recordName string) (contentPath, fullContentPath string) {
   226  	switch storeType {
   227  	case OffsetStoreType:
   228  		// For offset:
   229  		//  * contentPath = shard name (eg. shard_1.tar)
   230  		//  * fullContentPath = not used
   231  		return shardName, ""
   232  	case SGLStoreType:
   233  		// For sgl:
   234  		//  * contentPath = recordUniqueName with extension (eg. shard_1-record_name.cls)
   235  		//  * fullContentPath = recordUniqueName with extension (eg. shard_1-record_name.cls)
   236  		recordExt := cosExt(recordName)
   237  		contentPath := genRecordUname(shardName, recordName) + recordExt
   238  		return contentPath, contentPath // unique key for record
   239  	case DiskStoreType:
   240  		// For disk:
   241  		//  * contentPath = recordUniqueName with extension  (eg. shard_1-record_name.cls)
   242  		//  * fullContentPath = fqn to recordUniqueName with extension (eg. <bucket_fqn>/shard_1-record_name.cls)
   243  		recordExt := cosExt(recordName)
   244  		contentPath := genRecordUname(shardName, recordName) + recordExt
   245  		c, err := core.NewCTFromBO(&recm.bck, contentPath, nil)
   246  		debug.AssertNoErr(err)
   247  		return contentPath, c.Make(ct.DsortFileType)
   248  	default:
   249  		debug.Assert(false, storeType)
   250  		return "", ""
   251  	}
   252  }
   253  
   254  func (recm *RecordManager) FullContentPath(obj *RecordObj) string {
   255  	switch obj.StoreType {
   256  	case OffsetStoreType:
   257  		// To convert contentPath to fullContentPath we need to make shard name
   258  		// full FQN.
   259  		ct, err := core.NewCTFromBO(&recm.bck, obj.ContentPath, nil)
   260  		debug.AssertNoErr(err)
   261  		return ct.Make(obj.ObjectFileType)
   262  	case SGLStoreType:
   263  		// To convert contentPath to fullContentPath we need to add record
   264  		// object extension.
   265  		return obj.ContentPath
   266  	case DiskStoreType:
   267  		// To convert contentPath to fullContentPath we need to make record
   268  		// unique name full FQN.
   269  		contentPath := obj.ContentPath
   270  		c, err := core.NewCTFromBO(&recm.bck, contentPath, nil)
   271  		debug.AssertNoErr(err)
   272  		return c.Make(ct.DsortFileType)
   273  	default:
   274  		debug.Assert(false, obj.StoreType)
   275  		return ""
   276  	}
   277  }
   278  
// FreeMem migrates a single in-memory (SGL) record content to `newStoreType`
// (offset or disk) and frees the SGL. `value` must be the *memsys.SGL that was
// stored in recm.contents under `fullContentPath`. Returns the number of bytes
// freed (the SGL size), or 0 when the record or its object is not found
// (e.g., removed as a duplicate) or the disk write fails.
func (recm *RecordManager) FreeMem(fullContentPath, newStoreType string, value any, buf []byte) (n int64) {
	sgl, ok := value.(*memsys.SGL)
	debug.Assert(ok)

	recordObjExt := cosExt(fullContentPath)
	contentPath := strings.TrimSuffix(fullContentPath, recordObjExt)

	recm.Records.Lock()
	defer recm.Records.Unlock()

	// In SGLs `contentPath == recordUniqueName`
	record, exists := recm.Records.Find(contentPath)
	if !exists {
		// Generally should not happen but it is not proven that it cannot.
		// There is nothing wrong with just returning here though.
		nlog.Errorln("failed to find", fullContentPath, recordObjExt, contentPath) // TODO: FastV
		return
	}

	idx := record.find(recordObjExt)
	if idx == -1 {
		// Duplicated records are removed so we cannot assert here.
		return
	}
	obj := record.Objects[idx]

	debug.Assert(obj.StoreType == SGLStoreType, obj.StoreType+" vs "+SGLStoreType) // only SGLs are supported

	switch newStoreType {
	case OffsetStoreType:
		// content stays in the original shard; record only the shard name and
		// metadata size
		shardName, _ := parseRecordUname(record.Name)
		obj.ContentPath = shardName
		obj.MetadataSize = recm.extractCreator.MetadataSize()
	case DiskStoreType:
		// NOTE(review): obj.StoreType is still SGLStoreType here, so
		// FullContentPath returns obj.ContentPath as-is (not a disk FQN) -
		// verify this is the intended destination path.
		diskPath := recm.FullContentPath(obj)
		// No matter what the outcome we should store `path` in
		// `extractionPaths` to make sure that all files, even incomplete ones,
		// are deleted (if the file will not exist this is not much of a
		// problem).
		recm.extractionPaths.Store(diskPath, struct{}{})

		if _, err := cos.SaveReader(diskPath, sgl, buf, cos.ChecksumNone, -1); err != nil {
			nlog.Errorln(err)
			return
		}
	default:
		debug.Assert(false, newStoreType)
	}

	obj.StoreType = newStoreType
	recm.contents.Delete(fullContentPath)
	n = sgl.Size()
	sgl.Free()
	return
}
   334  
// RecordContents returns the map of in-memory record contents
// (fullContentPath -> *memsys.SGL).
func (recm *RecordManager) RecordContents() *sync.Map {
	return recm.contents
}
   338  
// ExtractionPaths returns the set of on-disk paths holding extracted record
// contents (keys are the paths; values are unused placeholders).
func (recm *RecordManager) ExtractionPaths() *sync.Map {
	return recm.extractionPaths
}
   342  
// Cleanup drains all records, removes extracted contents from disk, frees all
// in-memory (SGL) contents, and releases page-memory back to the OS.
// The RecordManager must not be used afterwards: `extractionPaths` and
// `contents` are nil-ed out.
func (recm *RecordManager) Cleanup() {
	recm.Records.Drain()
	recm.extractionPaths.Range(func(k, _ any) bool {
		if err := fs.RemoveAll(k.(string)); err != nil {
			nlog.Errorf("could not remove extraction path (%v) from previous run, err: %v", k, err)
		}
		recm.extractionPaths.Delete(k)
		return true
	})
	recm.extractionPaths = nil
	recm.contents.Range(func(k, v any) bool {
		if sgl, ok := v.(*memsys.SGL); ok {
			sgl.Free()
		}
		recm.contents.Delete(k)
		return true
	})
	recm.contents = nil

	// NOTE: may call cos.FreeMemToOS
	core.T.PageMM().FreeSpec(memsys.FreeSpec{
		Totally: true,
		ToOS:    true,
		MinSize: 1, // force toGC to free all (even small) memory to system
	})
}
   369  
   370  func copyMetadataAndData(dst io.Writer, src io.Reader, metadata, buf []byte) (int64, error) {
   371  	// Save metadata to dst
   372  	if _, err := io.CopyBuffer(dst, bytes.NewReader(metadata), buf); err != nil {
   373  		return 0, err
   374  	}
   375  	// Save data to dst
   376  	return io.CopyBuffer(dst, src, buf)
   377  }
   378  
   379  // slightly altered cos.Ext to handle an additional "stop": record uname separator
   380  // (that is, in addition to conventional path separator)
   381  func cosExt(path string) (ext string) {
   382  	for i := len(path) - 1; i >= 0 && !os.IsPathSeparator(path[i]) && path[i] != recSepa[0]; i-- {
   383  		if path[i] == '.' {
   384  			ext = path[i:]
   385  		}
   386  	}
   387  	return
   388  }
   389  
   390  func genRecordUname(shardName, recordName string) string {
   391  	recordWithoutExt := strings.TrimSuffix(recordName, cosExt(recordName))
   392  	return shardName + recSepa + recordWithoutExt
   393  }
   394  
   395  func parseRecordUname(recordUniqueName string) (shardName, recordName string) {
   396  	splits := strings.SplitN(recordUniqueName, recSepa, 2)
   397  	return splits[0], splits[1]
   398  }