github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/shard/record.go (about)

     1  // Package shard provides Extract(shard), Create(shard), and associated methods
     2  // across all supported archival formats (see cmn/archive/mime.go)
     3  /*
     4   * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
     5   */
     6  package shard
     7  
     8  import (
     9  	"encoding/json"
    10  	"sync"
    11  	"unsafe"
    12  
    13  	"github.com/NVIDIA/aistore/cmn/debug"
    14  	"github.com/pkg/errors"
    15  )
    16  
    17  //
    18  // NOTE: changes in this source MAY require re-running `msgp` code generation - see docs/msgp.md for details.
    19  //
    20  // NOTE: Since tags for unexported fields are not supported, before running the generation
    21  //       rename `Records.arr` to `Records.Arr`, and rename it back afterwards.
    22  //
    23  
// interface guard: compile-time checks that *Records implements both
// json.Marshaler and json.Unmarshaler.
var (
	_ json.Marshaler   = (*Records)(nil)
	_ json.Unmarshaler = (*Records)(nil)
)
    29  
// Store types recorded in RecordObj.StoreType; each one tells where the content
// of a record object is kept.
const (
	// Values are small to save memory.
	OffsetStoreType = "o" // content is addressed by its offset within the shard
	SGLStoreType    = "s" // content is kept in SGLs
	DiskStoreType   = "d" // content has been extracted to disk
)
    36  
type (
	// RecordObj describes a single object of a record. Objects inside a single
	// record have different extensions.
	RecordObj struct {
		// Can represent one of the following:
		//  * Shard name - in case offset is used.
		//  * Key for extractCreator's RecordContents - records stored in SGLs.
		//  * Location (full path) on disk where extracted record has been placed.
		//
		// To get path for given object you need to use `FullContentPath` method.
		ContentPath string `msg:"p"  json:"p"`

		// Filesystem file type where the shard is stored - used to determine
		// location for content path when asking filesystem.
		ObjectFileType string `msg:"ft" json:"ft"`

		// Determines where the record has been stored, can be either: OffsetStoreType,
		// SGLStoreType, DiskStoreType.
		StoreType string `msg:"st" json:"st"`

		// If set, determines the offset in shard file where the record begins.
		// NOTE: the integer fields below carry the `string` JSON option, i.e. they
		// are encoded as quoted strings in JSON.
		Offset       int64  `msg:"f,omitempty" json:"f,string,omitempty"`
		MetadataSize int64  `msg:"ms" json:"ms,string"` // NOTE(review): presumably the size of per-object metadata - confirm
		Size         int64  `msg:"s" json:"s,string"`   // object size; summed up by Record.TotalSize
		Extension    string `msg:"e" json:"e"`          // identifies the object within its record (see Record.find)
	}

	// Record represents the metadata corresponding to a single file from a shard.
	Record struct {
		Key      any    `msg:"k" json:"k"` // Used to determine the sorting order.
		Name     string `msg:"n" json:"n"` // Name that uniquely identifies record across all shards.
		DaemonID string `msg:"d" json:"d"` // ID of the target which maintains the contents for this record.
		// All objects associated with given record. Record can be composed of
		// multiple objects which have the same name but different extension.
		Objects []*RecordObj `msg:"o" json:"o"`
	}

	// Records abstracts an array of records. Insert, DeleteDup, Exists, Drain and
	// RecordMemorySize acquire the embedded RWMutex; the remaining methods
	// (e.g. Find, All, Slice, Len, Swap, Less) do not lock and must be
	// synchronized by the caller when used concurrently.
	Records struct {
		arr              []*Record           `msg:"a"` // the records; the only serialized (msgp) field
		m                map[string]*Record  `msg:"-"` // Record.Name => record (maintained by Insert)
		dups             map[string]struct{} `msg:"-"` // contains duplicate object names, if any
		totalObjectCount int                 `msg:"-"` // total number of objects in all records (dups are removed so not counted)
		sync.RWMutex     `msg:"-"`
	}
)
    83  
    84  ////////////
    85  // Record //
    86  ////////////
    87  
    88  // Merges two records into single one. It is required for records to have the
    89  // same Name. Since records should only differ on objects this is the thing that
    90  // is actually merged.
    91  func (r *Record) mergeObjects(other *Record) {
    92  	debug.Assert(r.Name == other.Name, r.Name+" vs "+other.Name)
    93  	if r.Key == nil && other.Key != nil {
    94  		r.Key = other.Key
    95  	}
    96  	r.Objects = append(r.Objects, other.Objects...)
    97  }
    98  
    99  func (r *Record) find(ext string) int {
   100  	for idx, obj := range r.Objects {
   101  		if obj.Extension == ext {
   102  			return idx
   103  		}
   104  	}
   105  	return -1
   106  }
   107  
   108  func (r *Record) exists(ext string) bool {
   109  	return r.find(ext) >= 0
   110  }
   111  
   112  func (r *Record) delete(ext string) (deleted bool) {
   113  	foundIdx := r.find(ext)
   114  	if foundIdx >= 0 {
   115  		// NOTE: We are required to preserve the order.
   116  		r.Objects = append(r.Objects[:foundIdx], r.Objects[foundIdx+1:]...)
   117  		return true
   118  	}
   119  	return false
   120  }
   121  
   122  func (r *Record) TotalSize() int64 {
   123  	size := int64(0)
   124  	for _, obj := range r.Objects {
   125  		size += obj.Size
   126  	}
   127  	return size
   128  }
   129  
   130  func (r *Record) MakeUniqueName(obj *RecordObj) string {
   131  	return r.Name + obj.Extension
   132  }
   133  
   134  /////////////
   135  // Records //
   136  /////////////
   137  
   138  // NewRecords creates new instance of Records struct and allocates n places for
   139  // the actual Record's
   140  func NewRecords(n int) *Records {
   141  	return &Records{
   142  		arr:  make([]*Record, 0, n),
   143  		m:    make(map[string]*Record, 100),
   144  		dups: make(map[string]struct{}, 10),
   145  	}
   146  }
   147  
   148  func (r *Records) Drain() {
   149  	r.Lock()
   150  	r.arr = nil
   151  	r.m = nil
   152  	r.dups = nil
   153  	r.Unlock()
   154  }
   155  
   156  func (r *Records) Insert(records ...*Record) {
   157  	r.Lock()
   158  	for _, record := range records {
   159  		// Checking if record is already registered. If that is the case we need
   160  		// to merge extensions (files with same names but different extensions
   161  		// should be in single record). Otherwise just add new record.
   162  		if existingRecord, ok := r.m[record.Name]; ok {
   163  			existingRecord.mergeObjects(record)
   164  		} else {
   165  			r.arr = append(r.arr, record)
   166  			r.m[record.Name] = record
   167  		}
   168  
   169  		r.totalObjectCount += len(record.Objects)
   170  	}
   171  	r.Unlock()
   172  }
   173  
   174  func (r *Records) DeleteDup(name, ext string) {
   175  	debug.Assert(r.Exists(name, ext), "record: "+name+", "+ext)
   176  	r.Lock()
   177  	if record, ok := r.m[name]; ok {
   178  		if record.delete(ext) {
   179  			r.totalObjectCount--
   180  		}
   181  	}
   182  	r.dups[name+ext] = struct{}{}
   183  	r.Unlock()
   184  }
   185  
   186  // NOTE: must be done under lock
   187  func (r *Records) Find(name string) (record *Record, exists bool) {
   188  	record, exists = r.m[name]
   189  	return
   190  }
   191  
   192  func (r *Records) Exists(name, ext string) (exists bool) {
   193  	r.RLock()
   194  	if record, ok := r.m[name]; ok {
   195  		exists = record.exists(ext)
   196  		if !exists {
   197  			_, exists = r.dups[name+ext]
   198  		}
   199  	}
   200  	r.RUnlock()
   201  	return
   202  }
   203  
   204  func (r *Records) merge(records *Records) {
   205  	r.Insert(records.arr...)
   206  }
   207  
   208  func (r *Records) All() []*Record {
   209  	return r.arr
   210  }
   211  
   212  func (r *Records) Slice(start, end int) *Records {
   213  	return &Records{
   214  		arr: r.arr[start:end],
   215  	}
   216  }
   217  
   218  func (r *Records) Len() int {
   219  	return len(r.arr)
   220  }
   221  
   222  func (r *Records) Swap(i, j int) { r.arr[i], r.arr[j] = r.arr[j], r.arr[i] }
   223  
   224  func (r *Records) Less(i, j int, keyType string) (bool, error) {
   225  	lhs, rhs := r.arr[i].Key, r.arr[j].Key
   226  	if lhs == nil {
   227  		return false, errors.Errorf("key is missing for %q", r.arr[i].Name)
   228  	} else if rhs == nil {
   229  		return false, errors.Errorf("key is missing for %q", r.arr[j].Name)
   230  	}
   231  
   232  	switch keyType {
   233  	case ContentKeyInt:
   234  		ilhs, lok := lhs.(int64)
   235  		irhs, rok := rhs.(int64)
   236  		if lok && rok {
   237  			return ilhs < irhs, nil
   238  		}
   239  		// (motivation: javascript does not support int64 type)
   240  		if !lok {
   241  			ilhs = int64(lhs.(float64))
   242  		} else {
   243  			irhs = int64(rhs.(float64))
   244  		}
   245  		return ilhs < irhs, nil
   246  	case ContentKeyFloat:
   247  		flhs, lok := lhs.(float64)
   248  		frhs, rok := rhs.(float64)
   249  		debug.Assert(lok, lhs)
   250  		debug.Assert(rok, rhs)
   251  		return flhs < frhs, nil
   252  	case ContentKeyString:
   253  		slhs, lok := lhs.(string)
   254  		srhs, rok := rhs.(string)
   255  		debug.Assert(lok, lhs)
   256  		debug.Assert(rok, rhs)
   257  		return slhs < srhs, nil
   258  	}
   259  
   260  	debug.Assertf(false, "lhs: %v, rhs: %v, arr[i]: %v, arr[j]: %v", lhs, rhs, r.arr[i], r.arr[j])
   261  	return false, nil
   262  }
   263  
   264  func (r *Records) TotalObjectCount() int {
   265  	return r.totalObjectCount
   266  }
   267  
   268  func (r *Records) RecordMemorySize() (size uint64) {
   269  	r.Lock()
   270  	defer r.Unlock()
   271  
   272  	var maxSize uint64
   273  	for _, record := range r.arr {
   274  		size = uint64(unsafe.Sizeof(*record))
   275  		size += uint64(len(record.DaemonID))
   276  		size += uint64(len(record.Name))
   277  		size += uint64(unsafe.Sizeof(record.Key))
   278  		maxSize = max(maxSize, size)
   279  
   280  		// If there is record which has at least 1 record object we should get
   281  		// the estimate of it and return the size. Some records might not have
   282  		// at least 1 record object because there were duplicated and in
   283  		// consequence were removed from the record.
   284  		if len(record.Objects) > 0 {
   285  			size += (uint64(unsafe.Sizeof(record.Objects)) +
   286  				uint64(len(record.Objects[0].Extension)) +
   287  				uint64(len(record.Objects[0].ContentPath)) +
   288  				uint64(len(record.Objects[0].ObjectFileType)) +
   289  				uint64(len(record.Objects[0].StoreType))) * uint64(len(record.Objects))
   290  			return size
   291  		}
   292  	}
   293  	return maxSize
   294  }
   295  
// MarshalJSON and UnmarshalJSON make *Records satisfy json.Marshaler and
// json.Unmarshaler (see the interface guard at the top of the file). Both panic
// unconditionally: Records is not to be (de)serialized as JSON.
func (*Records) MarshalJSON() ([]byte, error) { panic("should not be used") }
func (*Records) UnmarshalJSON([]byte) error   { panic("should not be used") }