github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/shard/record.go (about) 1 // Package shard provides Extract(shard), Create(shard), and associated methods 2 // across all suppported archival formats (see cmn/archive/mime.go) 3 /* 4 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 5 */ 6 package shard 7 8 import ( 9 "encoding/json" 10 "sync" 11 "unsafe" 12 13 "github.com/NVIDIA/aistore/cmn/debug" 14 "github.com/pkg/errors" 15 ) 16 17 // 18 // NOTE: changes in this source MAY require re-running `msgp` code generation - see docs/msgp.md for details. 19 // 20 // NOTE: Since tags for unexported fields are not supported before generation 21 // change `Records.arr` to `Records.Arr` and then rename it back. 22 // 23 24 // interface guard 25 var ( 26 _ json.Marshaler = (*Records)(nil) 27 _ json.Unmarshaler = (*Records)(nil) 28 ) 29 30 const ( 31 // Values are small to save memory. 32 OffsetStoreType = "o" 33 SGLStoreType = "s" 34 DiskStoreType = "d" 35 ) 36 37 type ( 38 // RecordObj describes single object of record. Objects inside a single record 39 // have different extensions. 40 RecordObj struct { 41 // Can represent, one of the following: 42 // * Shard name - in case offset is used. 43 // * Key for extractCreator's RecordContents - records stored in SGLs. 44 // * Location (full path) on disk where extracted record has been placed. 45 // 46 // To get path for given object you need to use `FullContentPath` method. 47 ContentPath string `msg:"p" json:"p"` 48 49 // Filesystem file type where the shard is stored - used to determine 50 // location for content path when asking filesystem. 51 ObjectFileType string `msg:"ft" json:"ft"` 52 53 // Determines where the record has been stored, can be either: OffsetStoreType, 54 // SGLStoreType, DiskStoreType. 55 StoreType string `msg:"st" json:"st"` 56 57 // If set, determines the offset in shard file where the record begins. 58 Offset int64 `msg:"f,omitempty" json:"f,string,omitempty"` 59 MetadataSize int64 `msg:"ms" json:"ms,string"` 60 Size int64 `msg:"s" json:"s,string"` 61 Extension string `msg:"e" json:"e"` 62 } 63 64 // Record represents the metadata corresponding to a single file from a shard. 65 Record struct { 66 Key any `msg:"k" json:"k"` // Used to determine the sorting order. 67 Name string `msg:"n" json:"n"` // Name that uniquely identifies record across all shards. 68 DaemonID string `msg:"d" json:"d"` // ID of the target which maintains the contents for this record. 69 // All objects associated with given record. Record can be composed of 70 // multiple objects which have the same name but different extension. 71 Objects []*RecordObj `msg:"o" json:"o"` 72 } 73 74 // Records abstract array of records. It safe to be used concurrently. 75 Records struct { 76 arr []*Record `msg:"a"` 77 m map[string]*Record `msg:"-"` 78 dups map[string]struct{} `msg:"-"` // contains duplicate object names, if any 79 totalObjectCount int `msg:"-"` // total number of objects in all records (dups are removed so not counted) 80 sync.RWMutex `msg:"-"` 81 } 82 ) 83 84 //////////// 85 // Record // 86 //////////// 87 88 // Merges two records into single one. It is required for records to have the 89 // same Name. Since records should only differ on objects this is the thing that 90 // is actually merged. 91 func (r *Record) mergeObjects(other *Record) { 92 debug.Assert(r.Name == other.Name, r.Name+" vs "+other.Name) 93 if r.Key == nil && other.Key != nil { 94 r.Key = other.Key 95 } 96 r.Objects = append(r.Objects, other.Objects...) 97 } 98 99 func (r *Record) find(ext string) int { 100 for idx, obj := range r.Objects { 101 if obj.Extension == ext { 102 return idx 103 } 104 } 105 return -1 106 } 107 108 func (r *Record) exists(ext string) bool { 109 return r.find(ext) >= 0 110 } 111 112 func (r *Record) delete(ext string) (deleted bool) { 113 foundIdx := r.find(ext) 114 if foundIdx >= 0 { 115 // NOTE: We are required to preserve the order. 116 r.Objects = append(r.Objects[:foundIdx], r.Objects[foundIdx+1:]...) 117 return true 118 } 119 return false 120 } 121 122 func (r *Record) TotalSize() int64 { 123 size := int64(0) 124 for _, obj := range r.Objects { 125 size += obj.Size 126 } 127 return size 128 } 129 130 func (r *Record) MakeUniqueName(obj *RecordObj) string { 131 return r.Name + obj.Extension 132 } 133 134 ///////////// 135 // Records // 136 ///////////// 137 138 // NewRecords creates new instance of Records struct and allocates n places for 139 // the actual Record's 140 func NewRecords(n int) *Records { 141 return &Records{ 142 arr: make([]*Record, 0, n), 143 m: make(map[string]*Record, 100), 144 dups: make(map[string]struct{}, 10), 145 } 146 } 147 148 func (r *Records) Drain() { 149 r.Lock() 150 r.arr = nil 151 r.m = nil 152 r.dups = nil 153 r.Unlock() 154 } 155 156 func (r *Records) Insert(records ...*Record) { 157 r.Lock() 158 for _, record := range records { 159 // Checking if record is already registered. If that is the case we need 160 // to merge extensions (files with same names but different extensions 161 // should be in single record). Otherwise just add new record. 162 if existingRecord, ok := r.m[record.Name]; ok { 163 existingRecord.mergeObjects(record) 164 } else { 165 r.arr = append(r.arr, record) 166 r.m[record.Name] = record 167 } 168 169 r.totalObjectCount += len(record.Objects) 170 } 171 r.Unlock() 172 } 173 174 func (r *Records) DeleteDup(name, ext string) { 175 debug.Assert(r.Exists(name, ext), "record: "+name+", "+ext) 176 r.Lock() 177 if record, ok := r.m[name]; ok { 178 if record.delete(ext) { 179 r.totalObjectCount-- 180 } 181 } 182 r.dups[name+ext] = struct{}{} 183 r.Unlock() 184 } 185 186 // NOTE: must be done under lock 187 func (r *Records) Find(name string) (record *Record, exists bool) { 188 record, exists = r.m[name] 189 return 190 } 191 192 func (r *Records) Exists(name, ext string) (exists bool) { 193 r.RLock() 194 if record, ok := r.m[name]; ok { 195 exists = record.exists(ext) 196 if !exists { 197 _, exists = r.dups[name+ext] 198 } 199 } 200 r.RUnlock() 201 return 202 } 203 204 func (r *Records) merge(records *Records) { 205 r.Insert(records.arr...) 206 } 207 208 func (r *Records) All() []*Record { 209 return r.arr 210 } 211 212 func (r *Records) Slice(start, end int) *Records { 213 return &Records{ 214 arr: r.arr[start:end], 215 } 216 } 217 218 func (r *Records) Len() int { 219 return len(r.arr) 220 } 221 222 func (r *Records) Swap(i, j int) { r.arr[i], r.arr[j] = r.arr[j], r.arr[i] } 223 224 func (r *Records) Less(i, j int, keyType string) (bool, error) { 225 lhs, rhs := r.arr[i].Key, r.arr[j].Key 226 if lhs == nil { 227 return false, errors.Errorf("key is missing for %q", r.arr[i].Name) 228 } else if rhs == nil { 229 return false, errors.Errorf("key is missing for %q", r.arr[j].Name) 230 } 231 232 switch keyType { 233 case ContentKeyInt: 234 ilhs, lok := lhs.(int64) 235 irhs, rok := rhs.(int64) 236 if lok && rok { 237 return ilhs < irhs, nil 238 } 239 // (motivation: javascript does not support int64 type) 240 if !lok { 241 ilhs = int64(lhs.(float64)) 242 } else { 243 irhs = int64(rhs.(float64)) 244 } 245 return ilhs < irhs, nil 246 case ContentKeyFloat: 247 flhs, lok := lhs.(float64) 248 frhs, rok := rhs.(float64) 249 debug.Assert(lok, lhs) 250 debug.Assert(rok, rhs) 251 return flhs < frhs, nil 252 case ContentKeyString: 253 slhs, lok := lhs.(string) 254 srhs, rok := rhs.(string) 255 debug.Assert(lok, lhs) 256 debug.Assert(rok, rhs) 257 return slhs < srhs, nil 258 } 259 260 debug.Assertf(false, "lhs: %v, rhs: %v, arr[i]: %v, arr[j]: %v", lhs, rhs, r.arr[i], r.arr[j]) 261 return false, nil 262 } 263 264 func (r *Records) TotalObjectCount() int { 265 return r.totalObjectCount 266 } 267 268 func (r *Records) RecordMemorySize() (size uint64) { 269 r.Lock() 270 defer r.Unlock() 271 272 var maxSize uint64 273 for _, record := range r.arr { 274 size = uint64(unsafe.Sizeof(*record)) 275 size += uint64(len(record.DaemonID)) 276 size += uint64(len(record.Name)) 277 size += uint64(unsafe.Sizeof(record.Key)) 278 maxSize = max(maxSize, size) 279 280 // If there is record which has at least 1 record object we should get 281 // the estimate of it and return the size. Some records might not have 282 // at least 1 record object because there were duplicated and in 283 // consequence were removed from the record. 284 if len(record.Objects) > 0 { 285 size += (uint64(unsafe.Sizeof(record.Objects)) + 286 uint64(len(record.Objects[0].Extension)) + 287 uint64(len(record.Objects[0].ContentPath)) + 288 uint64(len(record.Objects[0].ObjectFileType)) + 289 uint64(len(record.Objects[0].StoreType))) * uint64(len(record.Objects)) 290 return size 291 } 292 } 293 return maxSize 294 } 295 296 func (*Records) MarshalJSON() ([]byte, error) { panic("should not be used") } 297 func (*Records) UnmarshalJSON([]byte) error { panic("should not be used") }