github.com/siglens/siglens@v0.0.0-20240328180423-f7ce9ae441ed/pkg/segment/reader/segread/multicolreader.go (about) 1 /* 2 Copyright 2023. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package segread 18 19 import ( 20 "errors" 21 "fmt" 22 "os" 23 "sort" 24 25 "github.com/cespare/xxhash" 26 "github.com/siglens/siglens/pkg/blob" 27 "github.com/siglens/siglens/pkg/common/fileutils" 28 "github.com/siglens/siglens/pkg/config" 29 "github.com/siglens/siglens/pkg/segment/structs" 30 "github.com/siglens/siglens/pkg/segment/utils" 31 "github.com/siglens/siglens/pkg/segment/writer" 32 toputils "github.com/siglens/siglens/pkg/utils" 33 34 log "github.com/sirupsen/logrus" 35 ) 36 37 /* 38 Defines holder struct and functions to construct & manage SegmentFileReaders 39 across multiple columns 40 */ 41 type MultiColSegmentReader struct { 42 allFileReaders []*SegmentFileReader 43 allColsReverseIndex map[string]int 44 timeStampKey string // timestamp key 45 segKey string // segment key 46 timeReader *TimeRangeReader 47 48 AllColums []*ColumnInfo 49 allColInfoReverseIndex map[string]*ColumnInfo 50 } 51 52 type ColumnInfo struct { 53 ColumnName string 54 count int 55 } 56 57 /* 58 Defines holder struct and functions to construct & manage SegmentFileReaders 59 across multiple columns 60 */ 61 type SharedMultiColReaders struct { 62 MultiColReaders []*MultiColSegmentReader 63 allFDs map[string]*os.File // all fds shared across MultiColSegmentReaders 64 allInUseFiles []string // all files that need to be released by blob 65 numReaders int 66 numOpenFDs int64 67 } 68 69 /* 70 Initialize a new MultipleColumnSegmentReader. This can be used to load & read any number of columns at once across any blocks 71 72 Caller is responsible for calling .CloseAll() to close all the fds. 73 74 Can also be used to get the timestamp for any arbitrary record in the Segment 75 */ 76 func initNewMultiColumnReader(segKey string, colFDs map[string]*os.File, blockMetadata map[uint16]*structs.BlockMetadataHolder, 77 blockSummaries []*structs.BlockSummary, qid uint64) (*MultiColSegmentReader, error) { 78 79 readCols := make([]*ColumnInfo, 0) 80 readColsReverseIndex := make(map[string]*ColumnInfo) 81 colRevserseIndex := make(map[string]int) 82 allFileReaders := make([]*SegmentFileReader, len(colFDs)) 83 84 tsKey := config.GetTimeStampKey() 85 var idx int = 0 86 retVal := &MultiColSegmentReader{ 87 allFileReaders: allFileReaders, 88 allColsReverseIndex: colRevserseIndex, 89 timeStampKey: tsKey, 90 segKey: segKey, 91 } 92 93 for colName, colFD := range colFDs { 94 if colName == tsKey { 95 blkRecCount := make(map[uint16]uint16) 96 for blkIdx, blkSum := range blockSummaries { 97 blkRecCount[uint16(blkIdx)] = blkSum.RecCount 98 } 99 currTimeReader, err := InitNewTimeReaderWithFD(colFD, tsKey, blockMetadata, blkRecCount, qid) 100 if err != nil { 101 log.Errorf("qid=%d, initNewMultiColumnReader: failed initialize timestamp reader for using timestamp key %s and segkey %s. Error: %v", 102 qid, tsKey, segKey, err) 103 } else { 104 retVal.timeReader = currTimeReader 105 } 106 continue 107 } 108 109 segReader, err := InitNewSegFileReader(colFD, colName, blockMetadata, qid, blockSummaries) 110 if err != nil { 111 log.Errorf("qid=%d, initNewMultiColumnReader: failed initialize segfile reader for column %s Using file %s. Error: %v", 112 qid, colName, colFD.Name(), err) 113 continue 114 } 115 allFileReaders[idx] = segReader 116 colRevserseIndex[colName] = idx 117 currCol := &ColumnInfo{ColumnName: colName, count: 0} 118 readCols = append(readCols, currCol) 119 readColsReverseIndex[colName] = currCol 120 idx++ 121 } 122 123 retVal.allFileReaders = retVal.allFileReaders[:idx] 124 retVal.AllColums = readCols[:idx] 125 retVal.allColInfoReverseIndex = readColsReverseIndex 126 return retVal, nil 127 } 128 129 /* 130 Inializes N MultiColumnSegmentReaders, each of which share the same file descriptor. 131 132 Only columns that exist will be loaded, not guaranteed to load all columnns in colNames 133 It is up to the caller to close the open FDs using .Close() 134 */ 135 func InitSharedMultiColumnReaders(segKey string, colNames map[string]bool, blockMetadata map[uint16]*structs.BlockMetadataHolder, 136 blockSummaries []*structs.BlockSummary, numReaders int, qid uint64) (*SharedMultiColReaders, error) { 137 allInUseSegSetFiles := make([]string, 0) 138 139 maxOpenFds := int64(0) 140 for cname := range colNames { 141 if cname != "*" { 142 maxOpenFds += 1 143 } 144 } 145 maxOpenFds += 2 + 1 // for time rollup files 146 allFDs := make(map[string]*os.File) 147 sharedReader := &SharedMultiColReaders{ 148 MultiColReaders: make([]*MultiColSegmentReader, numReaders), 149 numReaders: numReaders, 150 numOpenFDs: maxOpenFds, 151 allFDs: allFDs, 152 } 153 154 err := fileutils.GLOBAL_FD_LIMITER.TryAcquireWithBackoff(maxOpenFds, 10, fmt.Sprintf("InitSharedMultiColumnReaders.qid=%d", qid)) 155 if err != nil { 156 log.Errorf("qid=%d, Failed to acquire resources to be able to open %+v FDs. Error: %+v", qid, maxOpenFds, err) 157 return sharedReader, err 158 } 159 bulkDownloadFiles := make(map[string]string) 160 var fName string 161 for cname := range colNames { 162 if cname == "" { 163 return nil, fmt.Errorf("InitSharedMultiColumnReaders: unknown seg set col") 164 } else if cname == "*" { 165 continue 166 } else { 167 fName = fmt.Sprintf("%v_%v.csg", segKey, xxhash.Sum64String(cname)) 168 } 169 bulkDownloadFiles[fName] = cname 170 } 171 err = blob.BulkDownloadSegmentBlob(bulkDownloadFiles, true) 172 if err != nil { 173 log.Errorf("qid=%d, initNewMultiColumnReader failed to bulk download seg files. err=%v", qid, err) 174 return nil, err 175 } 176 177 for fName, colName := range bulkDownloadFiles { 178 fName := fName 179 currFd, err := os.OpenFile(fName, os.O_RDONLY, 0644) 180 if err != nil { 181 log.Errorf("qid=%d, initNewMultiColumnReader: failed to open file %s for columns %s. Error: %v.", 182 qid, fName, colName, err) 183 continue 184 } 185 sharedReader.allFDs[colName] = currFd 186 allInUseSegSetFiles = append(allInUseSegSetFiles, fName) 187 } 188 189 for i := 0; i < numReaders; i++ { 190 currReader, err := initNewMultiColumnReader(segKey, sharedReader.allFDs, blockMetadata, blockSummaries, qid) 191 if err != nil { 192 sharedReader.Close() 193 err := blob.SetSegSetFilesAsNotInUse(allInUseSegSetFiles) 194 if err != nil { 195 log.Errorf("qid=%d, Failed to release needed segment files from local storage %+v! Err: %+v", qid, allInUseSegSetFiles, err) 196 } 197 return sharedReader, err 198 } 199 sharedReader.MultiColReaders[i] = currReader 200 } 201 sharedReader.allInUseFiles = allInUseSegSetFiles 202 return sharedReader, nil 203 } 204 205 // Returns all buffers to the pools, closes all FDs shared across multi readers, and updates global semaphore 206 func (scr *SharedMultiColReaders) Close() { 207 208 for _, multiReader := range scr.MultiColReaders { 209 multiReader.returnBuffers() 210 } 211 for _, reader := range scr.allFDs { 212 if reader != nil { 213 err := reader.Close() 214 if err != nil { 215 log.Errorf("Failed to close fd! err: %+v", err) 216 } 217 } 218 } 219 err := blob.SetSegSetFilesAsNotInUse(scr.allInUseFiles) 220 if err != nil { 221 log.Errorf("Failed to release needed segment files from local storage %+v! Err: %+v", scr.allInUseFiles, err) 222 } 223 fileutils.GLOBAL_FD_LIMITER.Release(scr.numOpenFDs) 224 } 225 226 func (mcsr *MultiColSegmentReader) GetTimeStampForRecord(blockNum uint16, recordNum uint16, qid uint64) (uint64, error) { 227 228 if mcsr.timeReader == nil { 229 log.Errorf("qid=%v, Tried to get timestamp using a multi reader wihout an initialized timeReader", qid) 230 return 0, errors.New("uninitialized timerange reader") 231 } 232 return mcsr.timeReader.GetTimeStampForRecord(blockNum, recordNum, qid) 233 } 234 235 func (mcsr *MultiColSegmentReader) GetAllTimeStampsForBlock(blockNum uint16) ([]uint64, error) { 236 237 if mcsr.timeReader == nil { 238 log.Errorf("Tried to get all block timestamps using a multi reader wihout an initialized timeReader") 239 return nil, errors.New("uninitialized timerange reader") 240 } 241 return mcsr.timeReader.GetAllTimeStampsForBlock(blockNum) 242 } 243 244 // Reads the raw value and returns the []byte in TLV format (type-[length]-value encoding) 245 func (mcsr *MultiColSegmentReader) ReadRawRecordFromColumnFile(col string, blockNum uint16, recordNum uint16, qid uint64) ([]byte, error) { 246 247 if col == mcsr.timeStampKey { 248 ts, err := mcsr.GetTimeStampForRecord(blockNum, recordNum, qid) 249 if err != nil { 250 return nil, err 251 } 252 retVal := make([]byte, 9) 253 copy(retVal[0:], utils.VALTYPE_ENC_UINT64[:]) 254 copy(retVal[1:], toputils.Uint64ToBytesLittleEndian(ts)) 255 return retVal, nil 256 } 257 keyIndex, ok := mcsr.allColsReverseIndex[col] 258 if !ok { 259 // Debug to avoid log flood for when the column does not exist 260 log.Debugf("ReadRawRecordFromColumnFile: failed to find column %s in muli col reader. All cols %+v", col, mcsr.allColsReverseIndex) 261 return nil, errors.New("column not found in MultipleColumnSegmentReader") 262 } 263 264 return mcsr.allFileReaders[keyIndex].ReadRecordFromBlock(blockNum, recordNum) 265 } 266 267 // Reads the request value and converts it to a *utils.CValueEnclosure 268 func (mcsr *MultiColSegmentReader) ExtractValueFromColumnFile(col string, blockNum uint16, recordNum uint16, 269 qid uint64) (*utils.CValueEnclosure, error) { 270 if col == mcsr.timeStampKey { 271 ts, err := mcsr.GetTimeStampForRecord(blockNum, recordNum, qid) 272 if err != nil { 273 return &utils.CValueEnclosure{}, err 274 } 275 276 return &utils.CValueEnclosure{ 277 Dtype: utils.SS_DT_UNSIGNED_NUM, 278 CVal: ts, 279 }, nil 280 } 281 282 rawVal, err := mcsr.ReadRawRecordFromColumnFile(col, blockNum, recordNum, qid) 283 if err != nil { 284 return &utils.CValueEnclosure{ 285 Dtype: utils.SS_DT_BACKFILL, 286 CVal: nil, 287 }, err 288 } 289 290 cval, _, err := writer.GetCvalFromRec(rawVal, qid) 291 return &cval, err 292 } 293 294 func (mcsr *MultiColSegmentReader) returnBuffers() { 295 296 if mcsr.allFileReaders != nil { 297 for _, reader := range mcsr.allFileReaders { 298 if reader != nil { 299 reader.returnBuffers() 300 } 301 } 302 } 303 if mcsr.timeReader != nil { 304 mcsr.timeReader.returnBuffers() 305 } 306 } 307 308 func (mcsr *MultiColSegmentReader) IncrementColumnUsage(colName string) { 309 mcsr.allColInfoReverseIndex[colName].count++ 310 } 311 312 // reorders mcsr.AllColumns to be ordered on usage 313 func (mcsr *MultiColSegmentReader) ReorderColumnUsage() { 314 sort.Slice(mcsr.AllColums, func(i, j int) bool { 315 return mcsr.AllColums[i].count > mcsr.AllColums[j].count 316 }) 317 } 318 319 func (mcsr *MultiColSegmentReader) IsBlkDictEncoded(cname string, 320 blkNum uint16) (bool, error) { 321 322 // reads the csg file and decides whether this particular block is encoded via dictionary encoding 323 // or raw csg encoding, and returns if it is dict-enc, along with the map of each dict-key => recNums pairing 324 325 keyIndex, ok := mcsr.allColsReverseIndex[cname] 326 if !ok { 327 // Debug to avoid log flood for when the column does not exist 328 log.Debugf("IsBlkDictEncoded: failed to find column %s in muli col reader. All cols %+v", cname, mcsr.allColsReverseIndex) 329 return false, errors.New("column not found in MultipleColumnSegmentReader") 330 } 331 332 return mcsr.allFileReaders[keyIndex].IsBlkDictEncoded(blkNum) 333 } 334 335 /* 336 parameters: 337 338 results: map of recNum -> colName -> colValue to be filled in. 339 col: columnName 340 blockNum: blocknum to search for 341 rnMap: map of recordNumbers to for which to find the colValue for the given colname 342 343 returns: 344 345 bool: if we are able to find the requested column in dict encoding 346 */ 347 func (mcsr *MultiColSegmentReader) GetDictEncCvalsFromColFile(results map[uint16]map[string]interface{}, 348 col string, blockNum uint16, orderedRecNums []uint16, qid uint64) bool { 349 350 keyIndex, ok := mcsr.allColsReverseIndex[col] 351 if !ok { 352 return false 353 } 354 355 return mcsr.allFileReaders[keyIndex].GetDictEncCvalsFromColFile(results, blockNum, orderedRecNums) 356 } 357 358 func (mcsr *MultiColSegmentReader) ApplySearchToMatchFilterDictCsg(match *structs.MatchFilter, 359 bsh *structs.BlockSearchHelper, cname string) (bool, error) { 360 361 keyIndex, ok := mcsr.allColsReverseIndex[cname] 362 if !ok { 363 return false, errors.New("could not find sfr for cname") 364 } 365 366 return mcsr.allFileReaders[keyIndex].ApplySearchToMatchFilterDictCsg(match, bsh) 367 } 368 369 func (mcsr *MultiColSegmentReader) ApplySearchToExpressionFilterDictCsg(qValDte *utils.DtypeEnclosure, 370 fop utils.FilterOperator, isRegexSearch bool, bsh *structs.BlockSearchHelper, 371 cname string) (bool, error) { 372 373 keyIndex, ok := mcsr.allColsReverseIndex[cname] 374 if !ok { 375 return false, errors.New("could not find sfr for cname") 376 } 377 378 return mcsr.allFileReaders[keyIndex].ApplySearchToExpressionFilterDictCsg(qValDte, 379 fop, isRegexSearch, bsh) 380 } 381 382 func (mcsr *MultiColSegmentReader) IsColPresent(cname string) bool { 383 _, ok := mcsr.allColsReverseIndex[cname] 384 return ok 385 }