github.com/siglens/siglens@v0.0.0-20240328180423-f7ce9ae441ed/pkg/segment/reader/segread/multicolreader.go (about)

     1  /*
     2  Copyright 2023.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package segread
    18  
    19  import (
    20  	"errors"
    21  	"fmt"
    22  	"os"
    23  	"sort"
    24  
    25  	"github.com/cespare/xxhash"
    26  	"github.com/siglens/siglens/pkg/blob"
    27  	"github.com/siglens/siglens/pkg/common/fileutils"
    28  	"github.com/siglens/siglens/pkg/config"
    29  	"github.com/siglens/siglens/pkg/segment/structs"
    30  	"github.com/siglens/siglens/pkg/segment/utils"
    31  	"github.com/siglens/siglens/pkg/segment/writer"
    32  	toputils "github.com/siglens/siglens/pkg/utils"
    33  
    34  	log "github.com/sirupsen/logrus"
    35  )
    36  
/*
MultiColSegmentReader holds the SegmentFileReaders of several columns of one
segment, plus the lookup tables needed to find a reader or its usage
statistics by column name.
*/
type MultiColSegmentReader struct {
	// allFileReaders holds one reader per non-timestamp column that was
	// initialized successfully.
	allFileReaders      []*SegmentFileReader
	// allColsReverseIndex maps a column name to its index in allFileReaders.
	allColsReverseIndex map[string]int
	timeStampKey        string // timestamp key
	segKey              string // segment key
	// timeReader serves timestamp lookups; it stays nil when the timestamp
	// column file was not supplied or its reader failed to initialize.
	timeReader          *TimeRangeReader

	// AllColums lists the usage info of every column in allFileReaders;
	// ReorderColumnUsage sorts it by descending usage count.
	AllColums              []*ColumnInfo
	// allColInfoReverseIndex maps a column name to its entry in AllColums.
	allColInfoReverseIndex map[string]*ColumnInfo
}
    51  
// ColumnInfo pairs a column name with a usage counter.
type ColumnInfo struct {
	ColumnName string
	// count is bumped by MultiColSegmentReader.IncrementColumnUsage and is the
	// sort key used by MultiColSegmentReader.ReorderColumnUsage.
	count      int
}
    56  
/*
SharedMultiColReaders owns one set of open column file descriptors that is
shared by several MultiColSegmentReaders, together with the bookkeeping that
Close needs to give those resources back.
*/
type SharedMultiColReaders struct {
	// MultiColReaders are the readers built on top of the shared FDs.
	MultiColReaders []*MultiColSegmentReader
	allFDs          map[string]*os.File // all fds shared across MultiColSegmentReaders
	allInUseFiles   []string            // all files that need to be released by blob
	// numReaders is the number of readers requested at construction time.
	numReaders      int
	// numOpenFDs is the amount requested from fileutils.GLOBAL_FD_LIMITER;
	// Close releases the same amount.
	numOpenFDs      int64
}
    68  
    69  /*
    70  Initialize a new MultipleColumnSegmentReader. This can be used to load & read any number of columns at once across any blocks
    71  
    72  Caller is responsible for calling .CloseAll() to close all the fds.
    73  
    74  Can also be used to get the timestamp for any arbitrary record in the Segment
    75  */
    76  func initNewMultiColumnReader(segKey string, colFDs map[string]*os.File, blockMetadata map[uint16]*structs.BlockMetadataHolder,
    77  	blockSummaries []*structs.BlockSummary, qid uint64) (*MultiColSegmentReader, error) {
    78  
    79  	readCols := make([]*ColumnInfo, 0)
    80  	readColsReverseIndex := make(map[string]*ColumnInfo)
    81  	colRevserseIndex := make(map[string]int)
    82  	allFileReaders := make([]*SegmentFileReader, len(colFDs))
    83  
    84  	tsKey := config.GetTimeStampKey()
    85  	var idx int = 0
    86  	retVal := &MultiColSegmentReader{
    87  		allFileReaders:      allFileReaders,
    88  		allColsReverseIndex: colRevserseIndex,
    89  		timeStampKey:        tsKey,
    90  		segKey:              segKey,
    91  	}
    92  
    93  	for colName, colFD := range colFDs {
    94  		if colName == tsKey {
    95  			blkRecCount := make(map[uint16]uint16)
    96  			for blkIdx, blkSum := range blockSummaries {
    97  				blkRecCount[uint16(blkIdx)] = blkSum.RecCount
    98  			}
    99  			currTimeReader, err := InitNewTimeReaderWithFD(colFD, tsKey, blockMetadata, blkRecCount, qid)
   100  			if err != nil {
   101  				log.Errorf("qid=%d, initNewMultiColumnReader: failed initialize timestamp reader for using timestamp key %s and segkey %s. Error: %v",
   102  					qid, tsKey, segKey, err)
   103  			} else {
   104  				retVal.timeReader = currTimeReader
   105  			}
   106  			continue
   107  		}
   108  
   109  		segReader, err := InitNewSegFileReader(colFD, colName, blockMetadata, qid, blockSummaries)
   110  		if err != nil {
   111  			log.Errorf("qid=%d, initNewMultiColumnReader: failed initialize segfile reader for column %s Using file %s. Error: %v",
   112  				qid, colName, colFD.Name(), err)
   113  			continue
   114  		}
   115  		allFileReaders[idx] = segReader
   116  		colRevserseIndex[colName] = idx
   117  		currCol := &ColumnInfo{ColumnName: colName, count: 0}
   118  		readCols = append(readCols, currCol)
   119  		readColsReverseIndex[colName] = currCol
   120  		idx++
   121  	}
   122  
   123  	retVal.allFileReaders = retVal.allFileReaders[:idx]
   124  	retVal.AllColums = readCols[:idx]
   125  	retVal.allColInfoReverseIndex = readColsReverseIndex
   126  	return retVal, nil
   127  }
   128  
   129  /*
   130  Inializes N MultiColumnSegmentReaders, each of which share the same file descriptor.
   131  
   132  Only columns that exist will be loaded, not guaranteed to load all columnns in colNames
   133  It is up to the caller to close the open FDs using .Close()
   134  */
   135  func InitSharedMultiColumnReaders(segKey string, colNames map[string]bool, blockMetadata map[uint16]*structs.BlockMetadataHolder,
   136  	blockSummaries []*structs.BlockSummary, numReaders int, qid uint64) (*SharedMultiColReaders, error) {
   137  	allInUseSegSetFiles := make([]string, 0)
   138  
   139  	maxOpenFds := int64(0)
   140  	for cname := range colNames {
   141  		if cname != "*" {
   142  			maxOpenFds += 1
   143  		}
   144  	}
   145  	maxOpenFds += 2 + 1 // for time rollup files
   146  	allFDs := make(map[string]*os.File)
   147  	sharedReader := &SharedMultiColReaders{
   148  		MultiColReaders: make([]*MultiColSegmentReader, numReaders),
   149  		numReaders:      numReaders,
   150  		numOpenFDs:      maxOpenFds,
   151  		allFDs:          allFDs,
   152  	}
   153  
   154  	err := fileutils.GLOBAL_FD_LIMITER.TryAcquireWithBackoff(maxOpenFds, 10, fmt.Sprintf("InitSharedMultiColumnReaders.qid=%d", qid))
   155  	if err != nil {
   156  		log.Errorf("qid=%d, Failed to acquire resources to be able to open %+v FDs. Error: %+v", qid, maxOpenFds, err)
   157  		return sharedReader, err
   158  	}
   159  	bulkDownloadFiles := make(map[string]string)
   160  	var fName string
   161  	for cname := range colNames {
   162  		if cname == "" {
   163  			return nil, fmt.Errorf("InitSharedMultiColumnReaders: unknown seg set col")
   164  		} else if cname == "*" {
   165  			continue
   166  		} else {
   167  			fName = fmt.Sprintf("%v_%v.csg", segKey, xxhash.Sum64String(cname))
   168  		}
   169  		bulkDownloadFiles[fName] = cname
   170  	}
   171  	err = blob.BulkDownloadSegmentBlob(bulkDownloadFiles, true)
   172  	if err != nil {
   173  		log.Errorf("qid=%d, initNewMultiColumnReader failed to bulk download seg files. err=%v", qid, err)
   174  		return nil, err
   175  	}
   176  
   177  	for fName, colName := range bulkDownloadFiles {
   178  		fName := fName
   179  		currFd, err := os.OpenFile(fName, os.O_RDONLY, 0644)
   180  		if err != nil {
   181  			log.Errorf("qid=%d, initNewMultiColumnReader: failed to open file %s for columns %s. Error: %v.",
   182  				qid, fName, colName, err)
   183  			continue
   184  		}
   185  		sharedReader.allFDs[colName] = currFd
   186  		allInUseSegSetFiles = append(allInUseSegSetFiles, fName)
   187  	}
   188  
   189  	for i := 0; i < numReaders; i++ {
   190  		currReader, err := initNewMultiColumnReader(segKey, sharedReader.allFDs, blockMetadata, blockSummaries, qid)
   191  		if err != nil {
   192  			sharedReader.Close()
   193  			err := blob.SetSegSetFilesAsNotInUse(allInUseSegSetFiles)
   194  			if err != nil {
   195  				log.Errorf("qid=%d, Failed to release needed segment files from local storage %+v!  Err: %+v", qid, allInUseSegSetFiles, err)
   196  			}
   197  			return sharedReader, err
   198  		}
   199  		sharedReader.MultiColReaders[i] = currReader
   200  	}
   201  	sharedReader.allInUseFiles = allInUseSegSetFiles
   202  	return sharedReader, nil
   203  }
   204  
   205  // Returns all buffers to the pools, closes all FDs shared across multi readers, and updates global semaphore
   206  func (scr *SharedMultiColReaders) Close() {
   207  
   208  	for _, multiReader := range scr.MultiColReaders {
   209  		multiReader.returnBuffers()
   210  	}
   211  	for _, reader := range scr.allFDs {
   212  		if reader != nil {
   213  			err := reader.Close()
   214  			if err != nil {
   215  				log.Errorf("Failed to close fd! err: %+v", err)
   216  			}
   217  		}
   218  	}
   219  	err := blob.SetSegSetFilesAsNotInUse(scr.allInUseFiles)
   220  	if err != nil {
   221  		log.Errorf("Failed to release needed segment files from local storage %+v!  Err: %+v", scr.allInUseFiles, err)
   222  	}
   223  	fileutils.GLOBAL_FD_LIMITER.Release(scr.numOpenFDs)
   224  }
   225  
   226  func (mcsr *MultiColSegmentReader) GetTimeStampForRecord(blockNum uint16, recordNum uint16, qid uint64) (uint64, error) {
   227  
   228  	if mcsr.timeReader == nil {
   229  		log.Errorf("qid=%v, Tried to get timestamp using a multi reader wihout an initialized timeReader", qid)
   230  		return 0, errors.New("uninitialized timerange reader")
   231  	}
   232  	return mcsr.timeReader.GetTimeStampForRecord(blockNum, recordNum, qid)
   233  }
   234  
   235  func (mcsr *MultiColSegmentReader) GetAllTimeStampsForBlock(blockNum uint16) ([]uint64, error) {
   236  
   237  	if mcsr.timeReader == nil {
   238  		log.Errorf("Tried to get all block timestamps using a multi reader wihout an initialized timeReader")
   239  		return nil, errors.New("uninitialized timerange reader")
   240  	}
   241  	return mcsr.timeReader.GetAllTimeStampsForBlock(blockNum)
   242  }
   243  
   244  // Reads the raw value and returns the []byte in TLV format (type-[length]-value encoding)
   245  func (mcsr *MultiColSegmentReader) ReadRawRecordFromColumnFile(col string, blockNum uint16, recordNum uint16, qid uint64) ([]byte, error) {
   246  
   247  	if col == mcsr.timeStampKey {
   248  		ts, err := mcsr.GetTimeStampForRecord(blockNum, recordNum, qid)
   249  		if err != nil {
   250  			return nil, err
   251  		}
   252  		retVal := make([]byte, 9)
   253  		copy(retVal[0:], utils.VALTYPE_ENC_UINT64[:])
   254  		copy(retVal[1:], toputils.Uint64ToBytesLittleEndian(ts))
   255  		return retVal, nil
   256  	}
   257  	keyIndex, ok := mcsr.allColsReverseIndex[col]
   258  	if !ok {
   259  		// Debug to avoid log flood for when the column does not exist
   260  		log.Debugf("ReadRawRecordFromColumnFile: failed to find column %s in muli col reader. All cols %+v", col, mcsr.allColsReverseIndex)
   261  		return nil, errors.New("column not found in MultipleColumnSegmentReader")
   262  	}
   263  
   264  	return mcsr.allFileReaders[keyIndex].ReadRecordFromBlock(blockNum, recordNum)
   265  }
   266  
   267  // Reads the request value and converts it to a *utils.CValueEnclosure
   268  func (mcsr *MultiColSegmentReader) ExtractValueFromColumnFile(col string, blockNum uint16, recordNum uint16,
   269  	qid uint64) (*utils.CValueEnclosure, error) {
   270  	if col == mcsr.timeStampKey {
   271  		ts, err := mcsr.GetTimeStampForRecord(blockNum, recordNum, qid)
   272  		if err != nil {
   273  			return &utils.CValueEnclosure{}, err
   274  		}
   275  
   276  		return &utils.CValueEnclosure{
   277  			Dtype: utils.SS_DT_UNSIGNED_NUM,
   278  			CVal:  ts,
   279  		}, nil
   280  	}
   281  
   282  	rawVal, err := mcsr.ReadRawRecordFromColumnFile(col, blockNum, recordNum, qid)
   283  	if err != nil {
   284  		return &utils.CValueEnclosure{
   285  			Dtype: utils.SS_DT_BACKFILL,
   286  			CVal:  nil,
   287  		}, err
   288  	}
   289  
   290  	cval, _, err := writer.GetCvalFromRec(rawVal, qid)
   291  	return &cval, err
   292  }
   293  
   294  func (mcsr *MultiColSegmentReader) returnBuffers() {
   295  
   296  	if mcsr.allFileReaders != nil {
   297  		for _, reader := range mcsr.allFileReaders {
   298  			if reader != nil {
   299  				reader.returnBuffers()
   300  			}
   301  		}
   302  	}
   303  	if mcsr.timeReader != nil {
   304  		mcsr.timeReader.returnBuffers()
   305  	}
   306  }
   307  
   308  func (mcsr *MultiColSegmentReader) IncrementColumnUsage(colName string) {
   309  	mcsr.allColInfoReverseIndex[colName].count++
   310  }
   311  
   312  // reorders mcsr.AllColumns to be ordered on usage
   313  func (mcsr *MultiColSegmentReader) ReorderColumnUsage() {
   314  	sort.Slice(mcsr.AllColums, func(i, j int) bool {
   315  		return mcsr.AllColums[i].count > mcsr.AllColums[j].count
   316  	})
   317  }
   318  
   319  func (mcsr *MultiColSegmentReader) IsBlkDictEncoded(cname string,
   320  	blkNum uint16) (bool, error) {
   321  
   322  	// reads the csg file and decides whether this particular block is encoded via dictionary encoding
   323  	// or raw csg encoding, and returns if it is dict-enc, along with the map of each dict-key => recNums pairing
   324  
   325  	keyIndex, ok := mcsr.allColsReverseIndex[cname]
   326  	if !ok {
   327  		// Debug to avoid log flood for when the column does not exist
   328  		log.Debugf("IsBlkDictEncoded: failed to find column %s in muli col reader. All cols %+v", cname, mcsr.allColsReverseIndex)
   329  		return false, errors.New("column not found in MultipleColumnSegmentReader")
   330  	}
   331  
   332  	return mcsr.allFileReaders[keyIndex].IsBlkDictEncoded(blkNum)
   333  }
   334  
   335  /*
   336  parameters:
   337  
   338  	results:  map of recNum -> colName -> colValue to be filled in.
   339  	col:      columnName
   340  	blockNum: blocknum to search for
   341  	rnMap:    map of recordNumbers to for which to find the colValue for the given colname
   342  
   343  returns:
   344  
   345  	bool: if we are able to find the requested column in dict encoding
   346  */
   347  func (mcsr *MultiColSegmentReader) GetDictEncCvalsFromColFile(results map[uint16]map[string]interface{},
   348  	col string, blockNum uint16, orderedRecNums []uint16, qid uint64) bool {
   349  
   350  	keyIndex, ok := mcsr.allColsReverseIndex[col]
   351  	if !ok {
   352  		return false
   353  	}
   354  
   355  	return mcsr.allFileReaders[keyIndex].GetDictEncCvalsFromColFile(results, blockNum, orderedRecNums)
   356  }
   357  
   358  func (mcsr *MultiColSegmentReader) ApplySearchToMatchFilterDictCsg(match *structs.MatchFilter,
   359  	bsh *structs.BlockSearchHelper, cname string) (bool, error) {
   360  
   361  	keyIndex, ok := mcsr.allColsReverseIndex[cname]
   362  	if !ok {
   363  		return false, errors.New("could not find sfr for cname")
   364  	}
   365  
   366  	return mcsr.allFileReaders[keyIndex].ApplySearchToMatchFilterDictCsg(match, bsh)
   367  }
   368  
   369  func (mcsr *MultiColSegmentReader) ApplySearchToExpressionFilterDictCsg(qValDte *utils.DtypeEnclosure,
   370  	fop utils.FilterOperator, isRegexSearch bool, bsh *structs.BlockSearchHelper,
   371  	cname string) (bool, error) {
   372  
   373  	keyIndex, ok := mcsr.allColsReverseIndex[cname]
   374  	if !ok {
   375  		return false, errors.New("could not find sfr for cname")
   376  	}
   377  
   378  	return mcsr.allFileReaders[keyIndex].ApplySearchToExpressionFilterDictCsg(qValDte,
   379  		fop, isRegexSearch, bsh)
   380  }
   381  
   382  func (mcsr *MultiColSegmentReader) IsColPresent(cname string) bool {
   383  	_, ok := mcsr.allColsReverseIndex[cname]
   384  	return ok
   385  }