github.com/siglens/siglens@v0.0.0-20240328180423-f7ce9ae441ed/pkg/segment/writer/agiletree.go (about)

     1  /*
     2  Copyright 2023.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package writer
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  
    23  	"github.com/siglens/siglens/pkg/segment/utils"
    24  	toputils "github.com/siglens/siglens/pkg/utils"
    25  	log "github.com/sirupsen/logrus"
    26  )
    27  
// StarTree is the aggregation tree produced by StarTreeBuilder. It only holds
// the root; every other node is reached through Node.children.
type StarTree struct {
	Root *Node
}
    31  
// Offsets of the individual aggregate functions inside the per-measure slice
// of Node.aggValues: the value for measure column m and function f is stored
// at aggValues[m*TotalMeasFns+f].
// It's ok for this to be int, since this will be used as an index in arrays.
const (
	MeasFnMinIdx int = iota // has to be always zero based
	MeasFnMaxIdx
	MeasFnSumIdx
	MeasFnCountIdx
	// Note: any time you add a Fn, make sure to adjust the IdxToAgFn array
	// Note: always keep this last since it is used for indexing into aggValues
	TotalMeasFns
)
    42  
    43  var IdxToAgFn []utils.AggregateFunctions = []utils.AggregateFunctions{
    44  	utils.Min, utils.Max,
    45  	utils.Sum, utils.Count}
    46  
    47  func AgFnToIdx(fn utils.AggregateFunctions) int {
    48  	switch fn {
    49  	case utils.Min:
    50  		return MeasFnMinIdx
    51  	case utils.Max:
    52  		return MeasFnMaxIdx
    53  	case utils.Sum:
    54  		return MeasFnSumIdx
    55  	case utils.Count:
    56  		return MeasFnCountIdx
    57  	}
    58  	log.Errorf("AgFnToIdx: invalid fn: %v", fn)
    59  	return MeasFnCountIdx
    60  }
    61  
// one is the operand passed to the Count reduction in addMeasures: every
// record contributes 1 to the count regardless of the measure value.
var one = utils.CValueEnclosure{Dtype: utils.SS_DT_UNSIGNED_NUM, CVal: uint64(1)}
    63  
// Node is a single node of the star tree.
//
//   - myKey is the encoded group-by value this node stands for
//     (the root is given math.MaxUint32 by ResetSegTree).
//   - parent is the node this one was inserted under.
//   - children maps the encoded value of the next group-by column to its node.
//   - aggValues holds TotalMeasFns entries per measure column, addressed as
//     mcNum*TotalMeasFns + MeasFn*Idx.
type Node struct {
	myKey     uint32
	parent    *Node
	children  map[uint32]*Node
	aggValues []utils.CValueEnclosure
}
    70  
// StarTreeBuilder builds a StarTree from wip blocks and keeps the working
// state that is reused between builds.
//
// groupByKeys/numGroupByCols and mColNames are set by ResetSegTree and name
// the group-by and measure columns. nodeCount is the number of nodes handed
// out by newNode since the last reset and nodePool is the backing storage for
// them. The segDict* slices are indexed by group-by column number and hold the
// per-column string <-> uint32 encoding. wipRecNumToColEnc[colNum][recNum] is
// the encoded group-by value of a record in the block currently being
// processed. buf is a byte buffer that ResetSegTree sizes to 1,000,000 bytes
// when it is empty; it is not otherwise used in this file.
type StarTreeBuilder struct {
	groupByKeys       []string
	numGroupByCols    uint16
	mColNames         []string
	nodeCount         int
	nodePool          []Node
	tree              *StarTree
	segDictMap        []map[string]uint32 // "mac" ==> enc-2
	segDictEncRev     [][]string          // [colNum]["ios", "mac", "win" ...] , [0][enc2] --> "mac"
	segDictLastNum    []uint32            // for each ColNum maintains the lastEnc increasing seq
	wipRecNumToColEnc [][]uint32          //maintain working buffer per wipBlock
	buf               []byte
}
    84  
// GetNodeCount returns the number of nodes handed out by newNode since the
// builder was last reset (the root node is included).
func (stb *StarTreeBuilder) GetNodeCount() int {
	return stb.nodeCount
}
    88  
    89  /*
    90  ResetSegTree
    91  
    92  	Current assumptions:
    93  
    94  	All groupBy columns that contain strings are dictionaryEncoded.
    95  	Any column with len(col.deMap) != 0 is assumed to be dictionary encoded
    96  	It is also assumed that no other values than the dic encoded strings appear in that column
    97  
    98  	When storing all other values, their raw byte values are converted to an unsigned integer,
    99  	and then converted to uint64 to have a consistent size
   100  
   101  parameters:
   102  
   103  	wipBlock: segstore's wip block
   104  	groupByKeys: groupBy column Names
   105  	mColNames: colnames of measure columns
   106  
   107  returns:
   108  */
   109  func (stb *StarTreeBuilder) ResetSegTree(block *WipBlock, groupByKeys []string, mColNames []string) {
   110  
   111  	stb.groupByKeys = groupByKeys
   112  	numGroupByCols := uint16(len(groupByKeys))
   113  	stb.numGroupByCols = numGroupByCols
   114  	stb.mColNames = mColNames
   115  
   116  	stb.resetNodeData(block)
   117  
   118  	root := stb.newNode()
   119  	root.myKey = math.MaxUint32 // give max for root
   120  	stb.tree = &StarTree{Root: root}
   121  
   122  	sizeToAdd := int(numGroupByCols) - len(stb.segDictEncRev)
   123  	if sizeToAdd <= 0 {
   124  		stb.segDictEncRev = stb.segDictEncRev[:numGroupByCols]
   125  		stb.segDictMap = stb.segDictMap[:numGroupByCols]
   126  		stb.wipRecNumToColEnc = stb.wipRecNumToColEnc[:stb.numGroupByCols]
   127  		stb.segDictLastNum = stb.segDictLastNum[:stb.numGroupByCols]
   128  	} else {
   129  		newArr := make([][]string, sizeToAdd)
   130  		stb.segDictEncRev = append(stb.segDictEncRev, newArr...)
   131  		newArr2 := make([][]uint32, sizeToAdd)
   132  		stb.wipRecNumToColEnc = append(stb.wipRecNumToColEnc, newArr2...)
   133  		stb.segDictMap = append(stb.segDictMap, make([]map[string]uint32, sizeToAdd)...)
   134  		stb.segDictLastNum = append(stb.segDictLastNum, make([]uint32, sizeToAdd)...)
   135  	}
   136  
   137  	for colNum := uint16(0); colNum < numGroupByCols; colNum++ {
   138  		if stb.segDictEncRev[colNum] == nil {
   139  			// we know each col won't have more encodings than max node limit
   140  			stb.segDictEncRev[colNum] = make([]string, MaxAgileTreeNodeCount)
   141  		}
   142  		if stb.segDictMap[colNum] == nil {
   143  			stb.segDictMap[colNum] = make(map[string]uint32)
   144  		}
   145  		stb.segDictLastNum[colNum] = 0
   146  		for cv := range stb.segDictMap[colNum] {
   147  			delete(stb.segDictMap[colNum], cv)
   148  		}
   149  	}
   150  
   151  	if len(stb.buf) <= 0 {
   152  		stb.buf = make([]byte, 1_000_000) // initial start size
   153  	}
   154  }
   155  
   156  func (stb *StarTreeBuilder) setColValEnc(colNum int, colVal string) uint32 {
   157  	// todo a zero copy version of map lookups needed
   158  	enc, ok := stb.segDictMap[colNum][colVal]
   159  	if !ok {
   160  		enc = stb.segDictLastNum[colNum]
   161  		stb.segDictMap[colNum][colVal] = enc
   162  		stb.segDictEncRev[colNum][enc] = colVal
   163  		stb.segDictLastNum[colNum]++
   164  	}
   165  	return enc
   166  }
   167  
   168  // helper function to reset node data for builder reuse
   169  func (stb *StarTreeBuilder) resetNodeData(wip *WipBlock) {
   170  
   171  	for _, node := range stb.nodePool {
   172  		node.parent = nil
   173  		for k := range node.children {
   174  			delete(node.children, k)
   175  		}
   176  		node.aggValues = nil
   177  	}
   178  	stb.nodeCount = 0
   179  }
   180  
   181  func (stb *StarTreeBuilder) newNode() *Node {
   182  
   183  	if stb.nodeCount >= len(stb.nodePool) {
   184  		stb.nodePool = append(stb.nodePool, Node{})
   185  	}
   186  	ans := stb.nodePool[stb.nodeCount]
   187  	stb.nodeCount += 1
   188  
   189  	if ans.children == nil {
   190  		ans.children = make(map[uint32]*Node)
   191  	}
   192  
   193  	return &ans
   194  }
   195  
   196  func (stb *StarTreeBuilder) Aggregate(cur *Node) error {
   197  
   198  	first := true
   199  
   200  	lenAggValues := len(stb.mColNames) * TotalMeasFns
   201  
   202  	if len(cur.children) != 0 {
   203  		cur.aggValues = make([]utils.CValueEnclosure, lenAggValues)
   204  	}
   205  
   206  	var err error
   207  	for _, child := range cur.children {
   208  		err = stb.Aggregate(child)
   209  		if err != nil {
   210  			return err
   211  		}
   212  
   213  		if first {
   214  			copy(cur.aggValues[:lenAggValues], child.aggValues[:lenAggValues])
   215  			first = false
   216  			continue
   217  		}
   218  
   219  		for mcNum := range stb.mColNames {
   220  			midx := mcNum * TotalMeasFns
   221  			agidx := midx + MeasFnMinIdx
   222  			cur.aggValues[agidx], err = utils.Reduce(cur.aggValues[agidx], child.aggValues[agidx], utils.Min)
   223  			if err != nil {
   224  				log.Errorf("Aggregate: error in aggregating min err:%v", err)
   225  				return err
   226  			}
   227  			agidx = midx + MeasFnMaxIdx
   228  			cur.aggValues[agidx], err = utils.Reduce(cur.aggValues[agidx], child.aggValues[agidx], utils.Max)
   229  			if err != nil {
   230  				log.Errorf("Aggregate: error in aggregating max err:%v", err)
   231  				return err
   232  			}
   233  			agidx = midx + MeasFnSumIdx
   234  			cur.aggValues[agidx], err = utils.Reduce(cur.aggValues[agidx], child.aggValues[agidx], utils.Sum)
   235  			if err != nil {
   236  				log.Errorf("Aggregate: error in aggregating sum err:%v", err)
   237  				return err
   238  			}
   239  			agidx = midx + MeasFnCountIdx
   240  			cur.aggValues[agidx], err = utils.Reduce(cur.aggValues[agidx], child.aggValues[agidx], utils.Count)
   241  			if err != nil {
   242  				log.Errorf("Aggregate: error in aggregating count err:%v", err)
   243  				return err
   244  			}
   245  		}
   246  	}
   247  
   248  	return nil
   249  }
   250  
   251  func (stb *StarTreeBuilder) insertIntoTree(node *Node, colVals []uint32, recNum uint16, idx uint) *Node {
   252  	child, keyExists := node.children[colVals[idx]]
   253  	if !keyExists {
   254  		child = stb.newNode()
   255  		child.myKey = colVals[idx]
   256  		child.parent = node
   257  		node.children[colVals[idx]] = child
   258  	}
   259  
   260  	if idx+1 != uint(len(colVals)) {
   261  		return stb.insertIntoTree(child, colVals, recNum, idx+1)
   262  	} else {
   263  		return child
   264  	}
   265  }
   266  
   267  func (stb *StarTreeBuilder) creatEnc(wip *WipBlock) error {
   268  
   269  	numRecs := wip.blockSummary.RecCount
   270  
   271  	for colNum, colName := range stb.groupByKeys {
   272  		sizeToAdd := int(numRecs) - len(stb.wipRecNumToColEnc[colNum])
   273  		if sizeToAdd > 0 {
   274  			newArr := make([]uint32, sizeToAdd)
   275  			stb.wipRecNumToColEnc[colNum] = append(stb.wipRecNumToColEnc[colNum], newArr...)
   276  		}
   277  
   278  		cwip := wip.colWips[colName]
   279  		if cwip.deCount < wipCardLimit {
   280  			for rawKey, indices := range cwip.deMap {
   281  				enc := stb.setColValEnc(colNum, rawKey)
   282  				for _, recNum := range indices {
   283  					stb.wipRecNumToColEnc[colNum][recNum] = enc
   284  				}
   285  			}
   286  			continue // done with this dict encoded column
   287  		}
   288  
   289  		// read the non-dict way
   290  		idx := uint32(0)
   291  		for recNum := uint16(0); recNum < numRecs; recNum++ {
   292  			cVal, endIdx, err := getColByteSlice(cwip.cbuf[idx:], 0) // todo pass qid here
   293  			if err != nil {
   294  				log.Errorf("populateLeafsWithMeasVals: Could not extract val for cname: %v, idx: %v",
   295  					colName, idx)
   296  				return err
   297  			}
   298  			idx += uint32(endIdx)
   299  			enc := stb.setColValEnc(colNum, string(cVal))
   300  			stb.wipRecNumToColEnc[colNum][recNum] = enc
   301  		}
   302  		if idx < cwip.cbufidx {
   303  			log.Errorf("creatEnc: passed thru all recNums, but idx: %v is not equal to cbufidx: %v",
   304  				idx, cwip.cbufidx)
   305  		}
   306  	}
   307  	return nil
   308  }
   309  
   310  func (stb *StarTreeBuilder) buildTreeStructure(wip *WipBlock) error {
   311  
   312  	numRecs := wip.blockSummary.RecCount
   313  
   314  	sizeToAdd := int(numRecs) - len(stb.nodePool)
   315  	if sizeToAdd > 0 {
   316  		newArr := make([]Node, sizeToAdd)
   317  		stb.nodePool = append(stb.nodePool, newArr...)
   318  	}
   319  
   320  	curColValues := make([]uint32, stb.numGroupByCols)
   321  	lenAggValues := len(stb.mColNames) * TotalMeasFns
   322  	measCidx := make([]uint32, len(stb.mColNames))
   323  
   324  	for recNum := uint16(0); recNum < numRecs; recNum += 1 {
   325  		for colNum := range stb.groupByKeys {
   326  			curColValues[colNum] = stb.wipRecNumToColEnc[colNum][recNum]
   327  		}
   328  		node := stb.insertIntoTree(stb.tree.Root, curColValues[:stb.numGroupByCols], recNum, 0)
   329  		for mcNum, mcName := range stb.mColNames {
   330  			cwip := wip.colWips[mcName]
   331  			midx := mcNum * TotalMeasFns
   332  			cVal, err := getMeasCval(cwip, recNum, measCidx, mcNum, mcName)
   333  			if err != nil {
   334  				log.Errorf("buildTreeStructure: Could not get measure for cname: %v, err: %v",
   335  					mcName, err)
   336  			}
   337  			err = stb.addMeasures(cVal, lenAggValues, midx, node)
   338  			if err != nil {
   339  				log.Errorf("buildTreeStructure: Could not add measure for cname: %v", mcName)
   340  				return err
   341  			}
   342  		}
   343  	}
   344  	return nil
   345  }
   346  
   347  func (stb *StarTreeBuilder) addMeasures(val utils.CValueEnclosure,
   348  	lenAggValues int, midx int, node *Node) error {
   349  
   350  	if node.aggValues == nil {
   351  		node.aggValues = make([]utils.CValueEnclosure, lenAggValues)
   352  	}
   353  
   354  	var err error
   355  	// always calculate all meas Fns
   356  	agvidx := midx + MeasFnMinIdx
   357  	node.aggValues[agvidx], err = utils.Reduce(node.aggValues[agvidx], val, utils.Min)
   358  	if err != nil {
   359  		log.Errorf("addMeasures: error in min err:%v", err)
   360  		return err
   361  	}
   362  	agvidx = midx + MeasFnMaxIdx
   363  	node.aggValues[agvidx], err = utils.Reduce(node.aggValues[agvidx], val, utils.Max)
   364  	if err != nil {
   365  		log.Errorf("addMeasures: error in max err:%v", err)
   366  		return err
   367  	}
   368  	agvidx = midx + MeasFnSumIdx
   369  	node.aggValues[agvidx], err = utils.Reduce(node.aggValues[agvidx], val, utils.Sum)
   370  	if err != nil {
   371  		log.Errorf("addMeasures: error in sum err:%v", err)
   372  		return err
   373  	}
   374  
   375  	agvidx = midx + MeasFnCountIdx
   376  	// for count we always use 1 instead of val
   377  	node.aggValues[agvidx], err = utils.Reduce(node.aggValues[agvidx], one, utils.Count)
   378  	if err != nil {
   379  		log.Errorf("addMeasures: error in count err:%v", err)
   380  		return err
   381  	}
   382  	return nil
   383  }
   384  
   385  /*
   386  ComputeStarTree
   387  
   388  	Current assumptions:
   389  
   390  	All groupBy columns that contain strings are dictionaryEncoded.
   391  	Any column with len(col.deMap) != 0 is assumed to be dictionary encoded
   392  	It is also assumed that no other values than the dic encoded strings appear in that column
   393  
   394  	When storing all other values, their raw byte values are converted to an unsigned integer,
   395  	and then converted to uint64 to have a consistent size
   396  
   397  parameters:
   398  
   399  	wipBlock: segstore's wip block
   400  
   401  returns:
   402  
   403  	StarTree: ptr to StarTree
   404  */
   405  func (stb *StarTreeBuilder) ComputeStarTree(wip *WipBlock) error {
   406  
   407  	err := stb.creatEnc(wip)
   408  	if err != nil {
   409  		return err
   410  	}
   411  
   412  	err = stb.buildTreeStructure(wip)
   413  	if err != nil {
   414  		return err
   415  	}
   416  
   417  	//	stb.logStarTreeSummary([]*Node{stb.tree.Root}, 0)
   418  	//stb.logStarTreeIds(tree.Root, -1)
   419  
   420  	return nil
   421  }
   422  
   423  /*
   424  func (stb *StarTreeBuilder) logStarTreeSummary(nodes []*Node, level int) {
   425  	nextLevel := []*Node{}
   426  	for _, n := range nodes {
   427  		for _, child := range n.children {
   428  			nextLevel = append(nextLevel, child)
   429  		}
   430  	}
   431  
   432  	log.Infof("logStarTreeSummary: level %d has %d nodes", level, len(nodes))
   433  	if len(nextLevel) > 0 {
   434  		stb.logStarTreeSummary(nextLevel, level+1)
   435  	}
   436  }
   437  */
   438  
   439  /*
   440  func (stb *StarTreeBuilder) logStarTreeIds(node *Node, level int) {
   441  
   442  	log.Infof("logStarTreeIds: level %d nodeId: %v, numChilds: %v", level, node.myKey, len(node.children))
   443  
   444  	for _, child := range node.children {
   445  		stb.logStarTreeIds(child, level+1)
   446  	}
   447  	}
   448  */
   449  
   450  func getMeasCval(cwip *ColWip, recNum uint16, cIdx []uint32, colNum int,
   451  	colName string) (utils.CValueEnclosure, error) {
   452  
   453  	if cwip.deCount < wipCardLimit {
   454  		for dword, recNumsArr := range cwip.deMap {
   455  			if toputils.BinarySearchUint16(recNum, recNumsArr) {
   456  				mcVal, _, err := GetCvalFromRec([]byte(dword)[0:], 0)
   457  				if err != nil {
   458  					log.Errorf("getMeasCval: Could not extract val for cname: %v, dword: %v",
   459  						colName, dword)
   460  					return utils.CValueEnclosure{}, err
   461  				}
   462  				return mcVal, nil
   463  			}
   464  		}
   465  		return utils.CValueEnclosure{}, fmt.Errorf("could not find recNum: %v", recNum)
   466  	}
   467  
   468  	cVal, endIdx, err := GetCvalFromRec(cwip.cbuf[cIdx[colNum]:], 0) // todo pass qid
   469  	if err != nil {
   470  		log.Errorf("getMeasCval: Could not extract val for cname: %v, idx: %v",
   471  			colName, cIdx[colNum])
   472  		return utils.CValueEnclosure{}, err
   473  	}
   474  	cIdx[colNum] += uint32(endIdx)
   475  	return cVal, nil
   476  }