github.com/siglens/siglens@v0.0.0-20240328180423-f7ce9ae441ed/pkg/segment/writer/agiletree.go (about) 1 /* 2 Copyright 2023. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package writer 18 19 import ( 20 "fmt" 21 "math" 22 23 "github.com/siglens/siglens/pkg/segment/utils" 24 toputils "github.com/siglens/siglens/pkg/utils" 25 log "github.com/sirupsen/logrus" 26 ) 27 28 type StarTree struct { 29 Root *Node 30 } 31 32 // its ok for this to be int, since this will be used as an index in arrays 33 const ( 34 MeasFnMinIdx int = iota // has to be always zero based 35 MeasFnMaxIdx 36 MeasFnSumIdx 37 MeasFnCountIdx 38 // Note: anytimes you add a Fn, make sure to adjust the IdxToAgFn array 39 // Note: always keep this last since it is used for indexing into aggValues 40 TotalMeasFns 41 ) 42 43 var IdxToAgFn []utils.AggregateFunctions = []utils.AggregateFunctions{ 44 utils.Min, utils.Max, 45 utils.Sum, utils.Count} 46 47 func AgFnToIdx(fn utils.AggregateFunctions) int { 48 switch fn { 49 case utils.Min: 50 return MeasFnMinIdx 51 case utils.Max: 52 return MeasFnMaxIdx 53 case utils.Sum: 54 return MeasFnSumIdx 55 case utils.Count: 56 return MeasFnCountIdx 57 } 58 log.Errorf("AgFnToIdx: invalid fn: %v", fn) 59 return MeasFnCountIdx 60 } 61 62 var one = utils.CValueEnclosure{Dtype: utils.SS_DT_UNSIGNED_NUM, CVal: uint64(1)} 63 64 type Node struct { 65 myKey uint32 66 parent *Node 67 children map[uint32]*Node 68 aggValues []utils.CValueEnclosure 69 } 70 71 type StarTreeBuilder struct { 72 groupByKeys []string 73 numGroupByCols uint16 74 mColNames []string 75 nodeCount int 76 nodePool []Node 77 tree *StarTree 78 segDictMap []map[string]uint32 // "mac" ==> enc-2 79 segDictEncRev [][]string // [colNum]["ios", "mac", "win" ...] , [0][enc2] --> "mac" 80 segDictLastNum []uint32 // for each ColNum maintains the lastEnc increasing seq 81 wipRecNumToColEnc [][]uint32 //maintain working buffer per wipBlock 82 buf []byte 83 } 84 85 func (stb *StarTreeBuilder) GetNodeCount() int { 86 return stb.nodeCount 87 } 88 89 /* 90 ResetSegTree 91 92 Current assumptions: 93 94 All groupBy columns that contain strings are dictionaryEncoded. 95 Any column with len(col.deMap) != 0 is assumed to be dictionary encoded 96 It is also assumed that no other values than the dic encoded strings appear in that column 97 98 When storing all other values, their raw byte values are converted to an unsigned integer, 99 and then converted to uint64 to have a consistent size 100 101 parameters: 102 103 wipBlock: segstore's wip block 104 groupByKeys: groupBy column Names 105 mColNames: colnames of measure columns 106 107 returns: 108 */ 109 func (stb *StarTreeBuilder) ResetSegTree(block *WipBlock, groupByKeys []string, mColNames []string) { 110 111 stb.groupByKeys = groupByKeys 112 numGroupByCols := uint16(len(groupByKeys)) 113 stb.numGroupByCols = numGroupByCols 114 stb.mColNames = mColNames 115 116 stb.resetNodeData(block) 117 118 root := stb.newNode() 119 root.myKey = math.MaxUint32 // give max for root 120 stb.tree = &StarTree{Root: root} 121 122 sizeToAdd := int(numGroupByCols) - len(stb.segDictEncRev) 123 if sizeToAdd <= 0 { 124 stb.segDictEncRev = stb.segDictEncRev[:numGroupByCols] 125 stb.segDictMap = stb.segDictMap[:numGroupByCols] 126 stb.wipRecNumToColEnc = stb.wipRecNumToColEnc[:stb.numGroupByCols] 127 stb.segDictLastNum = stb.segDictLastNum[:stb.numGroupByCols] 128 } else { 129 newArr := make([][]string, sizeToAdd) 130 stb.segDictEncRev = append(stb.segDictEncRev, newArr...) 131 newArr2 := make([][]uint32, sizeToAdd) 132 stb.wipRecNumToColEnc = append(stb.wipRecNumToColEnc, newArr2...) 133 stb.segDictMap = append(stb.segDictMap, make([]map[string]uint32, sizeToAdd)...) 134 stb.segDictLastNum = append(stb.segDictLastNum, make([]uint32, sizeToAdd)...) 135 } 136 137 for colNum := uint16(0); colNum < numGroupByCols; colNum++ { 138 if stb.segDictEncRev[colNum] == nil { 139 // we know each col won't have more encodings than max node limit 140 stb.segDictEncRev[colNum] = make([]string, MaxAgileTreeNodeCount) 141 } 142 if stb.segDictMap[colNum] == nil { 143 stb.segDictMap[colNum] = make(map[string]uint32) 144 } 145 stb.segDictLastNum[colNum] = 0 146 for cv := range stb.segDictMap[colNum] { 147 delete(stb.segDictMap[colNum], cv) 148 } 149 } 150 151 if len(stb.buf) <= 0 { 152 stb.buf = make([]byte, 1_000_000) // initial start size 153 } 154 } 155 156 func (stb *StarTreeBuilder) setColValEnc(colNum int, colVal string) uint32 { 157 // todo a zero copy version of map lookups needed 158 enc, ok := stb.segDictMap[colNum][colVal] 159 if !ok { 160 enc = stb.segDictLastNum[colNum] 161 stb.segDictMap[colNum][colVal] = enc 162 stb.segDictEncRev[colNum][enc] = colVal 163 stb.segDictLastNum[colNum]++ 164 } 165 return enc 166 } 167 168 // helper function to reset node data for builder reuse 169 func (stb *StarTreeBuilder) resetNodeData(wip *WipBlock) { 170 171 for _, node := range stb.nodePool { 172 node.parent = nil 173 for k := range node.children { 174 delete(node.children, k) 175 } 176 node.aggValues = nil 177 } 178 stb.nodeCount = 0 179 } 180 181 func (stb *StarTreeBuilder) newNode() *Node { 182 183 if stb.nodeCount >= len(stb.nodePool) { 184 stb.nodePool = append(stb.nodePool, Node{}) 185 } 186 ans := stb.nodePool[stb.nodeCount] 187 stb.nodeCount += 1 188 189 if ans.children == nil { 190 ans.children = make(map[uint32]*Node) 191 } 192 193 return &ans 194 } 195 196 func (stb *StarTreeBuilder) Aggregate(cur *Node) error { 197 198 first := true 199 200 lenAggValues := len(stb.mColNames) * TotalMeasFns 201 202 if len(cur.children) != 0 { 203 cur.aggValues = make([]utils.CValueEnclosure, lenAggValues) 204 } 205 206 var err error 207 for _, child := range cur.children { 208 err = stb.Aggregate(child) 209 if err != nil { 210 return err 211 } 212 213 if first { 214 copy(cur.aggValues[:lenAggValues], child.aggValues[:lenAggValues]) 215 first = false 216 continue 217 } 218 219 for mcNum := range stb.mColNames { 220 midx := mcNum * TotalMeasFns 221 agidx := midx + MeasFnMinIdx 222 cur.aggValues[agidx], err = utils.Reduce(cur.aggValues[agidx], child.aggValues[agidx], utils.Min) 223 if err != nil { 224 log.Errorf("Aggregate: error in aggregating min err:%v", err) 225 return err 226 } 227 agidx = midx + MeasFnMaxIdx 228 cur.aggValues[agidx], err = utils.Reduce(cur.aggValues[agidx], child.aggValues[agidx], utils.Max) 229 if err != nil { 230 log.Errorf("Aggregate: error in aggregating max err:%v", err) 231 return err 232 } 233 agidx = midx + MeasFnSumIdx 234 cur.aggValues[agidx], err = utils.Reduce(cur.aggValues[agidx], child.aggValues[agidx], utils.Sum) 235 if err != nil { 236 log.Errorf("Aggregate: error in aggregating sum err:%v", err) 237 return err 238 } 239 agidx = midx + MeasFnCountIdx 240 cur.aggValues[agidx], err = utils.Reduce(cur.aggValues[agidx], child.aggValues[agidx], utils.Count) 241 if err != nil { 242 log.Errorf("Aggregate: error in aggregating count err:%v", err) 243 return err 244 } 245 } 246 } 247 248 return nil 249 } 250 251 func (stb *StarTreeBuilder) insertIntoTree(node *Node, colVals []uint32, recNum uint16, idx uint) *Node { 252 child, keyExists := node.children[colVals[idx]] 253 if !keyExists { 254 child = stb.newNode() 255 child.myKey = colVals[idx] 256 child.parent = node 257 node.children[colVals[idx]] = child 258 } 259 260 if idx+1 != uint(len(colVals)) { 261 return stb.insertIntoTree(child, colVals, recNum, idx+1) 262 } else { 263 return child 264 } 265 } 266 267 func (stb *StarTreeBuilder) creatEnc(wip *WipBlock) error { 268 269 numRecs := wip.blockSummary.RecCount 270 271 for colNum, colName := range stb.groupByKeys { 272 sizeToAdd := int(numRecs) - len(stb.wipRecNumToColEnc[colNum]) 273 if sizeToAdd > 0 { 274 newArr := make([]uint32, sizeToAdd) 275 stb.wipRecNumToColEnc[colNum] = append(stb.wipRecNumToColEnc[colNum], newArr...) 276 } 277 278 cwip := wip.colWips[colName] 279 if cwip.deCount < wipCardLimit { 280 for rawKey, indices := range cwip.deMap { 281 enc := stb.setColValEnc(colNum, rawKey) 282 for _, recNum := range indices { 283 stb.wipRecNumToColEnc[colNum][recNum] = enc 284 } 285 } 286 continue // done with this dict encoded column 287 } 288 289 // read the non-dict way 290 idx := uint32(0) 291 for recNum := uint16(0); recNum < numRecs; recNum++ { 292 cVal, endIdx, err := getColByteSlice(cwip.cbuf[idx:], 0) // todo pass qid here 293 if err != nil { 294 log.Errorf("populateLeafsWithMeasVals: Could not extract val for cname: %v, idx: %v", 295 colName, idx) 296 return err 297 } 298 idx += uint32(endIdx) 299 enc := stb.setColValEnc(colNum, string(cVal)) 300 stb.wipRecNumToColEnc[colNum][recNum] = enc 301 } 302 if idx < cwip.cbufidx { 303 log.Errorf("creatEnc: passed thru all recNums, but idx: %v is not equal to cbufidx: %v", 304 idx, cwip.cbufidx) 305 } 306 } 307 return nil 308 } 309 310 func (stb *StarTreeBuilder) buildTreeStructure(wip *WipBlock) error { 311 312 numRecs := wip.blockSummary.RecCount 313 314 sizeToAdd := int(numRecs) - len(stb.nodePool) 315 if sizeToAdd > 0 { 316 newArr := make([]Node, sizeToAdd) 317 stb.nodePool = append(stb.nodePool, newArr...) 318 } 319 320 curColValues := make([]uint32, stb.numGroupByCols) 321 lenAggValues := len(stb.mColNames) * TotalMeasFns 322 measCidx := make([]uint32, len(stb.mColNames)) 323 324 for recNum := uint16(0); recNum < numRecs; recNum += 1 { 325 for colNum := range stb.groupByKeys { 326 curColValues[colNum] = stb.wipRecNumToColEnc[colNum][recNum] 327 } 328 node := stb.insertIntoTree(stb.tree.Root, curColValues[:stb.numGroupByCols], recNum, 0) 329 for mcNum, mcName := range stb.mColNames { 330 cwip := wip.colWips[mcName] 331 midx := mcNum * TotalMeasFns 332 cVal, err := getMeasCval(cwip, recNum, measCidx, mcNum, mcName) 333 if err != nil { 334 log.Errorf("buildTreeStructure: Could not get measure for cname: %v, err: %v", 335 mcName, err) 336 } 337 err = stb.addMeasures(cVal, lenAggValues, midx, node) 338 if err != nil { 339 log.Errorf("buildTreeStructure: Could not add measure for cname: %v", mcName) 340 return err 341 } 342 } 343 } 344 return nil 345 } 346 347 func (stb *StarTreeBuilder) addMeasures(val utils.CValueEnclosure, 348 lenAggValues int, midx int, node *Node) error { 349 350 if node.aggValues == nil { 351 node.aggValues = make([]utils.CValueEnclosure, lenAggValues) 352 } 353 354 var err error 355 // always calculate all meas Fns 356 agvidx := midx + MeasFnMinIdx 357 node.aggValues[agvidx], err = utils.Reduce(node.aggValues[agvidx], val, utils.Min) 358 if err != nil { 359 log.Errorf("addMeasures: error in min err:%v", err) 360 return err 361 } 362 agvidx = midx + MeasFnMaxIdx 363 node.aggValues[agvidx], err = utils.Reduce(node.aggValues[agvidx], val, utils.Max) 364 if err != nil { 365 log.Errorf("addMeasures: error in max err:%v", err) 366 return err 367 } 368 agvidx = midx + MeasFnSumIdx 369 node.aggValues[agvidx], err = utils.Reduce(node.aggValues[agvidx], val, utils.Sum) 370 if err != nil { 371 log.Errorf("addMeasures: error in sum err:%v", err) 372 return err 373 } 374 375 agvidx = midx + MeasFnCountIdx 376 // for count we always use 1 instead of val 377 node.aggValues[agvidx], err = utils.Reduce(node.aggValues[agvidx], one, utils.Count) 378 if err != nil { 379 log.Errorf("addMeasures: error in count err:%v", err) 380 return err 381 } 382 return nil 383 } 384 385 /* 386 ComputeStarTree 387 388 Current assumptions: 389 390 All groupBy columns that contain strings are dictionaryEncoded. 391 Any column with len(col.deMap) != 0 is assumed to be dictionary encoded 392 It is also assumed that no other values than the dic encoded strings appear in that column 393 394 When storing all other values, their raw byte values are converted to an unsigned integer, 395 and then converted to uint64 to have a consistent size 396 397 parameters: 398 399 wipBlock: segstore's wip block 400 401 returns: 402 403 StarTree: ptr to StarTree 404 */ 405 func (stb *StarTreeBuilder) ComputeStarTree(wip *WipBlock) error { 406 407 err := stb.creatEnc(wip) 408 if err != nil { 409 return err 410 } 411 412 err = stb.buildTreeStructure(wip) 413 if err != nil { 414 return err 415 } 416 417 // stb.logStarTreeSummary([]*Node{stb.tree.Root}, 0) 418 //stb.logStarTreeIds(tree.Root, -1) 419 420 return nil 421 } 422 423 /* 424 func (stb *StarTreeBuilder) logStarTreeSummary(nodes []*Node, level int) { 425 nextLevel := []*Node{} 426 for _, n := range nodes { 427 for _, child := range n.children { 428 nextLevel = append(nextLevel, child) 429 } 430 } 431 432 log.Infof("logStarTreeSummary: level %d has %d nodes", level, len(nodes)) 433 if len(nextLevel) > 0 { 434 stb.logStarTreeSummary(nextLevel, level+1) 435 } 436 } 437 */ 438 439 /* 440 func (stb *StarTreeBuilder) logStarTreeIds(node *Node, level int) { 441 442 log.Infof("logStarTreeIds: level %d nodeId: %v, numChilds: %v", level, node.myKey, len(node.children)) 443 444 for _, child := range node.children { 445 stb.logStarTreeIds(child, level+1) 446 } 447 } 448 */ 449 450 func getMeasCval(cwip *ColWip, recNum uint16, cIdx []uint32, colNum int, 451 colName string) (utils.CValueEnclosure, error) { 452 453 if cwip.deCount < wipCardLimit { 454 for dword, recNumsArr := range cwip.deMap { 455 if toputils.BinarySearchUint16(recNum, recNumsArr) { 456 mcVal, _, err := GetCvalFromRec([]byte(dword)[0:], 0) 457 if err != nil { 458 log.Errorf("getMeasCval: Could not extract val for cname: %v, dword: %v", 459 colName, dword) 460 return utils.CValueEnclosure{}, err 461 } 462 return mcVal, nil 463 } 464 } 465 return utils.CValueEnclosure{}, fmt.Errorf("could not find recNum: %v", recNum) 466 } 467 468 cVal, endIdx, err := GetCvalFromRec(cwip.cbuf[cIdx[colNum]:], 0) // todo pass qid 469 if err != nil { 470 log.Errorf("getMeasCval: Could not extract val for cname: %v, idx: %v", 471 colName, cIdx[colNum]) 472 return utils.CValueEnclosure{}, err 473 } 474 cIdx[colNum] += uint32(endIdx) 475 return cVal, nil 476 }