github.com/siglens/siglens@v0.0.0-20240328180423-f7ce9ae441ed/pkg/segment/reader/segread/agiletreereader.go (about) 1 /* 2 Copyright 2023. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package segread 18 19 import ( 20 "errors" 21 "fmt" 22 "os" 23 "strings" 24 25 "github.com/siglens/siglens/pkg/segment/results/blockresults" 26 "github.com/siglens/siglens/pkg/segment/structs" 27 "github.com/siglens/siglens/pkg/segment/utils" 28 "github.com/siglens/siglens/pkg/segment/writer" 29 toputils "github.com/siglens/siglens/pkg/utils" 30 log "github.com/sirupsen/logrus" 31 ) 32 33 const MAX_NODE_PTRS = 80_000 34 35 type AgileTreeReader struct { 36 segKey string 37 metaFd *os.File // meta file descriptor 38 levDataFd *os.File // level data file descriptor 39 isMetaLoaded bool 40 metaFileBuffer []byte // buffer re-used for file reads values 41 metaBuf []byte // meta buff block 42 treeMeta *StarTreeMetadata 43 buckets aggsTreeBuckets 44 } 45 46 type aggsTreeBuckets struct { 47 bucketLimit uint64 48 saveBuckets bool 49 rawVals map[string]struct{} 50 } 51 52 type StarTreeMetadata struct { 53 groupByKeys []string 54 numGroupByCols uint16 55 measureColNames []string // store only index of mcol, and calculate all stats for them 56 57 // allDictEncodings[colName] has information about the ith groupby column. 58 // allDictEncodings[colName][num] will give the raw encoding that num references in the agileTree 59 allDictEncodings map[string]map[uint32][]byte 60 levsOffsets []int64 // stores where each level starts in the file, uses fileOffsetFromStart 61 levsSizes []uint32 // stores the size of each level 62 } 63 64 // returns a new AgileTreeReader and any errors encountered 65 // The returned AgileTreeReader must call .Close() when finished using it to close the fd 66 func InitNewAgileTreeReader(segKey string, qid uint64) (*AgileTreeReader, error) { 67 68 // Open the FD for AgileTree 69 // todo add download code for agileTree file 70 // fName, err := blob.DownloadSegmentBlobAsInUse(segKey, colName, structs.Str) 71 fName := segKey + ".strm" 72 fd, err := os.OpenFile(fName, os.O_RDONLY, 0644) 73 if err != nil { 74 log.Infof("qid=%d, InitNewAgileTreeReader: failed to open STR %s for Error: %v.", 75 qid, fName, err) 76 return nil, err 77 } 78 79 return &AgileTreeReader{ 80 segKey: segKey, 81 metaFd: fd, 82 metaFileBuffer: *fileReadBufferPool.Get().(*[]byte), 83 isMetaLoaded: false, 84 buckets: aggsTreeBuckets{}, 85 }, nil 86 } 87 88 func (str *AgileTreeReader) GetBuckets() map[string]struct{} { 89 return str.buckets.rawVals 90 } 91 92 func (str *AgileTreeReader) SetBuckets(buckets map[string]struct{}) { 93 str.buckets.rawVals = buckets 94 } 95 96 func (str *AgileTreeReader) SetBucketLimit(bucketLimit uint64) { 97 str.buckets.bucketLimit = bucketLimit 98 99 // If the bucketLimit is 0, then there is no limit. If there is a limit, we 100 // need to save the buckets between each segment so if we hit the limit, 101 // we make sure to read all the same buckets between all the segments. 102 str.buckets.saveBuckets = bucketLimit > 0 103 } 104 105 func (str *AgileTreeReader) Close() error { 106 if str.metaFd != nil { 107 str.metaFd.Close() 108 } 109 if str.levDataFd != nil { 110 str.levDataFd.Close() 111 } 112 113 str.returnBuffers() 114 return nil 115 } 116 117 func (str *AgileTreeReader) returnBuffers() { 118 fileReadBufferPool.Put(&str.metaFileBuffer) 119 } 120 121 func (str *AgileTreeReader) resetBlkVars() { 122 123 str.treeMeta = nil 124 str.isMetaLoaded = false 125 } 126 127 /* 128 parameters: 129 130 none 131 132 returns: 133 134 err 135 */ 136 func (str *AgileTreeReader) ReadTreeMeta() error { 137 138 if str.isMetaLoaded { 139 return nil 140 } 141 142 str.resetBlkVars() 143 144 finfo, err := os.Stat(str.metaFd.Name()) 145 if err != nil { 146 log.Errorf("ReadTreeMeta could not get file size error: %+v", err) 147 return err 148 } 149 fileSize := uint32(finfo.Size()) 150 151 if uint32(len(str.metaFileBuffer)) < fileSize { 152 newArr := make([]byte, fileSize-uint32(len(str.metaFileBuffer))) 153 str.metaFileBuffer = append(str.metaFileBuffer, newArr...) 154 } 155 156 _, err = str.metaFd.ReadAt(str.metaFileBuffer[:fileSize], 0) 157 if err != nil { 158 log.Errorf("ReadTreeMeta read file error: %+v", err) 159 return err 160 } 161 162 if str.metaFileBuffer[0] != utils.STAR_TREE_BLOCK[0] { 163 log.Errorf("ReadTreeMeta: received an unknown encoding type for agileTree: %v", 164 str.metaFileBuffer[0]) 165 return errors.New("received non-agileTree encoding") 166 } 167 168 idx := uint32(0) 169 str.metaBuf = str.metaFileBuffer[0:fileSize] 170 idx += 1 171 172 // LenMetaData 173 lenMeta := toputils.BytesToUint32LittleEndian(str.metaBuf[idx : idx+4]) 174 idx += 4 175 176 // MetaData 177 meta, err := str.decodeMetadata(str.metaBuf[idx : idx+lenMeta]) 178 if err != nil { 179 return err 180 } 181 idx += lenMeta 182 183 // read levsOffsets and levsSizes 184 meta.levsOffsets = make([]int64, meta.numGroupByCols+1) 185 meta.levsSizes = make([]uint32, meta.numGroupByCols+1) 186 for i := range meta.levsOffsets { 187 meta.levsOffsets[i] = toputils.BytesToInt64LittleEndian(str.metaBuf[idx : idx+8]) 188 idx += 8 189 meta.levsSizes[i] = toputils.BytesToUint32LittleEndian(str.metaBuf[idx : idx+4]) 190 idx += 4 191 } 192 193 str.treeMeta = meta 194 str.isMetaLoaded = true 195 196 return nil 197 } 198 199 /* 200 parameters: 201 202 grpColNames: Names of GroupByColNames 203 mColNames: Names of MeasureColumns 204 205 returns: 206 207 bool: if grp and mcol are present and query is fully answerable by AgileTree 208 error: error if any 209 210 Func: If any colname either in grp or measure is not present will return false 211 */ 212 func (str *AgileTreeReader) CanUseAgileTree(grpReq *structs.GroupByRequest) (bool, error) { 213 214 if len(grpReq.GroupByColumns) == 0 && len(grpReq.MeasureOperations) == 0 { 215 return false, nil 216 } 217 218 if !str.isMetaLoaded { 219 err := str.ReadTreeMeta() 220 if err != nil { 221 return false, err 222 } 223 } 224 225 // walk through grpColnames 226 for _, cname := range grpReq.GroupByColumns { 227 ok := toputils.SearchStr(cname, str.treeMeta.groupByKeys) 228 if !ok { 229 return false, nil 230 } 231 } 232 233 // walk through measure colname 234 for _, m := range grpReq.MeasureOperations { 235 if m.MeasureCol == "*" && m.MeasureFunc == utils.Count { 236 continue // we treat count(*) as just as a bucket count 237 } 238 found := false 239 for _, treeMCname := range str.treeMeta.measureColNames { 240 if m.MeasureCol == treeMCname { 241 found = true 242 break 243 } 244 } 245 if !found { 246 return false, nil 247 } 248 } 249 return true, nil 250 } 251 252 func (str *AgileTreeReader) decodeMetadata(buf []byte) (*StarTreeMetadata, error) { 253 254 tmeta := StarTreeMetadata{} 255 256 idx := uint32(0) 257 258 // Len of groupByKeys 259 tmeta.numGroupByCols = toputils.BytesToUint16LittleEndian(buf[idx : idx+2]) 260 idx += 2 261 262 tmeta.groupByKeys = make([]string, tmeta.numGroupByCols) 263 for i := uint16(0); i < tmeta.numGroupByCols; i++ { 264 // grp str len 265 l1 := toputils.BytesToUint16LittleEndian(buf[idx : idx+2]) 266 idx += 2 267 268 // grp actual str 269 tmeta.groupByKeys[i] = string(buf[idx : idx+uint32(l1)]) 270 idx += uint32(l1) 271 } 272 273 // Len of MeasureColNames 274 lenMcolNames := toputils.BytesToUint16LittleEndian(buf[idx : idx+2]) 275 idx += 2 276 277 tmeta.measureColNames = make([]string, lenMcolNames) 278 279 for i := uint16(0); i < lenMcolNames; i++ { 280 // Mcol Len 281 l1 := toputils.BytesToUint16LittleEndian(buf[idx : idx+2]) 282 idx += 2 283 284 // Mcol strname 285 tmeta.measureColNames[i] = string(buf[idx : idx+uint32(l1)]) 286 idx += uint32(l1) 287 } 288 tmeta.allDictEncodings = make(map[string]map[uint32][]byte, tmeta.numGroupByCols) 289 290 var soff, eoff uint32 291 for j := uint16(0); j < tmeta.numGroupByCols; j++ { 292 293 // colname strlen 294 l1 := toputils.BytesToUint16LittleEndian(buf[idx : idx+2]) 295 idx += 2 296 297 // colname str : we only store the offsets to save on string copy 298 soff = idx 299 idx += uint32(l1) 300 eoff = idx 301 302 // numKeys 303 numDictEncodings := toputils.BytesToUint32LittleEndian(buf[idx : idx+4]) 304 idx += 4 305 306 if numDictEncodings == 0 { 307 log.Errorf("decodeMetadata: numDictEncodings was 0 for cname: %v", string(buf[soff:eoff])) 308 continue 309 } 310 311 dictEncoding := make(map[uint32][]byte, numDictEncodings) 312 313 for i := uint32(0); i < numDictEncodings; i += 1 { 314 // enc col val strlen 315 l1 := toputils.BytesToUint16LittleEndian(buf[idx : idx+2]) 316 idx += 2 317 318 // enc col val str 319 dictEncoding[i] = buf[idx : idx+uint32(l1)] 320 idx += uint32(l1) 321 } 322 tmeta.allDictEncodings[string(buf[soff:eoff])] = dictEncoding 323 324 } 325 326 return &tmeta, nil 327 } 328 329 // returns the level that the column name will exist in tree. 330 // This assumes that level -1 is the root node 331 func (str *AgileTreeReader) getLevelForColumn(colName string) (int, error) { 332 for idx, name := range str.treeMeta.groupByKeys { 333 if name == colName { 334 return idx + 1, nil // root is at level 0 so add 1 335 } 336 } 337 return 0, fmt.Errorf("column %+v not found in tree", colName) 338 } 339 340 func (str *AgileTreeReader) getRawVal(key uint32, dictEncoding map[uint32][]byte) ([]byte, error) { 341 rawVal, ok := dictEncoding[key] 342 if !ok { 343 return []byte{}, fmt.Errorf("failed to find raw value for idx %+v which has %+v keys", key, len(dictEncoding)) 344 } 345 return rawVal, nil 346 } 347 348 func (str *AgileTreeReader) decodeNodeDetailsJit(buf []byte, numAggValues int, 349 desiredLevel uint16, combiner map[string][]utils.NumTypeEnclosure, 350 measResIndices []int, lenMri int, grpTreeLevels []uint16, grpColNames []string) error { 351 352 var wvInt64 int64 353 var wvFloat64 float64 354 idx := uint32(0) 355 356 // level 357 curLevel := toputils.BytesToUint16LittleEndian(buf[idx : idx+2]) 358 idx += 2 359 360 // numNodes at this level 361 numNodes := toputils.BytesToUint32LittleEndian(buf[idx : idx+4]) 362 idx += 4 363 364 if curLevel != desiredLevel { 365 log.Errorf("decodeNodeDetailsJit wanted level: %v, but read level: %v", desiredLevel, curLevel) 366 return fmt.Errorf("decodeNodeDetailsJit wanted level: %v, but read level: %v", desiredLevel, curLevel) 367 } 368 369 usedDictEncodings := make([]map[uint32][]byte, len(grpTreeLevels)) 370 for i, grpCol := range grpColNames { 371 usedDictEncodings[i] = str.treeMeta.allDictEncodings[grpCol] 372 } 373 374 // Allocate all the memory we need for the group by keys upfront to avoid 375 // many small allocations. This also allows us to convert a byte slice to 376 // a string without copying; this uses the unsafe package, but we never 377 // change that region of the byte slice, so it's safe. 378 wvBuf := make([]byte, len(grpTreeLevels)*4*int(numNodes)) 379 wvIdx := uint32(0) 380 381 newBuckets := 0 382 383 for i := uint32(0); i < numNodes; i++ { 384 // get mapkey 385 386 myKey := buf[idx : idx+4] 387 idx += 4 388 389 kidx := uint32(0) 390 for _, grpLev := range grpTreeLevels { 391 if grpLev == desiredLevel { 392 copy(wvBuf[wvIdx+kidx:], myKey) 393 } else { 394 // The next four bytes of buf is the parent's node key, the 395 // next four after that is the grandparent's node key, etc. 396 ancestorLevel := desiredLevel - grpLev 397 offset := uint32(ancestorLevel-1) * 4 398 copy(wvBuf[wvIdx+kidx:], buf[idx+offset:idx+offset+4]) 399 } 400 kidx += 4 401 } 402 wvNodeKey := toputils.UnsafeByteSliceToString(wvBuf[wvIdx : wvIdx+kidx]) 403 wvIdx += kidx 404 idx += uint32(desiredLevel-1) * 4 405 406 aggVal, ok := combiner[wvNodeKey] 407 if !ok { 408 // Check if we hit the bucket limit. bucketLimit == 0 is a special 409 // case and means there is no limit. 410 if str.buckets.bucketLimit > 0 { 411 rawVal, _ := str.decodeRawValBytes(wvNodeKey, usedDictEncodings, grpColNames) 412 _, existingBucket := str.buckets.rawVals[rawVal] 413 if !existingBucket { 414 if uint64(len(str.buckets.rawVals))+uint64(newBuckets) >= str.buckets.bucketLimit { 415 // We've reached the bucket limit, so we shouldn't add another. 416 // However, we need to continue reading the AgileTree because 417 // we might reach another node that has data for a bucket we've 418 // already added. 419 idx += uint32(numAggValues) * 9 420 continue 421 } else { 422 newBuckets += 1 423 } 424 } 425 } 426 427 aggVal = make([]utils.NumTypeEnclosure, lenMri) 428 combiner[wvNodeKey] = aggVal 429 } 430 431 if aggVal == nil { 432 aggVal = make([]utils.NumTypeEnclosure, lenMri) 433 } 434 435 for j := 0; j < lenMri; j++ { 436 agIdx := idx // set to the start of aggValue for this node's data 437 agIdx += uint32(measResIndices[j]) * 9 // jump to the AgValue for this meas's index 438 439 dtype := utils.SS_DTYPE(buf[agIdx]) 440 agIdx += 1 441 442 switch dtype { 443 case utils.SS_DT_UNSIGNED_NUM, utils.SS_DT_SIGNED_NUM: 444 wvInt64 = toputils.BytesToInt64LittleEndian(buf[agIdx : agIdx+8]) 445 case utils.SS_DT_FLOAT: 446 wvFloat64 = toputils.BytesToFloat64LittleEndian(buf[agIdx : agIdx+8]) 447 case utils.SS_DT_BACKFILL: 448 default: 449 return fmt.Errorf("decodeNodeDetailsJit: unsupported Dtype: %v", dtype) 450 } 451 452 // remainder will give us MeasFnIdx 453 fn := writer.IdxToAgFn[measResIndices[j]%writer.TotalMeasFns] 454 err := aggVal[j].ReduceFast(dtype, wvInt64, wvFloat64, fn) 455 if err != nil { 456 log.Errorf("decodeNodeDetailsJit: Failed to reduce aggregation for err: %v", err) 457 } 458 } 459 idx += uint32(numAggValues) * 9 460 } 461 462 return nil 463 } 464 465 // applies groupby results and returns requested measure operations 466 // first applies the first groupby column. For all returned nodes, apply second & so on until no more groupby exists 467 func (str *AgileTreeReader) ApplyGroupByJit(grpColNames []string, 468 internalMops []*structs.MeasureAggregator, blkResults *blockresults.BlockResults, 469 qid uint64, agileTreeBuf []byte) error { 470 471 // make sure meta is loaded 472 _ = str.ReadTreeMeta() 473 474 var maxGrpLevel uint16 475 grpTreeLevels := make([]uint16, len(grpColNames)) 476 for i, grpByCol := range grpColNames { 477 level, err := str.getLevelForColumn(grpByCol) 478 if err != nil { 479 log.Errorf("qid=%v, ApplyGroupByJit: failed to get level in tree for column %s: %v", qid, 480 grpByCol, err) 481 return err 482 } 483 maxGrpLevel = utils.MaxUint16(maxGrpLevel, uint16(level)) 484 grpTreeLevels[i] = uint16(level) 485 } 486 487 measResIndices := make([]int, 0) 488 489 // Always retrieve count. 490 // If count is asked we return count twice, but thats a small price to pay for simpler code 491 measResIndices = append(measResIndices, writer.MeasFnCountIdx) 492 493 for _, mops := range internalMops { 494 found := false 495 tcidx := 0 // var for tree's column name index 496 for i, treeMCname := range str.treeMeta.measureColNames { 497 if mops.MeasureCol == treeMCname { 498 found = true 499 tcidx = i 500 break 501 } 502 } 503 if !found { 504 log.Errorf("qid=%v, ApplyGroupByJit: Tree could not find mcol: %v", qid, mops.MeasureCol) 505 return fmt.Errorf("qid=%v, ApplyGroupByJit: Tree could not find mcol: %v", 506 qid, mops.MeasureCol) 507 } 508 fnidx := writer.AgFnToIdx(mops.MeasureFunc) // What MeasFn idx this translates to 509 measResIndices = append(measResIndices, tcidx*writer.TotalMeasFns+fnidx) // see where it is in agileTree 510 } 511 512 combiner := make(map[string][]utils.NumTypeEnclosure) 513 514 err := str.computeAggsJit(combiner, maxGrpLevel, measResIndices, agileTreeBuf, 515 grpTreeLevels, grpColNames) 516 if err != nil { 517 log.Errorf("qid=%v, ApplyGroupByJit: failed to apply aggs-jit: %v", qid, err) 518 return err 519 } 520 521 usedDictEncodings := make([]map[uint32][]byte, len(grpColNames)) 522 for i, grpCol := range grpColNames { 523 usedDictEncodings[i] = str.treeMeta.allDictEncodings[grpCol] 524 } 525 526 for mkey, ntAgvals := range combiner { 527 if len(ntAgvals) == 0 { 528 continue 529 } 530 rawVal, err := str.decodeRawValBytes(mkey, usedDictEncodings, grpColNames) 531 if err != nil { 532 log.Errorf("qid=%v, ApplyGroupByJit: Failed to get raw value for a agileTree key! %+v", qid, err) 533 return err 534 } 535 536 if str.buckets.saveBuckets { 537 if str.buckets.rawVals == nil { 538 str.buckets.rawVals = make(map[string]struct{}) 539 } 540 541 str.buckets.rawVals[rawVal] = struct{}{} 542 } 543 544 cvaggvalues := make([]utils.CValueEnclosure, len(internalMops)) 545 resCvIdx := 0 546 var colCntVal uint64 547 extVal := utils.CValueEnclosure{} 548 for i := 0; i < len(ntAgvals); i++ { 549 switch ntAgvals[i].Ntype { 550 case utils.SS_DT_SIGNED_NUM, utils.SS_DT_UNSIGNED_NUM: 551 extVal.Dtype = utils.SS_DT_SIGNED_NUM 552 extVal.CVal = ntAgvals[i].IntgrVal 553 case utils.SS_DT_FLOAT: 554 extVal.Dtype = utils.SS_DT_FLOAT 555 extVal.CVal = ntAgvals[i].FloatVal 556 } 557 // todo count is stored multiple times in the nodeAggvalue (per measCol), store only once 558 if i == 0 { // count is always at index 0 559 colCntVal = uint64(extVal.CVal.(int64)) 560 } else { 561 cvaggvalues[resCvIdx] = extVal 562 resCvIdx++ 563 } 564 } 565 blkResults.AddMeasureResultsToKeyAgileTree(string(rawVal), cvaggvalues, qid, colCntVal) 566 } 567 568 return nil 569 } 570 571 func (str *AgileTreeReader) computeAggsJit(combiner map[string][]utils.NumTypeEnclosure, 572 desiredLevel uint16, measResIndices []int, agileTreeBuf []byte, grpTreeLevels []uint16, 573 grpColNames []string) error { 574 575 numAggValues := len(str.treeMeta.measureColNames) * writer.TotalMeasFns 576 577 fName := str.segKey + ".strl" 578 fd, err := os.OpenFile(fName, os.O_RDONLY, 0644) 579 if err != nil { 580 log.Infof("computeAggsJit: failed to open STRLev %v Error: %v.", 581 fName, err) 582 return err 583 } 584 str.levDataFd = fd 585 586 myLevsOff := str.treeMeta.levsOffsets[desiredLevel] 587 myLevsSize := int64(str.treeMeta.levsSizes[desiredLevel]) 588 589 sizeToAdd := myLevsSize - int64(len(agileTreeBuf)) 590 if sizeToAdd > 0 { 591 newArr := make([]byte, sizeToAdd) 592 agileTreeBuf = append(agileTreeBuf, newArr...) 593 } 594 595 _, err = str.levDataFd.ReadAt(agileTreeBuf[:myLevsSize], myLevsOff) 596 if err != nil { 597 log.Errorf("computeAggsJit read file error: %+v", err) 598 return err 599 } 600 601 // assumes root is at level -1 602 err = str.decodeNodeDetailsJit(agileTreeBuf[0:myLevsSize], numAggValues, desiredLevel, 603 combiner, measResIndices, len(measResIndices), grpTreeLevels, grpColNames) 604 return err 605 } 606 607 func (str *AgileTreeReader) decodeRawValBytes(mkey string, usedGrpDictEncodings []map[uint32][]byte, 608 grpColNames []string) (string, error) { 609 610 // Estimate how much space we need for the string builder to avoid 611 // reallocations. An int or float groupby column will take 9 bytes, and 612 // a string groupby column could take more or less space. 613 var sb strings.Builder 614 sb.Grow(len(usedGrpDictEncodings) * 16) 615 616 buf := []byte(mkey) 617 idx := uint32(0) 618 for i, dictEncoding := range usedGrpDictEncodings { 619 nk := toputils.BytesToUint32LittleEndian(buf[idx : idx+4]) 620 idx += 4 621 622 cname := grpColNames[i] 623 rawVal, err := str.getRawVal(nk, dictEncoding) 624 if err != nil { 625 log.Errorf("decodeRawValBytes: Failed to get raw value for nk:%v, came: %v, err: %+v", 626 nk, cname, err) 627 return "", err 628 } 629 sb.Write(rawVal) 630 } 631 return sb.String(), nil 632 }