// Source: github.com/siglens/siglens@v0.0.0-20240328180423-f7ce9ae441ed/pkg/segment/structs/segsearchstructs.go

     1  /*
     2  Copyright 2023.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package structs
    18  
    19  import (
    20  	"bytes"
    21  	"errors"
    22  	"fmt"
    23  	"regexp"
    24  	"strings"
    25  
    26  	"github.com/bits-and-blooms/bloom/v3"
    27  	dtu "github.com/siglens/siglens/pkg/common/dtypeutils"
    28  	"github.com/siglens/siglens/pkg/config"
    29  	"github.com/siglens/siglens/pkg/segment/utils"
    30  	. "github.com/siglens/siglens/pkg/segment/utils"
    31  	log "github.com/sirupsen/logrus"
    32  )
    33  
// SearchQueryType classifies a SearchQuery so that the search layer can pick
// the matching strategy. The value is derived from either a MatchFilter or a
// SearchExpression when the query is built.
type SearchQueryType uint8

const (
	MatchAll                   SearchQueryType = iota // match all data
	MatchWords                                        // match words in a single column
	MatchWordsAllColumns                              // match words in any column
	SimpleExpression                                  // simple expression has one column name, an operator and a value to compare
	RegexExpression                                   // regex expression has one column and a regex string column value
	RegexExpressionAllColumns                         // apply RegexExpression on all columns
	SimpleExpressionAllColumns                        // apply SimpleExpression on all columns
	ComplexExpression                                 // complex expression relates multiple columns
	MatchDictArraySingleColumn                        // MATCH_DICT_ARRAY match filter on a single named column
	MatchDictArrayAllColumns                          // MATCH_DICT_ARRAY match filter on the "*" column
)
    48  
    49  type SegType uint8
    50  
    51  const (
    52  	UNKNOWN SegType = iota
    53  	RAW_SEARCH
    54  	PQS
    55  	UNROTATED_PQS
    56  	UNROTATED_RAW_SEARCH
    57  	SEGMENT_STATS_SEARCH
    58  	UNROTATED_SEGMENT_STATS_SEARCH
    59  	METRICS_SEARCH
    60  	UNROTATED_METRICS_SEARCH
    61  )
    62  
    63  func (s SegType) String() string {
    64  	switch s {
    65  	case RAW_SEARCH:
    66  		return "RAW_SEARCH"
    67  	case PQS:
    68  		return "PQS"
    69  	case UNROTATED_PQS:
    70  		return "UNROTATED_PQS"
    71  	case UNROTATED_RAW_SEARCH:
    72  		return "UNROTATED_RAW_SEARCH"
    73  	case SEGMENT_STATS_SEARCH:
    74  		return "SEGMENT_STATS_SEARCH"
    75  	case UNROTATED_SEGMENT_STATS_SEARCH:
    76  		return "UNROTATED_SEGMENT_STATS_SEARCH"
    77  	case METRICS_SEARCH:
    78  		return "METRICS_SEARCH"
    79  	case UNROTATED_METRICS_SEARCH:
    80  		return "UNROTATED_METRICS_SEARCH"
    81  	default:
    82  		return "UNKNOWN"
    83  	}
    84  }
    85  
// SearchExpressionInput is a flattened expression input used for searching;
// it is one side of a SearchExpression.
// getSearchInputFromFilterInput sets at most one of the three fields.
// TODO: flatten SearchExpressionInput with just []byte input
type SearchExpressionInput struct {
	ColumnName      string          // columnName to search for
	ComplexRelation *Expression     // complex relations that have columns defined in both sides
	ColumnValue     *DtypeEnclosure // column value: "0", "abc", "abcd*", "0.213"
}
    93  
// SearchExpression is a flattened expression used for searching.
// LeftSearchInput will always be defined, RightSearchInput may not be,
// depending on FilterOp.
type SearchExpression struct {
	LeftSearchInput  *SearchExpressionInput // left-hand side of the expression
	FilterOp         FilterOperator         // operator relating the left and right inputs
	RightSearchInput *SearchExpressionInput // right-hand side of the expression; may be nil
	SearchInfo       *SearchInfo            // NOTE(review): not set by the constructors visible in this file — presumably filled in later by the search layer; confirm
}
   102  
// SearchInfo is extra data attached to a SearchExpression through its
// SearchInfo field.
// NOTE(review): nothing in this file reads or writes these fields; the
// per-field notes are inferred from the names only — confirm against callers.
type SearchInfo struct {
	ColEncoding []byte          // presumably the encoding of the column being searched — TODO confirm
	QValDte     *DtypeEnclosure // presumably the query value as a DtypeEnclosure — TODO confirm
}
   107  
// SearchMetadataHolder holds the block summaries of a segment together with
// the memory estimate consumed by SegmentSearchRequest.GetMaxSearchMemorySize.
type SearchMetadataHolder struct {
	BlockSummaries     []*BlockSummary // indexed by block number; released by CleanSearchMetadata
	BlockSummariesFile string          // presumably the file the block summaries come from — TODO confirm
	SearchTotalMemory  uint64          // total memory that this search would take, BlockSummaries + raw search buffers
}
   113  
// BlockMetadataHolder carries what is needed to read a single block: its
// number and two per-column maps keyed by column name.
type BlockMetadataHolder struct {
	BlkNum            uint16            // block number
	ColumnBlockOffset map[string]int64  // column name -> offset; presumably the block's byte offset in that column's file — TODO confirm
	ColumnBlockLen    map[string]uint32 // column name -> length; presumably the block's byte length in that column's file — TODO confirm
}
   119  
// SegmentSearchRequest is a struct for raw search to apply search on specific
// blocks within a file. Requests for the same segment can be combined with
// JoinRequest / JoinColumnInfo.
type SegmentSearchRequest struct {
	SegmentKey         string                          // key identifying the segment this request targets
	SearchMetadata     *SearchMetadataHolder           // block summaries and memory estimate; may be nil
	AllBlocksToSearch  map[uint16]*BlockMetadataHolder // maps all blocks needed to search to the BlockMetadataHolder needed to read
	VirtualTableName   string                          // presumably the virtual table (index) the segment belongs to — TODO confirm
	AllPossibleColumns map[string]bool                 // all possible columns for the segKey
	LatestEpochMS      uint64                          // latest epoch time - used for query planning
	SType              SegType                         // kind of search this request represents
	CmiPassedCnames    map[uint16]map[string]bool      // maps blkNum -> colName -> true that have passed the cmi check
	HasMatchedRrc      bool                            // flag to denote matches, so that we decide whether to send a websocket update
}
   132  
// CmiContainer is a holder struct for holding a cmi for a single block.
// Based on CmiType, either Bf or Ranges will be defined.
type CmiContainer struct {
	CmiType uint8               // selects which of Bf / Ranges is populated; the valid values are defined outside this file
	Loaded  bool                // presumably true once the cmi has been read into memory — TODO confirm
	Bf      *bloom.BloomFilter  // bloom filter form of the cmi
	Ranges  map[string]*Numbers // range form of the cmi, keyed by string (presumably column name — TODO confirm)
}
   140  
   141  // even if only one block will be searched and parallelism=10, we will spawn 10 buffers, although 9 wont be used
   142  // TODO: more accurate block summaries and colmeta sizing
   143  func (ssr *SegmentSearchRequest) GetMaxSearchMemorySize(sNode *SearchNode, parallelismPerFile int64, bitsetMinSize uint16) uint64 {
   144  
   145  	// bitset size worst case is min(15000*num blocks, total record count)
   146  	var totalBits uint64
   147  	for i := 0; i < len(ssr.SearchMetadata.BlockSummaries); i++ {
   148  		if _, ok := ssr.AllBlocksToSearch[uint16(i)]; !ok {
   149  			continue
   150  		}
   151  		if ssr.SearchMetadata.BlockSummaries[i].RecCount > bitsetMinSize {
   152  			totalBits += uint64(ssr.SearchMetadata.BlockSummaries[i].RecCount)
   153  		} else {
   154  			totalBits += uint64(bitsetMinSize)
   155  		}
   156  	}
   157  	totalSize := uint64(totalBits / 8)
   158  
   159  	// for raw search & aggs its hard to calculate as memory for multi readers comes from a pool,
   160  	// hence we assume that there will be enough memory in the pool & in the buffer
   161  	if ssr.SearchMetadata == nil {
   162  		return uint64(totalSize)
   163  	}
   164  
   165  	totalSize += ssr.SearchMetadata.SearchTotalMemory
   166  	return totalSize
   167  }
   168  
   169  // function used to nil out block sum and colmeta
   170  func (ssr *SegmentSearchRequest) CleanSearchMetadata() {
   171  	if ssr.SearchMetadata == nil {
   172  		return
   173  	}
   174  	ssr.SearchMetadata.BlockSummaries = nil
   175  }
   176  
   177  /*
   178  *
   179  
   180  	Logical operator only dictates how the block numbers should be resolved
   181  
   182  	the CMIPassed names will always be unioned.
   183  
   184  *
   185  */
   186  func (ssr *SegmentSearchRequest) JoinRequest(toJoin *SegmentSearchRequest, op LogicalOperator) {
   187  	// merge blocksearch info
   188  	if op == And {
   189  		for blockNum := range ssr.AllBlocksToSearch {
   190  			if _, ok := toJoin.AllBlocksToSearch[blockNum]; !ok {
   191  				delete(ssr.AllBlocksToSearch, blockNum)
   192  				delete(ssr.CmiPassedCnames, blockNum)
   193  				continue
   194  			}
   195  			for cname := range toJoin.CmiPassedCnames[blockNum] {
   196  				ssr.CmiPassedCnames[blockNum][cname] = true
   197  			}
   198  		}
   199  	} else {
   200  		for blockNum, blockMeta := range toJoin.AllBlocksToSearch {
   201  			ssr.AllBlocksToSearch[blockNum] = blockMeta
   202  			if _, ok := ssr.CmiPassedCnames[blockNum]; !ok {
   203  				ssr.CmiPassedCnames[blockNum] = make(map[string]bool)
   204  			}
   205  
   206  			for cname := range toJoin.CmiPassedCnames[blockNum] {
   207  				ssr.CmiPassedCnames[blockNum][cname] = true
   208  			}
   209  		}
   210  	}
   211  	// merge columns
   212  	ssr.JoinColumnInfo(toJoin)
   213  }
   214  
   215  // merges toJoin.SearchColumns with ssr.SearchColumns
   216  func (ssr *SegmentSearchRequest) JoinColumnInfo(toJoin *SegmentSearchRequest) {
   217  	// merge columns
   218  	for col := range toJoin.AllPossibleColumns {
   219  		ssr.AllPossibleColumns[col] = true
   220  	}
   221  }
   222  
   223  func (searchExp *SearchExpression) IsMatchAll() bool {
   224  
   225  	if searchExp.FilterOp != Equals {
   226  		return false
   227  	}
   228  	if searchExp.LeftSearchInput == nil || searchExp.RightSearchInput == nil {
   229  		return false // both left and right need to be defined
   230  	}
   231  
   232  	var colName string
   233  	var colValue *DtypeEnclosure
   234  	if len(searchExp.LeftSearchInput.ColumnName) > 0 {
   235  		colName = searchExp.LeftSearchInput.ColumnName
   236  	} else {
   237  		colName = searchExp.RightSearchInput.ColumnName
   238  	}
   239  
   240  	if searchExp.LeftSearchInput.ColumnValue != nil {
   241  		colValue = searchExp.LeftSearchInput.ColumnValue
   242  	} else if searchExp.RightSearchInput != nil && searchExp.RightSearchInput.ColumnValue != nil {
   243  		colValue = searchExp.RightSearchInput.ColumnValue
   244  	}
   245  	if colValue == nil {
   246  		return false
   247  	}
   248  
   249  	return colName == "*" && colValue.IsFullWildcard()
   250  }
   251  
   252  func (searchExp *SearchExpression) GetExpressionType() SearchQueryType {
   253  	if searchExp.LeftSearchInput.ComplexRelation != nil {
   254  		return ComplexExpression
   255  	}
   256  	if searchExp.RightSearchInput != nil && searchExp.RightSearchInput.ComplexRelation != nil {
   257  		return ComplexExpression
   258  	}
   259  	// at this point, all expressions are some kind of expression
   260  	var colName string
   261  	var colVal *DtypeEnclosure
   262  	if len(searchExp.LeftSearchInput.ColumnName) > 0 {
   263  		colName = searchExp.LeftSearchInput.ColumnName
   264  	} else {
   265  		colName = searchExp.RightSearchInput.ColumnName
   266  	}
   267  	if searchExp.LeftSearchInput.ColumnValue != nil {
   268  		colVal = searchExp.LeftSearchInput.ColumnValue
   269  	} else {
   270  		colVal = searchExp.RightSearchInput.ColumnValue
   271  	}
   272  	wildcardColName := colName == "*"
   273  	if colVal == nil {
   274  		if wildcardColName {
   275  			return RegexExpression
   276  		}
   277  		return SimpleExpression
   278  	}
   279  	regexCol := colVal.IsRegex()
   280  	if wildcardColName {
   281  		if regexCol {
   282  			return RegexExpressionAllColumns
   283  		} else {
   284  			return SimpleExpressionAllColumns
   285  		}
   286  	}
   287  	if regexCol {
   288  		return RegexExpression
   289  	} else {
   290  		return SimpleExpression
   291  	}
   292  }
   293  
   294  // parse a FilterInput to a friendly SearchInput for raw searching/expression matching
   295  func getSearchInputFromFilterInput(filter *FilterInput, qid uint64) *SearchExpressionInput {
   296  
   297  	searchInput := SearchExpressionInput{}
   298  
   299  	if filter == nil {
   300  		return &searchInput
   301  	}
   302  
   303  	if len(filter.SubtreeResult) > 0 { // if filterSubtree is defined, only literal in search input
   304  		val, err := CreateDtypeEnclosure(filter.SubtreeResult, qid)
   305  		if err != nil {
   306  			// TODO: handle error
   307  			log.Errorf("qid=%d, getSearchInputFromFilterInput: Error creating dtype enclosure: %v", qid, err)
   308  		}
   309  		searchInput.ColumnValue = val
   310  		return &searchInput
   311  	}
   312  
   313  	if filter.Expression.RightInput == nil { // rightInput is nil, meaning only left expressionInput is defined and only has columnName or
   314  		expInput := filter.Expression.LeftInput
   315  
   316  		if len(expInput.ColumnName) > 0 {
   317  			searchInput.ColumnName = expInput.ColumnName
   318  		} else {
   319  			searchInput.ColumnValue = expInput.ColumnValue
   320  		}
   321  	} else {
   322  		searchInput.ComplexRelation = filter.Expression
   323  	}
   324  
   325  	return &searchInput
   326  }
   327  
   328  func GetSearchQueryFromFilterCriteria(criteria *FilterCriteria, qid uint64) *SearchQuery {
   329  
   330  	if criteria.MatchFilter != nil {
   331  		return extractSearchQueryFromMatchFilter(criteria.MatchFilter)
   332  	} else {
   333  		sq := extractSearchQueryFromExpressionFilter(criteria.ExpressionFilter, qid)
   334  
   335  		var colVal *DtypeEnclosure
   336  		if sq.ExpressionFilter.LeftSearchInput.ColumnValue != nil {
   337  			colVal = sq.ExpressionFilter.LeftSearchInput.ColumnValue
   338  		} else if sq.ExpressionFilter.RightSearchInput.ColumnValue != nil {
   339  			colVal = sq.ExpressionFilter.RightSearchInput.ColumnValue
   340  		}
   341  
   342  		if colVal != nil && colVal.Dtype == SS_DT_STRING && colVal.StringVal == "*" {
   343  			sq.SearchType = MatchAll
   344  		}
   345  		return sq
   346  	}
   347  }
   348  
   349  func extractSearchQueryFromMatchFilter(match *MatchFilter) *SearchQuery {
   350  	var qType SearchQueryType
   351  	currQuery := &SearchQuery{
   352  		MatchFilter: match,
   353  	}
   354  	if match.MatchType == MATCH_DICT_ARRAY {
   355  		if match.MatchColumn == "*" {
   356  			qType = MatchDictArrayAllColumns
   357  		} else {
   358  			qType = MatchDictArraySingleColumn
   359  		}
   360  		currQuery.SearchType = qType
   361  	} else if match.MatchColumn == "*" {
   362  		qType = MatchWordsAllColumns
   363  		if match.MatchOperator == And {
   364  			if len(match.MatchWords) == 1 && bytes.Equal(match.MatchWords[0], STAR_BYTE) {
   365  				qType = MatchAll
   366  			}
   367  		} else if match.MatchOperator == Or {
   368  			for _, word := range match.MatchWords {
   369  				if bytes.Equal(word, STAR_BYTE) {
   370  					qType = MatchAll
   371  					break
   372  				}
   373  			}
   374  		}
   375  		currQuery.SearchType = qType
   376  	} else {
   377  		currQuery.SearchType = MatchWords
   378  	}
   379  	if match.MatchPhrase != nil && bytes.Contains(match.MatchPhrase, []byte("*")) {
   380  		cval := dtu.ReplaceWildcardStarWithRegex(string(match.MatchPhrase))
   381  		rexpC, err := regexp.Compile(cval)
   382  		if err != nil {
   383  			log.Errorf("extractSearchQueryFromMatchFilter: regexp compile failed, err=%v", err)
   384  		} else {
   385  			currQuery.MatchFilter.Regexp = rexpC
   386  		}
   387  	}
   388  
   389  	return currQuery
   390  }
   391  
   392  func extractSearchQueryFromExpressionFilter(exp *ExpressionFilter, qid uint64) *SearchQuery {
   393  	leftSearchInput := getSearchInputFromFilterInput(exp.LeftInput, qid)
   394  	rightSearchInput := getSearchInputFromFilterInput(exp.RightInput, qid)
   395  	sq := &SearchQuery{
   396  		ExpressionFilter: &SearchExpression{
   397  			LeftSearchInput:  leftSearchInput,
   398  			FilterOp:         exp.FilterOperator,
   399  			RightSearchInput: rightSearchInput,
   400  		},
   401  	}
   402  	expType := getSearchTypeFromSearchExpression(sq.ExpressionFilter)
   403  	sq.SearchType = expType
   404  
   405  	if sq.SearchType == RegexExpression || sq.SearchType == RegexExpressionAllColumns {
   406  		if sq.ExpressionFilter.LeftSearchInput.ColumnValue != nil &&
   407  			sq.ExpressionFilter.LeftSearchInput.ColumnValue.Dtype == SS_DT_STRING {
   408  
   409  			cval := dtu.ReplaceWildcardStarWithRegex(sq.ExpressionFilter.LeftSearchInput.ColumnValue.StringVal)
   410  			rexpC, err := regexp.Compile(cval)
   411  			if err != nil {
   412  				log.Errorf("extractSearchQueryFromExpressionFilter: regexp compile failed, err=%v", err)
   413  			} else {
   414  				sq.ExpressionFilter.LeftSearchInput.ColumnValue.SetRegexp(rexpC)
   415  			}
   416  		}
   417  	}
   418  	return sq
   419  }
   420  
   421  func getSearchTypeFromSearchExpression(searchExp *SearchExpression) SearchQueryType {
   422  
   423  	if searchExp.IsMatchAll() {
   424  		return MatchAll
   425  	}
   426  	return searchExp.GetExpressionType()
   427  }
   428  
   429  // extract all columns from SearchInput
   430  // ex: SearchExpressionInput{columnName="abc"} -> abc
   431  // ex: SearchExpressionInput{complexRelation={literal=2,op=mult,columnName="def"}} -> "def"
   432  func (search *SearchExpressionInput) getAllColumnsInSearch() map[string]string {
   433  
   434  	allColumns := make(map[string]string)
   435  
   436  	if len(search.ColumnName) > 0 {
   437  		allColumns[string(search.ColumnName)] = ""
   438  	}
   439  
   440  	if search.ComplexRelation != nil {
   441  		exp := search.ComplexRelation
   442  		if exp.LeftInput != nil && len(exp.LeftInput.ColumnName) > 0 {
   443  			allColumns[exp.LeftInput.ColumnName] = ""
   444  		}
   445  
   446  		if exp.RightInput != nil && len(exp.RightInput.ColumnName) > 0 {
   447  			allColumns[exp.RightInput.ColumnName] = ""
   448  		}
   449  	}
   450  	return allColumns
   451  }
   452  
   453  func (searchExp *SearchExpression) getAllColumnsInSearch() map[string]string {
   454  
   455  	allColumns := searchExp.LeftSearchInput.getAllColumnsInSearch()
   456  
   457  	if searchExp.RightSearchInput != nil {
   458  		rightColumns := searchExp.RightSearchInput.getAllColumnsInSearch()
   459  
   460  		for key, val := range rightColumns {
   461  			allColumns[key] = val
   462  		}
   463  	}
   464  
   465  	return allColumns
   466  }
   467  
   468  // returns a map with keys,  a boolean, and error
   469  // the map will contain only non wildcarded keys,
   470  // if bool is true, the searchExpression contained a wildcard
   471  func (searchExp *SearchExpression) GetAllBlockBloomKeysToSearch() (map[string]bool, bool, error) {
   472  	if searchExp.FilterOp != Equals {
   473  		return nil, false, errors.New("relation is not simple key1:value1")
   474  	}
   475  	if searchExp.LeftSearchInput != nil && searchExp.LeftSearchInput.ComplexRelation != nil {
   476  		// complex relations are not supported for blockbloom
   477  		return nil, false, errors.New("relation is not simple key1:value1")
   478  	}
   479  	if searchExp.RightSearchInput != nil && searchExp.RightSearchInput.ComplexRelation != nil {
   480  		return nil, false, errors.New("relation is not simple key1:value1")
   481  	}
   482  	allKeys := make(map[string]bool)
   483  	var colVal *DtypeEnclosure
   484  	if searchExp.LeftSearchInput != nil && searchExp.LeftSearchInput.ColumnValue != nil {
   485  		colVal = searchExp.LeftSearchInput.ColumnValue
   486  	} else if searchExp.RightSearchInput != nil && searchExp.RightSearchInput.ColumnValue != nil {
   487  		colVal = searchExp.RightSearchInput.ColumnValue
   488  	}
   489  
   490  	if colVal == nil {
   491  		return nil, false, errors.New("unable to extract column name and value from request")
   492  	}
   493  
   494  	if colVal.IsRegex() {
   495  		return allKeys, true, nil
   496  	}
   497  	if len(colVal.StringVal) == 0 {
   498  		return allKeys, false, errors.New("unable to extract column name and value from request")
   499  	}
   500  	allKeys[colVal.StringVal] = true
   501  	return allKeys, false, nil
   502  }
   503  
   504  func (match *MatchFilter) GetAllBlockBloomKeysToSearch() (map[string]bool, bool, LogicalOperator) {
   505  	allKeys := make(map[string]bool)
   506  	wildcardExists := false
   507  	if match.MatchType == MATCH_DICT_ARRAY {
   508  		mKey := match.MatchDictArray.MatchKey
   509  		mVal := match.MatchDictArray.MatchValue
   510  		var mValStr string
   511  		switch mVal.Dtype {
   512  		case utils.SS_DT_BOOL:
   513  			mValStr = fmt.Sprintf("%v", mVal.BoolVal)
   514  		case utils.SS_DT_STRING:
   515  			mValStr = fmt.Sprintf("%v", mVal.StringVal)
   516  		case utils.SS_DT_UNSIGNED_NUM:
   517  			mValStr = fmt.Sprintf("%v", mVal.UnsignedVal)
   518  		case utils.SS_DT_SIGNED_NUM:
   519  			mValStr = fmt.Sprintf("%v", mVal.SignedVal)
   520  		case utils.SS_DT_FLOAT:
   521  			mValStr = fmt.Sprintf("%v", mVal.FloatVal)
   522  		}
   523  
   524  		allKeys[string(mKey)] = true
   525  		allKeys[mValStr] = true
   526  		return allKeys, wildcardExists, And
   527  	} else {
   528  		for _, literal := range match.MatchWords {
   529  
   530  			if strings.Contains(string(literal), "*") {
   531  				wildcardExists = true
   532  				continue
   533  			}
   534  			allKeys[string(literal)] = true
   535  		}
   536  		// if only one matchWord then do And so that CMI logic will only pass blocks that pass
   537  		// bloom check
   538  		if len(allKeys) == 1 {
   539  			return allKeys, wildcardExists, And
   540  		}
   541  	}
   542  	return allKeys, wildcardExists, match.MatchOperator
   543  }
   544  
   545  func (ef *SearchExpression) IsTimeRangeFilter() bool {
   546  	if ef.IsMatchAll() {
   547  		return true
   548  	}
   549  	if ef.LeftSearchInput != nil && len(ef.LeftSearchInput.ColumnName) > 0 {
   550  		if ef.LeftSearchInput.ColumnName != config.GetTimeStampKey() {
   551  			return false
   552  		}
   553  	}
   554  	if ef.RightSearchInput != nil && len(ef.RightSearchInput.ColumnName) > 0 {
   555  		if ef.RightSearchInput.ColumnName != config.GetTimeStampKey() {
   556  			return false
   557  		}
   558  	}
   559  	return true
   560  }