github.com/matrixorigin/matrixone@v0.7.0/pkg/vm/engine/disttae/stats.go (about)

     1  // Copyright 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package disttae
    16  
    17  import (
    18  	"context"
    19  	"github.com/matrixorigin/matrixone/pkg/container/types"
    20  	"github.com/matrixorigin/matrixone/pkg/pb/plan"
    21  	plan2 "github.com/matrixorigin/matrixone/pkg/sql/plan"
    22  	"github.com/matrixorigin/matrixone/pkg/sql/util"
    23  	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/compute"
    24  	"github.com/matrixorigin/matrixone/pkg/vm/process"
    25  	"math"
    26  )
    27  
    28  func estimateOutCntBySortOrder(tableCnt, cost float64, sortOrder int) float64 {
    29  	if sortOrder == -1 {
    30  		return cost
    31  	}
    32  	// coefficient is 0.5 when tableCnt equals cost, and 1 when tableCnt >> cost
    33  	coefficient1 := math.Pow(0.5, cost/tableCnt)
    34  	// coefficient is 0.25 when tableCnt is small, and 1 when very large table.
    35  	coefficient2 := math.Pow(0.2, (1 / math.Log10(tableCnt)))
    36  
    37  	outCnt := cost * coefficient1 * coefficient2
    38  	if sortOrder == 0 {
    39  		return outCnt * 0.95
    40  	} else if sortOrder == 1 {
    41  		return outCnt * 0.75
    42  	} else if sortOrder == 2 {
    43  		return outCnt * 0.55
    44  	} else {
    45  		return outCnt * 0.35
    46  	}
    47  
    48  }
    49  
    50  func estimateOutCntForEquality(expr *plan.Expr, sortKeyName string, tableCnt, cost float64, ndvMap map[string]float64) float64 {
    51  	// only filter like func(col)>1 , or (col=1) or (col=2) can estimate outcnt
    52  	// and only 1 colRef is allowd in the filter. otherwise, no good method to calculate
    53  	ret, col := plan2.CheckFilter(expr)
    54  	if !ret {
    55  		return cost / 5
    56  	}
    57  	sortOrder := util.GetClusterByColumnOrder(sortKeyName, col.Name)
    58  	//if col is clusterby, we assume most of the rows in blocks we read is needed
    59  	//otherwise, deduce selectivity according to ndv
    60  	if sortOrder != -1 {
    61  		return estimateOutCntBySortOrder(tableCnt, cost, sortOrder)
    62  	} else {
    63  		if ndv, ok := ndvMap[col.Name]; ok {
    64  			return tableCnt / ndv
    65  		} else {
    66  			return tableCnt / 100
    67  		}
    68  	}
    69  }
    70  
    71  // estimate output lines for a filter
    72  func estimateOutCnt(expr *plan.Expr, sortKeyName string, tableCnt, cost float64, ndvMap map[string]float64) float64 {
    73  	if expr == nil {
    74  		return cost
    75  	}
    76  	var outcnt float64
    77  	switch exprImpl := expr.Expr.(type) {
    78  	case *plan.Expr_F:
    79  		funcName := exprImpl.F.Func.ObjName
    80  		switch funcName {
    81  		case "=":
    82  			outcnt = estimateOutCntForEquality(expr, sortKeyName, tableCnt, cost, ndvMap)
    83  		case ">", "<", ">=", "<=":
    84  			//for filters like a>1, no good way to estimate, return 3 * equality
    85  			outcnt = estimateOutCntForEquality(expr, sortKeyName, tableCnt, cost, ndvMap) * 3
    86  		case "and":
    87  			//get the smaller one of two children, and tune it down a little bit
    88  			out1 := estimateOutCnt(exprImpl.F.Args[0], sortKeyName, tableCnt, cost, ndvMap)
    89  			out2 := estimateOutCnt(exprImpl.F.Args[1], sortKeyName, tableCnt, cost, ndvMap)
    90  			outcnt = math.Min(out1, out2) * 0.8
    91  		case "or":
    92  			//get the bigger one of two children, and tune it up a little bit
    93  			out1 := estimateOutCnt(exprImpl.F.Args[0], sortKeyName, tableCnt, cost, ndvMap)
    94  			out2 := estimateOutCnt(exprImpl.F.Args[1], sortKeyName, tableCnt, cost, ndvMap)
    95  			outcnt = math.Max(out1, out2) * 1.5
    96  		default:
    97  			//no good way to estimate, just 0.1*cost
    98  			outcnt = cost * 0.1
    99  		}
   100  	}
   101  	if outcnt > cost {
   102  		//outcnt must be smaller than cost
   103  		return cost
   104  	}
   105  	return outcnt
   106  }
   107  
   108  func calcNdv(minVal, maxVal any, distinctValNum, blockNumTotal, tableCnt float64, t types.Type) float64 {
   109  	ndv1 := calcNdvUsingMinMax(minVal, maxVal, t)
   110  	ndv2 := calcNdvUsingDistinctValNum(distinctValNum, blockNumTotal, tableCnt)
   111  	if ndv1 <= 0 {
   112  		return ndv2
   113  	}
   114  	return math.Min(ndv1, ndv2)
   115  }
   116  
   117  // treat distinct val in zonemap like a sample , then estimate the ndv
   118  // more blocks, more accurate
   119  func calcNdvUsingDistinctValNum(distinctValNum, blockNumTotal, tableCnt float64) float64 {
   120  	// coefficient is 0.1 when 1 block, and 1 when many blocks.
   121  	coefficient := math.Pow(0.1, (1 / math.Log10(blockNumTotal*10)))
   122  	// very little distinctValNum, assume ndv is very low
   123  	if distinctValNum <= 1 {
   124  		return 1 // only one value
   125  	} else if distinctValNum == 2 {
   126  		return 2 / coefficient //if only 1 block, ndv is 20. if many block
   127  	} else if distinctValNum <= 10 && distinctValNum/blockNumTotal < 0.2 {
   128  		return distinctValNum / coefficient
   129  	}
   130  	// assume ndv is high
   131  	// ndvRate is from 0 to 1. 1 means unique key, and 0 means ndv is only 1
   132  	ndvRate := (distinctValNum / blockNumTotal) / 2
   133  	ndv := tableCnt * ndvRate * coefficient
   134  	if ndv < 1 {
   135  		ndv = 1
   136  	}
   137  	return ndv
   138  }
   139  
   140  func calcNdvUsingMinMax(minVal, maxVal any, t types.Type) float64 {
   141  	switch t.Oid {
   142  	case types.T_bool:
   143  		return 2
   144  	case types.T_int8:
   145  		return float64(maxVal.(int8)-minVal.(int8)) + 1
   146  	case types.T_int16:
   147  		return float64(maxVal.(int16)-minVal.(int16)) + 1
   148  	case types.T_int32:
   149  		return float64(maxVal.(int32)-minVal.(int32)) + 1
   150  	case types.T_int64:
   151  		return float64(maxVal.(int64)-minVal.(int64)) + 1
   152  	case types.T_uint8:
   153  		return float64(maxVal.(uint8)-minVal.(uint8)) + 1
   154  	case types.T_uint16:
   155  		return float64(maxVal.(uint16)-minVal.(uint16)) + 1
   156  	case types.T_uint32:
   157  		return float64(maxVal.(uint32)-minVal.(uint32)) + 1
   158  	case types.T_uint64:
   159  		return float64(maxVal.(uint64)-minVal.(uint64)) + 1
   160  	case types.T_decimal64:
   161  		return maxVal.(types.Decimal64).Sub(minVal.(types.Decimal64)).ToFloat64() + 1
   162  	case types.T_decimal128:
   163  		return maxVal.(types.Decimal128).Sub(minVal.(types.Decimal128)).ToFloat64() + 1
   164  	case types.T_float32:
   165  		return float64(maxVal.(float32)-minVal.(float32)) + 1
   166  	case types.T_float64:
   167  		return maxVal.(float64) - minVal.(float64) + 1
   168  	case types.T_timestamp:
   169  		return float64(maxVal.(types.Timestamp)-minVal.(types.Timestamp)) + 1
   170  	case types.T_date:
   171  		return float64(maxVal.(types.Date)-minVal.(types.Date)) + 1
   172  	case types.T_time:
   173  		return float64(maxVal.(types.Time)-minVal.(types.Time)) + 1
   174  	case types.T_datetime:
   175  		return float64(maxVal.(types.Datetime)-minVal.(types.Datetime)) + 1
   176  	case types.T_uuid, types.T_char, types.T_varchar, types.T_blob, types.T_json, types.T_text:
   177  		return -1
   178  	default:
   179  		return -1
   180  	}
   181  }
   182  
   183  func getColumnsNDVFromZoneMap(ctx context.Context, columns []int, blocks *[][]BlockMeta, blockNumTotal int, tableCnt float64, tableDef *plan.TableDef) (map[string]float64, error) {
   184  	lenCols := len(columns)
   185  	dataTypes := make([]types.Type, lenCols)
   186  	maxVal := make([]any, lenCols)         //maxvalue of all blocks for column
   187  	minVal := make([]any, lenCols)         //minvalue of all blocks for column
   188  	valMap := make([]map[any]int, lenCols) // all distinct value in blocks zonemap
   189  	for i := range columns {
   190  		valMap[i] = make(map[any]int, blockNumTotal)
   191  	}
   192  
   193  	//first, get info needed from zonemap
   194  	var init bool
   195  	for i := range *blocks {
   196  		for j := range (*blocks)[i] {
   197  			zonemapVal, blkTypes, err := getZonemapDataFromMeta(ctx, columns, (*blocks)[i][j], tableDef)
   198  			if err != nil {
   199  				return nil, err
   200  			}
   201  			if !init {
   202  				init = true
   203  				for i := range zonemapVal {
   204  					minVal[i] = zonemapVal[i][0]
   205  					maxVal[i] = zonemapVal[i][1]
   206  					dataTypes[i] = types.T(blkTypes[i]).ToType()
   207  				}
   208  			}
   209  
   210  			for colIdx := range zonemapVal {
   211  				currentBlockMin := zonemapVal[colIdx][0]
   212  				currentBlockMax := zonemapVal[colIdx][1]
   213  				if s, ok := currentBlockMin.([]uint8); ok {
   214  					valMap[colIdx][string(s)] = 1
   215  				} else {
   216  					valMap[colIdx][currentBlockMin] = 1
   217  				}
   218  				if s, ok := currentBlockMax.([]uint8); ok {
   219  					valMap[colIdx][string(s)] = 1
   220  				} else {
   221  					valMap[colIdx][currentBlockMax] = 1
   222  				}
   223  				if compute.CompareGeneric(currentBlockMin, minVal[colIdx], dataTypes[colIdx]) < 0 {
   224  					minVal[i] = zonemapVal[i][0]
   225  				}
   226  				if compute.CompareGeneric(currentBlockMax, maxVal[colIdx], dataTypes[colIdx]) > 0 {
   227  					maxVal[i] = zonemapVal[i][1]
   228  				}
   229  			}
   230  		}
   231  	}
   232  
   233  	//calc ndv with min,max,distinct value in zonemap, blocknumer and column type
   234  	ndvMap := make(map[string]float64, lenCols) //return ndvs
   235  	for i := range columns {
   236  		colName := tableDef.Cols[columns[i]].Name
   237  		ndvMap[colName] = calcNdv(minVal[i], maxVal[i], float64(len(valMap[i])), float64(blockNumTotal), tableCnt, dataTypes[i])
   238  	}
   239  	return ndvMap, nil
   240  }
   241  
   242  // calculate the stats for scan node.
   243  // we need to get the zonemap from cn, and eval the filters with zonemap
   244  func CalcStats(ctx context.Context, blocks *[][]BlockMeta, expr *plan.Expr, tableDef *plan.TableDef, proc *process.Process, sortKeyName string) (*plan.Stats, error) {
   245  	var blockNumNeed, blockNumTotal int
   246  	var tableCnt, cost int64
   247  	exprMono := plan2.CheckExprIsMonotonic(ctx, expr)
   248  	columnMap, columns, maxCol := plan2.GetColumnsByExpr(expr, tableDef)
   249  	for i := range *blocks {
   250  		for j := range (*blocks)[i] {
   251  			blockNumTotal++
   252  			tableCnt += (*blocks)[i][j].Rows
   253  			if !exprMono || needRead(ctx, expr, (*blocks)[i][j], tableDef, columnMap, columns, maxCol, proc) {
   254  				cost += (*blocks)[i][j].Rows
   255  				blockNumNeed++
   256  			}
   257  		}
   258  	}
   259  	stats := new(plan.Stats)
   260  	stats.BlockNum = int32(blockNumNeed)
   261  	stats.TableCnt = float64(tableCnt)
   262  	stats.Cost = float64(cost)
   263  	if expr != nil {
   264  		ndvMap, err := getColumnsNDVFromZoneMap(ctx, columns, blocks, blockNumTotal, stats.TableCnt, tableDef)
   265  		if err != nil {
   266  			return plan2.DefaultStats(), nil
   267  		}
   268  		stats.Outcnt = estimateOutCnt(expr, sortKeyName, stats.TableCnt, stats.Cost, ndvMap)
   269  	} else {
   270  		stats.Outcnt = stats.TableCnt
   271  	}
   272  	stats.Selectivity = stats.Outcnt / stats.TableCnt
   273  	return stats, nil
   274  }