github.com/matrixorigin/matrixone@v0.7.0/pkg/vm/engine/disttae/stats.go (about) 1 // Copyright 2022 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package disttae 16 17 import ( 18 "context" 19 "github.com/matrixorigin/matrixone/pkg/container/types" 20 "github.com/matrixorigin/matrixone/pkg/pb/plan" 21 plan2 "github.com/matrixorigin/matrixone/pkg/sql/plan" 22 "github.com/matrixorigin/matrixone/pkg/sql/util" 23 "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/compute" 24 "github.com/matrixorigin/matrixone/pkg/vm/process" 25 "math" 26 ) 27 28 func estimateOutCntBySortOrder(tableCnt, cost float64, sortOrder int) float64 { 29 if sortOrder == -1 { 30 return cost 31 } 32 // coefficient is 0.5 when tableCnt equals cost, and 1 when tableCnt >> cost 33 coefficient1 := math.Pow(0.5, cost/tableCnt) 34 // coefficient is 0.25 when tableCnt is small, and 1 when very large table. 35 coefficient2 := math.Pow(0.2, (1 / math.Log10(tableCnt))) 36 37 outCnt := cost * coefficient1 * coefficient2 38 if sortOrder == 0 { 39 return outCnt * 0.95 40 } else if sortOrder == 1 { 41 return outCnt * 0.75 42 } else if sortOrder == 2 { 43 return outCnt * 0.55 44 } else { 45 return outCnt * 0.35 46 } 47 48 } 49 50 func estimateOutCntForEquality(expr *plan.Expr, sortKeyName string, tableCnt, cost float64, ndvMap map[string]float64) float64 { 51 // only filter like func(col)>1 , or (col=1) or (col=2) can estimate outcnt 52 // and only 1 colRef is allowd in the filter. otherwise, no good method to calculate 53 ret, col := plan2.CheckFilter(expr) 54 if !ret { 55 return cost / 5 56 } 57 sortOrder := util.GetClusterByColumnOrder(sortKeyName, col.Name) 58 //if col is clusterby, we assume most of the rows in blocks we read is needed 59 //otherwise, deduce selectivity according to ndv 60 if sortOrder != -1 { 61 return estimateOutCntBySortOrder(tableCnt, cost, sortOrder) 62 } else { 63 if ndv, ok := ndvMap[col.Name]; ok { 64 return tableCnt / ndv 65 } else { 66 return tableCnt / 100 67 } 68 } 69 } 70 71 // estimate output lines for a filter 72 func estimateOutCnt(expr *plan.Expr, sortKeyName string, tableCnt, cost float64, ndvMap map[string]float64) float64 { 73 if expr == nil { 74 return cost 75 } 76 var outcnt float64 77 switch exprImpl := expr.Expr.(type) { 78 case *plan.Expr_F: 79 funcName := exprImpl.F.Func.ObjName 80 switch funcName { 81 case "=": 82 outcnt = estimateOutCntForEquality(expr, sortKeyName, tableCnt, cost, ndvMap) 83 case ">", "<", ">=", "<=": 84 //for filters like a>1, no good way to estimate, return 3 * equality 85 outcnt = estimateOutCntForEquality(expr, sortKeyName, tableCnt, cost, ndvMap) * 3 86 case "and": 87 //get the smaller one of two children, and tune it down a little bit 88 out1 := estimateOutCnt(exprImpl.F.Args[0], sortKeyName, tableCnt, cost, ndvMap) 89 out2 := estimateOutCnt(exprImpl.F.Args[1], sortKeyName, tableCnt, cost, ndvMap) 90 outcnt = math.Min(out1, out2) * 0.8 91 case "or": 92 //get the bigger one of two children, and tune it up a little bit 93 out1 := estimateOutCnt(exprImpl.F.Args[0], sortKeyName, tableCnt, cost, ndvMap) 94 out2 := estimateOutCnt(exprImpl.F.Args[1], sortKeyName, tableCnt, cost, ndvMap) 95 outcnt = math.Max(out1, out2) * 1.5 96 default: 97 //no good way to estimate, just 0.1*cost 98 outcnt = cost * 0.1 99 } 100 } 101 if outcnt > cost { 102 //outcnt must be smaller than cost 103 return cost 104 } 105 return outcnt 106 } 107 108 func calcNdv(minVal, maxVal any, distinctValNum, blockNumTotal, tableCnt float64, t types.Type) float64 { 109 ndv1 := calcNdvUsingMinMax(minVal, maxVal, t) 110 ndv2 := calcNdvUsingDistinctValNum(distinctValNum, blockNumTotal, tableCnt) 111 if ndv1 <= 0 { 112 return ndv2 113 } 114 return math.Min(ndv1, ndv2) 115 } 116 117 // treat distinct val in zonemap like a sample , then estimate the ndv 118 // more blocks, more accurate 119 func calcNdvUsingDistinctValNum(distinctValNum, blockNumTotal, tableCnt float64) float64 { 120 // coefficient is 0.1 when 1 block, and 1 when many blocks. 121 coefficient := math.Pow(0.1, (1 / math.Log10(blockNumTotal*10))) 122 // very little distinctValNum, assume ndv is very low 123 if distinctValNum <= 1 { 124 return 1 // only one value 125 } else if distinctValNum == 2 { 126 return 2 / coefficient //if only 1 block, ndv is 20. if many block 127 } else if distinctValNum <= 10 && distinctValNum/blockNumTotal < 0.2 { 128 return distinctValNum / coefficient 129 } 130 // assume ndv is high 131 // ndvRate is from 0 to 1. 1 means unique key, and 0 means ndv is only 1 132 ndvRate := (distinctValNum / blockNumTotal) / 2 133 ndv := tableCnt * ndvRate * coefficient 134 if ndv < 1 { 135 ndv = 1 136 } 137 return ndv 138 } 139 140 func calcNdvUsingMinMax(minVal, maxVal any, t types.Type) float64 { 141 switch t.Oid { 142 case types.T_bool: 143 return 2 144 case types.T_int8: 145 return float64(maxVal.(int8)-minVal.(int8)) + 1 146 case types.T_int16: 147 return float64(maxVal.(int16)-minVal.(int16)) + 1 148 case types.T_int32: 149 return float64(maxVal.(int32)-minVal.(int32)) + 1 150 case types.T_int64: 151 return float64(maxVal.(int64)-minVal.(int64)) + 1 152 case types.T_uint8: 153 return float64(maxVal.(uint8)-minVal.(uint8)) + 1 154 case types.T_uint16: 155 return float64(maxVal.(uint16)-minVal.(uint16)) + 1 156 case types.T_uint32: 157 return float64(maxVal.(uint32)-minVal.(uint32)) + 1 158 case types.T_uint64: 159 return float64(maxVal.(uint64)-minVal.(uint64)) + 1 160 case types.T_decimal64: 161 return maxVal.(types.Decimal64).Sub(minVal.(types.Decimal64)).ToFloat64() + 1 162 case types.T_decimal128: 163 return maxVal.(types.Decimal128).Sub(minVal.(types.Decimal128)).ToFloat64() + 1 164 case types.T_float32: 165 return float64(maxVal.(float32)-minVal.(float32)) + 1 166 case types.T_float64: 167 return maxVal.(float64) - minVal.(float64) + 1 168 case types.T_timestamp: 169 return float64(maxVal.(types.Timestamp)-minVal.(types.Timestamp)) + 1 170 case types.T_date: 171 return float64(maxVal.(types.Date)-minVal.(types.Date)) + 1 172 case types.T_time: 173 return float64(maxVal.(types.Time)-minVal.(types.Time)) + 1 174 case types.T_datetime: 175 return float64(maxVal.(types.Datetime)-minVal.(types.Datetime)) + 1 176 case types.T_uuid, types.T_char, types.T_varchar, types.T_blob, types.T_json, types.T_text: 177 return -1 178 default: 179 return -1 180 } 181 } 182 183 func getColumnsNDVFromZoneMap(ctx context.Context, columns []int, blocks *[][]BlockMeta, blockNumTotal int, tableCnt float64, tableDef *plan.TableDef) (map[string]float64, error) { 184 lenCols := len(columns) 185 dataTypes := make([]types.Type, lenCols) 186 maxVal := make([]any, lenCols) //maxvalue of all blocks for column 187 minVal := make([]any, lenCols) //minvalue of all blocks for column 188 valMap := make([]map[any]int, lenCols) // all distinct value in blocks zonemap 189 for i := range columns { 190 valMap[i] = make(map[any]int, blockNumTotal) 191 } 192 193 //first, get info needed from zonemap 194 var init bool 195 for i := range *blocks { 196 for j := range (*blocks)[i] { 197 zonemapVal, blkTypes, err := getZonemapDataFromMeta(ctx, columns, (*blocks)[i][j], tableDef) 198 if err != nil { 199 return nil, err 200 } 201 if !init { 202 init = true 203 for i := range zonemapVal { 204 minVal[i] = zonemapVal[i][0] 205 maxVal[i] = zonemapVal[i][1] 206 dataTypes[i] = types.T(blkTypes[i]).ToType() 207 } 208 } 209 210 for colIdx := range zonemapVal { 211 currentBlockMin := zonemapVal[colIdx][0] 212 currentBlockMax := zonemapVal[colIdx][1] 213 if s, ok := currentBlockMin.([]uint8); ok { 214 valMap[colIdx][string(s)] = 1 215 } else { 216 valMap[colIdx][currentBlockMin] = 1 217 } 218 if s, ok := currentBlockMax.([]uint8); ok { 219 valMap[colIdx][string(s)] = 1 220 } else { 221 valMap[colIdx][currentBlockMax] = 1 222 } 223 if compute.CompareGeneric(currentBlockMin, minVal[colIdx], dataTypes[colIdx]) < 0 { 224 minVal[i] = zonemapVal[i][0] 225 } 226 if compute.CompareGeneric(currentBlockMax, maxVal[colIdx], dataTypes[colIdx]) > 0 { 227 maxVal[i] = zonemapVal[i][1] 228 } 229 } 230 } 231 } 232 233 //calc ndv with min,max,distinct value in zonemap, blocknumer and column type 234 ndvMap := make(map[string]float64, lenCols) //return ndvs 235 for i := range columns { 236 colName := tableDef.Cols[columns[i]].Name 237 ndvMap[colName] = calcNdv(minVal[i], maxVal[i], float64(len(valMap[i])), float64(blockNumTotal), tableCnt, dataTypes[i]) 238 } 239 return ndvMap, nil 240 } 241 242 // calculate the stats for scan node. 243 // we need to get the zonemap from cn, and eval the filters with zonemap 244 func CalcStats(ctx context.Context, blocks *[][]BlockMeta, expr *plan.Expr, tableDef *plan.TableDef, proc *process.Process, sortKeyName string) (*plan.Stats, error) { 245 var blockNumNeed, blockNumTotal int 246 var tableCnt, cost int64 247 exprMono := plan2.CheckExprIsMonotonic(ctx, expr) 248 columnMap, columns, maxCol := plan2.GetColumnsByExpr(expr, tableDef) 249 for i := range *blocks { 250 for j := range (*blocks)[i] { 251 blockNumTotal++ 252 tableCnt += (*blocks)[i][j].Rows 253 if !exprMono || needRead(ctx, expr, (*blocks)[i][j], tableDef, columnMap, columns, maxCol, proc) { 254 cost += (*blocks)[i][j].Rows 255 blockNumNeed++ 256 } 257 } 258 } 259 stats := new(plan.Stats) 260 stats.BlockNum = int32(blockNumNeed) 261 stats.TableCnt = float64(tableCnt) 262 stats.Cost = float64(cost) 263 if expr != nil { 264 ndvMap, err := getColumnsNDVFromZoneMap(ctx, columns, blocks, blockNumTotal, stats.TableCnt, tableDef) 265 if err != nil { 266 return plan2.DefaultStats(), nil 267 } 268 stats.Outcnt = estimateOutCnt(expr, sortKeyName, stats.TableCnt, stats.Cost, ndvMap) 269 } else { 270 stats.Outcnt = stats.TableCnt 271 } 272 stats.Selectivity = stats.Outcnt / stats.TableCnt 273 return stats, nil 274 }