github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/sqle/statspro/update.go

// Copyright 2023 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package statspro

import (
	"container/heap"
	"context"
	"errors"
	"fmt"
	"io"
	"strings"
	"time"

	"github.com/dolthub/go-mysql-server/sql"
	"github.com/dolthub/go-mysql-server/sql/stats"

	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb"
	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable"
	"github.com/dolthub/dolt/go/store/hash"
	"github.com/dolthub/dolt/go/store/prolly"
	"github.com/dolthub/dolt/go/store/prolly/tree"
	"github.com/dolthub/dolt/go/store/val"
)

const (
	bucketLowCnt = 20
	mcvCnt       = 3
)

// createNewStatsBuckets builds histograms for a list of index statistic metadata.
// We only read chunk ranges indicated by |indexMeta.updateOrdinals|. If
// the returned buckets are a subset of the index, the caller is responsible
// for reconciling the difference.
func createNewStatsBuckets(ctx *sql.Context, sqlTable sql.Table, dTab *doltdb.Table, indexes []sql.Index, idxMetas []indexMeta) (map[sql.StatQualifier]*DoltStats, error) {
	nameToIdx := make(map[string]sql.Index)
	for _, idx := range indexes {
		nameToIdx[strings.ToLower(idx.ID())] = idx
	}

	ret := make(map[sql.StatQualifier]*DoltStats)

	for _, meta := range idxMetas {
		var idx durable.Index
		var err error
		if strings.EqualFold(meta.qual.Index(), "PRIMARY") {
			idx, err = dTab.GetRowData(ctx)
		} else {
			idx, err = dTab.GetIndexRowData(ctx, meta.qual.Index())
		}
		if err != nil {
			return nil, err
		}

		prollyMap := durable.ProllyMapFromIndex(idx)
		keyBuilder := val.NewTupleBuilder(prollyMap.KeyDesc())

		sqlIdx := nameToIdx[strings.ToLower(meta.qual.Index())]
		fds, colSet, err := stats.IndexFds(meta.qual.Table(), sqlTable.Schema(), sqlIdx)
		if err != nil {
			return nil, err
		}

		var types []sql.Type
		for _, cet := range nameToIdx[strings.ToLower(meta.qual.Index())].ColumnExpressionTypes() {
			types = append(types, cet.Type)
		}

		if cnt, err := prollyMap.Count(); err != nil {
			return nil, err
		} else if cnt == 0 {
			// table is empty
			ret[meta.qual] = NewDoltStats()
			ret[meta.qual].Statistic.Created = time.Now()
			ret[meta.qual].Statistic.Cols = meta.cols
			ret[meta.qual].Statistic.Typs = types
			ret[meta.qual].Statistic.Qual = meta.qual

			ret[meta.qual].Statistic.Fds = fds
			ret[meta.qual].Statistic.Colset = colSet
			continue
		}

		firstRow, err := firstRowForIndex(ctx, prollyMap, keyBuilder, len(meta.cols))
		if err != nil {
			return nil, err
		}

		updater := newBucketBuilder(meta.qual, len(meta.cols), prollyMap.KeyDesc())
		ret[meta.qual] = NewDoltStats()
		ret[meta.qual].Chunks = meta.allAddrs
		ret[meta.qual].Statistic.Created = time.Now()
		ret[meta.qual].Statistic.Cols = meta.cols
		ret[meta.qual].Statistic.Typs = types
		ret[meta.qual].Statistic.Qual = meta.qual

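		// Each chunk in |meta.newNodes| maps to exactly one histogram bucket:
		// the loop below scans the chunk's rows by ordinal range, feeds each
		// key prefix to the bucket builder, and tags the finalized bucket with
		// the chunk's hash so MergeNewChunks can reuse it on a later refresh.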
		var start, stop uint64
		// read leaf rows for each bucket
		for i, chunk := range meta.newNodes {
			// each node is a bucket
			updater.newBucket()

			// we read exclusive range [node first key, next node first key)
			start, stop = meta.updateOrdinals[i].start, meta.updateOrdinals[i].stop
			iter, err := prollyMap.IterOrdinalRange(ctx, start, stop)
			if err != nil {
				return nil, err
			}
			for {
				// stats key will be a prefix of the index key
				keyBytes, _, err := iter.Next(ctx)
				if errors.Is(err, io.EOF) {
					break
				} else if err != nil {
					return nil, err
				}
				// build full key
				for j := range keyBuilder.Desc.Types {
					keyBuilder.PutRaw(j, keyBytes.GetField(j))
				}

				updater.add(keyBuilder.BuildPrefixNoRecycle(prollyMap.Pool(), updater.prefixLen))
				keyBuilder.Recycle()
			}

			// finalize the aggregation
			bucket, err := updater.finalize(ctx, prollyMap.NodeStore())
			if err != nil {
				return nil, err
			}
			bucket.Chunk = chunk.HashOf()
			ret[updater.qual].Hist = append(ret[updater.qual].Hist, bucket)
		}

		ret[updater.qual].Statistic.DistinctCnt = uint64(updater.globalDistinct)
		ret[updater.qual].Statistic.RowCnt = uint64(updater.globalCount)
		ret[updater.qual].Statistic.LowerBnd = firstRow
		ret[updater.qual].Statistic.Fds = fds
		ret[updater.qual].Statistic.Colset = colSet
		ret[updater.qual].UpdateActive()
	}
	return ret, nil
}

// MergeNewChunks combines buckets from |oldChunks| and |newChunks| into the
// target histogram described by |inputHashes|, preferring an old bucket when
// the same chunk hash appears in both lists. It returns an error if any hash
// in |inputHashes| is missing from both bucket lists.
func MergeNewChunks(inputHashes []hash.Hash, oldChunks, newChunks []sql.HistogramBucket) ([]sql.HistogramBucket, error) {
	hashToPos := make(map[hash.Hash]int, len(inputHashes))
	for i, h := range inputHashes {
		hashToPos[h] = i
	}

	var cnt int
	targetBuckets := make([]sql.HistogramBucket, len(inputHashes))
	for _, c := range oldChunks {
		if idx, ok := hashToPos[DoltBucketChunk(c)]; ok {
			cnt++
			targetBuckets[idx] = c
		}
	}
	for _, c := range newChunks {
		if idx, ok := hashToPos[DoltBucketChunk(c)]; ok && targetBuckets[idx] == nil {
			cnt++
			targetBuckets[idx] = c
		}
	}
	if cnt != len(inputHashes) {
		return nil, fmt.Errorf("encountered invalid statistic chunks")
	}
	return targetBuckets, nil
}

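// exampleMergeNewChunks is an illustrative sketch, not part of the original
// file; it assumes DoltBucket (with the Bucket and Chunk fields used in
// finalize above) satisfies sql.HistogramBucket. Old buckets are placed
// first, so the bucket for h1 is reused and only h2 comes from the new list.
func exampleMergeNewChunks() ([]sql.HistogramBucket, error) {
	h1, h2 := hash.Of([]byte("chunk-1")), hash.Of([]byte("chunk-2"))
	oldChunks := []sql.HistogramBucket{DoltBucket{Bucket: &stats.Bucket{RowCnt: 10}, Chunk: h1}}
	newChunks := []sql.HistogramBucket{DoltBucket{Bucket: &stats.Bucket{RowCnt: 7}, Chunk: h2}}
	// returns [oldChunks[0], newChunks[0]], ordered to match the input hashes
	return MergeNewChunks([]hash.Hash{h1, h2}, oldChunks, newChunks)
}
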
// firstRowForIndex returns the first row of the index, truncated to
// |prefixLen| fields, or nil if the index is empty.
func firstRowForIndex(ctx *sql.Context, prollyMap prolly.Map, keyBuilder *val.TupleBuilder, prefixLen int) (sql.Row, error) {
	if cnt, err := prollyMap.Count(); err != nil {
		return nil, err
	} else if cnt == 0 {
		return nil, nil
	}

	buffPool := prollyMap.NodeStore().Pool()

	// first row is ordinal 0
	firstIter, err := prollyMap.IterOrdinalRange(ctx, 0, 1)
	if err != nil {
		return nil, err
	}
	keyBytes, _, err := firstIter.Next(ctx)
	if err != nil {
		return nil, err
	}
	for i := range keyBuilder.Desc.Types {
		keyBuilder.PutRaw(i, keyBytes.GetField(i))
	}

	firstKey := keyBuilder.BuildPrefixNoRecycle(buffPool, prefixLen)
	firstRow := make(sql.Row, prefixLen)
	for i := 0; i < prefixLen; i++ {
		firstRow[i], err = tree.GetField(ctx, prollyMap.KeyDesc(), i, firstKey, prollyMap.NodeStore())
		if err != nil {
			return nil, err
		}
	}
	return firstRow, nil
}

// newBucketBuilder returns a bucketBuilder that aggregates keys truncated to
// the first |prefixLen| fields of |tupleDesc|.
func newBucketBuilder(qual sql.StatQualifier, prefixLen int, tupleDesc val.TupleDesc) *bucketBuilder {
	return &bucketBuilder{
		qual:      qual,
		prefixLen: prefixLen,
		mcvs:      new(mcvHeap),
		tupleDesc: tupleDesc.PrefixDesc(prefixLen),
	}
}

// bucketBuilder performs an aggregation on a sorted series of keys to
// collect statistics for a single histogram bucket. DistinctCount is fuzzy;
// we might double count a key that crosses bucket boundaries.
type bucketBuilder struct {
	qual      sql.StatQualifier
	tupleDesc val.TupleDesc
	prefixLen int

	count    int
	distinct int
	nulls    int
	mcvs     *mcvHeap

	currentKey val.Tuple
	currentCnt int

	globalDistinct int
	globalCount    int
	prevBound      val.Tuple
}

// newBucket zeroes aggregation statistics. Global counters are not reset for
// new buckets. Updaters should only be reused between buckets for the same
// column statistic.
func (u *bucketBuilder) newBucket() {
	u.count = 0
	u.distinct = 0
	u.nulls = 0
	u.currentKey = nil
	u.currentCnt = 0

	oldMcvs := *u.mcvs
	oldMcvs = oldMcvs[:0]
	u.mcvs = &oldMcvs
}

// finalize converts the current aggregation stats into a histogram bucket,
// which includes deserializing most common value tuples into sql.Rows.
func (u *bucketBuilder) finalize(ctx context.Context, ns tree.NodeStore) (DoltBucket, error) {
	// update MCV in case we've ended on a run of many identical keys
	u.updateMcv()
	// convert the MCV tuples into SQL rows (most efficient to only do this once)
	mcvRows, err := u.mcvs.Values(ctx, u.tupleDesc, ns, u.prefixLen)
	if err != nil {
		return DoltBucket{}, err
	}
	upperBound := make(sql.Row, u.prefixLen)
	if u.currentKey != nil {
		for i := 0; i < u.prefixLen; i++ {
			upperBound[i], err = tree.GetField(ctx, u.tupleDesc, i, u.currentKey, ns)
			if err != nil {
				return DoltBucket{}, err
			}
		}
	}
	return DoltBucket{
		Bucket: &stats.Bucket{
			RowCnt:      uint64(u.count),
			DistinctCnt: uint64(u.distinct),
			BoundCnt:    uint64(u.currentCnt),
			McvVals:     mcvRows,
			McvsCnt:     u.mcvs.Counts(),
			BoundVal:    upperBound,
			NullCnt:     uint64(u.nulls),
		},
	}, nil
}

// add inputs a new row for a histogram bucket aggregation. We assume
// the key has already been truncated to the appropriate prefix length.
func (u *bucketBuilder) add(key val.Tuple) {
	newKey := u.currentKey == nil || u.tupleDesc.Compare(u.currentKey, key) != 0
	if newKey {
		u.newKey(key)
	} else {
		u.currentCnt++
	}

	u.count++
	u.globalCount++
	for i := 0; i < u.prefixLen; i++ {
		if key.FieldIsNull(i) {
			u.nulls++
			break
		}
	}
}

// newKey updates state for a new key in the rolling stream.
func (u *bucketBuilder) newKey(key val.Tuple) {
	u.updateMcv()
	if u.prevBound != nil {
		if u.tupleDesc.Compare(u.prevBound, key) != 0 {
			u.globalDistinct++
			u.prevBound = nil
		} else {
			// not a globally unique key; it matches the previous
			// bucket's bound and was already counted there
		}
	} else {
		u.globalDistinct++
	}
	u.distinct++
	u.currentCnt = 1
	u.currentKey = key
}

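// exampleRollingDistinct is an illustrative, self-contained sketch, not part
// of the original file: it mirrors the rolling aggregation in add/newKey on a
// plain sorted int slice. Because the input is sorted, a value is distinct
// exactly when it differs from its predecessor; bucketBuilder applies the
// same idea to sorted key tuples, with newBucket resetting per-bucket
// counters while globalCount and globalDistinct persist across buckets.
func exampleRollingDistinct(sorted []int) (count, distinct int) {
	prev := 0
	for i, v := range sorted {
		count++
		// first element, or a value change, starts a new distinct run
		if i == 0 || v != prev {
			distinct++
		}
		prev = v
	}
	return count, distinct
}
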
// updateMcv updates the most common value heap when we've reached the end
// of a run of repeated keys.
func (u *bucketBuilder) updateMcv() {
	if u.count == 0 && u.nulls == 0 {
		return
	}
	key := u.currentKey
	cnt := u.currentCnt
	heap.Push(u.mcvs, mcv{key, cnt})
	if u.mcvs.Len() > mcvCnt {
		heap.Pop(u.mcvs)
	}
}

// mcv pairs a key tuple with its number of occurrences.
type mcv struct {
	val val.Tuple
	cnt int
}

// mcvHeap is a min-heap of candidate most common values, ordered by count;
// updateMcv keeps at most |mcvCnt| entries in it.
type mcvHeap []mcv

var _ heap.Interface = (*mcvHeap)(nil)

func (m mcvHeap) Counts() []uint64 {
	ret := make([]uint64, len(m))
	for i, v := range m {
		ret[i] = uint64(v.cnt)
	}
	return ret
}

// Values deserializes the heap's key tuples into sql.Rows, reading the first
// |prefixLen| fields of each key.
func (m mcvHeap) Values(ctx context.Context, keyDesc val.TupleDesc, ns tree.NodeStore, prefixLen int) ([]sql.Row, error) {
	ret := make([]sql.Row, len(m))
	for i, v := range m {
		row := make(sql.Row, prefixLen)
		var err error
		for j := 0; j < prefixLen; j++ {
			row[j], err = tree.GetField(ctx, keyDesc, j, v.val, ns)
			if err != nil {
				return nil, err
			}
		}
		ret[i] = row
	}
	return ret, nil
}

func (m mcvHeap) Len() int {
	return len(m)
}

func (m mcvHeap) Less(i, j int) bool {
	return m[i].cnt < m[j].cnt
}

func (m mcvHeap) Swap(i, j int) {
	m[i], m[j] = m[j], m[i]
}

func (m *mcvHeap) Push(x any) {
	*m = append(*m, x.(mcv))
}

func (m *mcvHeap) Pop() any {
	old := *m
	n := len(old)
	ret := old[n-1]
	*m = old[0 : n-1]
	return ret
}

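// exampleTopKCounts is an illustrative sketch, not part of the original
// file: it demonstrates the bounded min-heap pattern updateMcv relies on.
// Every candidate count is pushed, and the minimum is popped whenever the
// heap grows past |k|, so only the k highest counts survive.
func exampleTopKCounts(counts []int, k int) []uint64 {
	h := &mcvHeap{}
	for _, c := range counts {
		heap.Push(h, mcv{cnt: c}) // val is left nil; only counts matter here
		if h.Len() > k {
			heap.Pop(h) // evict the current minimum to stay at k entries
		}
	}
	// remaining entries are the k largest counts, in heap order (not sorted)
	return h.Counts()
}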