github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/colexec/aggexec/distinct.go

// Copyright 2024 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package aggexec

import (
    "github.com/matrixorigin/matrixone/pkg/common/hashmap"
    "github.com/matrixorigin/matrixone/pkg/common/moerr"
    "github.com/matrixorigin/matrixone/pkg/common/mpool"
    "github.com/matrixorigin/matrixone/pkg/container/vector"
)

type distinctHash struct {
    mp          *mpool.MPool
    maps        []*hashmap.StrHashMap
    hashHasNull bool

    // bs and bs1 are reusable buffers, kept to optimize bulk and batch insertions.
    bs  []bool
    bs1 []bool
}

func newDistinctHash(mp *mpool.MPool, containNullValue bool) distinctHash {
    return distinctHash{
        mp:          mp,
        maps:        nil,
        hashHasNull: containNullValue,
    }
}

// grows appends `more` new string hash maps, one for each new aggregation group.
func (d *distinctHash) grows(more int) error {
    oldLen, newLen := len(d.maps), len(d.maps)+more
    d.maps = append(d.maps, make([]*hashmap.StrHashMap, more)...)

    var err error
    for i := oldLen; i < newLen; i++ {
        if d.maps[i], err = hashmap.NewStrMap(
            true, 0, 0, d.mp); err != nil {
            return err
        }
    }
    return nil
}

// fill inserts the row into the hash map of the given group.
// It returns true if the row holds a new (distinct) value.
func (d *distinctHash) fill(group int, vs []*vector.Vector, row int) (bool, error) {
    return d.maps[group].Insert(vs, row)
}

// bulkFill inserts all rows of vs into the hash map of the given group.
// It returns a bool slice marking, for each row, whether the row holds a value
// that has not been seen before in that group.
func (d *distinctHash) bulkFill(group int, vs []*vector.Vector) ([]bool, error) {
    rowCount := vs[0].Length()

    if cap(d.bs) < rowCount {
        d.bs = make([]bool, rowCount)
    }
    if cap(d.bs1) < hashmap.UnitLimit {
        d.bs1 = make([]bool, hashmap.UnitLimit)
    }
    d.bs = d.bs[:rowCount]
    d.bs1 = d.bs1[:hashmap.UnitLimit]

    iterator := d.maps[group].NewIterator()

    for i := 0; i < rowCount; i += hashmap.UnitLimit {
        n := rowCount - i
        if n > hashmap.UnitLimit {
            n = hashmap.UnitLimit
        }
        for j := 0; j < n; j++ {
            d.bs1[j] = false
        }

        oldLen := d.maps[group].GroupCount()
        indexOffset := oldLen + 1

        values, _, err := iterator.Insert(i, n, vs)
        if err != nil {
            return nil, err
        }

        dd := d.bs[i:]
        for k, v := range values {
            if v > oldLen && !d.bs1[v-indexOffset] {
                d.bs1[v-indexOffset] = true
                dd[k] = true
            } else {
                dd[k] = false
            }
        }
    }
    return d.bs, nil
}

// batchFill inserts rows starting at offset, routing the i-th row to the group groups[i].
// It returns a bool slice marking, for each row, whether the row holds a value
// that has not been seen before in its group. Rows whose group is GroupNotMatched
// are skipped and marked false.
func (d *distinctHash) batchFill(vs []*vector.Vector, offset int, groups []uint64) ([]bool, error) {
    rowCount := len(groups)

    if cap(d.bs) < rowCount {
        d.bs = make([]bool, rowCount)
    }
    d.bs = d.bs[:0]

    for _, group := range groups {
        if group != GroupNotMatched {
            ok, err := d.fill(int(group-1), vs, offset)
            if err != nil {
                return nil, err
            }
            d.bs = append(d.bs, ok)
        } else {
            d.bs = append(d.bs, false)
        }
        offset++
    }

    return d.bs, nil
}

// merge is the method that would combine two groups of a distinct agg.
// However, a distinct agg should run on a single node without any parallelism,
// because a parallel distinct agg would have to keep all the source data to
// guarantee a correct result.
//
// A simple example:
//
//	select count(distinct a) from t;
//
// where `a` is a column holding 1, 2, 3, 3, 5.
// If we ran in parallel and the data were split into two parts [1, 2, 3] and [3, 5],
// a naive merge would return 5 (3 + 2), while the correct result is 4 (3 + 1).
// To stay correct we would have to re-fill the rows of [3, 5] one by one instead of adding 3 + 2.
//
// Keeping all the source data for that purpose is very expensive,
// so this check makes sure the distinct agg is never used in parallel.
func (d *distinctHash) merge(next *distinctHash) error {
    if len(d.maps) > 0 || len(next.maps) > 0 {
        return moerr.NewInternalErrorNoCtx("distinct agg should be run in only one node and without any parallel")
    }
    return nil
}

// free releases all the underlying hash maps.
func (d *distinctHash) free() {
    for _, m := range d.maps {
        if m != nil {
            m.Free()
        }
    }
}
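// countDistinctSketch is an illustrative sketch, not part of this package's API:
// assuming the caller already owns an *mpool.MPool and the input vectors, it shows
// how the distinctHash helpers above could back a single-group count(distinct ...).
// The function name and its signature are hypothetical.
func countDistinctSketch(mp *mpool.MPool, vs []*vector.Vector) (int64, error) {
    d := newDistinctHash(mp, false)
    defer d.free()

    // allocate the hash map for the single aggregation group.
    if err := d.grows(1); err != nil {
        return 0, err
    }

    // bulkFill reports, row by row, whether the row carries a value never seen before.
    news, err := d.bulkFill(0, vs)
    if err != nil {
        return 0, err
    }

    var count int64
    for _, isNew := range news {
        if isNew {
            count++
        }
    }
    return count, nil
}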