github.com/matrixorigin/matrixone@v0.7.0/pkg/sql/colexec/hashbuild/build.go (about) 1 // Copyright 2021 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package hashbuild 16 17 import ( 18 "bytes" 19 "time" 20 21 "github.com/matrixorigin/matrixone/pkg/common/hashmap" 22 "github.com/matrixorigin/matrixone/pkg/container/batch" 23 "github.com/matrixorigin/matrixone/pkg/container/index" 24 "github.com/matrixorigin/matrixone/pkg/container/vector" 25 "github.com/matrixorigin/matrixone/pkg/sql/colexec" 26 "github.com/matrixorigin/matrixone/pkg/sql/plan" 27 "github.com/matrixorigin/matrixone/pkg/vm/process" 28 ) 29 30 func String(_ any, buf *bytes.Buffer) { 31 buf.WriteString(" hash build ") 32 } 33 34 func Prepare(proc *process.Process, arg any) error { 35 var err error 36 37 ap := arg.(*Argument) 38 ap.ctr = new(container) 39 if ap.NeedHashMap { 40 if ap.ctr.mp, err = hashmap.NewStrMap(false, ap.Ibucket, ap.Nbucket, proc.Mp()); err != nil { 41 return err 42 } 43 ap.ctr.vecs = make([]*vector.Vector, len(ap.Conditions)) 44 ap.ctr.evecs = make([]evalVector, len(ap.Conditions)) 45 } 46 ap.ctr.bat = batch.NewWithSize(len(ap.Typs)) 47 ap.ctr.bat.Zs = proc.Mp().GetSels() 48 for i, typ := range ap.Typs { 49 ap.ctr.bat.Vecs[i] = vector.New(typ) 50 } 51 52 return nil 53 } 54 55 func Call(idx int, proc *process.Process, arg any, isFirst bool, _ bool) (bool, error) { 56 anal := proc.GetAnalyze(idx) 57 anal.Start() 58 defer anal.Stop() 59 ap := arg.(*Argument) 60 ctr := ap.ctr 61 for { 62 switch ctr.state { 63 case Build: 64 if err := ctr.build(ap, proc, anal, isFirst); err != nil { 65 ap.Free(proc, true) 66 return false, err 67 } 68 if ap.ctr.mp != nil { 69 anal.Alloc(ap.ctr.mp.Size()) 70 } 71 ctr.state = End 72 default: 73 if ctr.bat != nil { 74 if ap.NeedHashMap { 75 ctr.bat.Ht = hashmap.NewJoinMap(ctr.sels, nil, ctr.mp, ctr.hasNull, ctr.idx) 76 } 77 proc.SetInputBatch(ctr.bat) 78 ctr.mp = nil 79 ctr.bat = nil 80 ctr.sels = nil 81 } else { 82 proc.SetInputBatch(nil) 83 } 84 ap.Free(proc, false) 85 return true, nil 86 } 87 } 88 } 89 90 func (ctr *container) build(ap *Argument, proc *process.Process, anal process.Analyze, isFirst bool) error { 91 var err error 92 93 for { 94 start := time.Now() 95 bat := <-proc.Reg.MergeReceivers[0].Ch 96 anal.WaitStop(start) 97 98 if bat == nil { 99 break 100 } 101 if bat.Length() == 0 { 102 continue 103 } 104 anal.Input(bat, isFirst) 105 anal.Alloc(int64(bat.Size())) 106 if ctr.bat, err = ctr.bat.Append(proc.Ctx, proc.Mp(), bat); err != nil { 107 return err 108 } 109 bat.Clean(proc.Mp()) 110 } 111 if ctr.bat == nil || ctr.bat.Length() == 0 || !ap.NeedHashMap { 112 return nil 113 } 114 ctr.cleanEvalVectors(proc.Mp()) 115 if err = ctr.evalJoinCondition(ctr.bat, ap.Conditions, proc, anal); err != nil { 116 return err 117 } 118 119 if ctr.idx != nil { 120 return ctr.indexBuild() 121 } 122 123 itr := ctr.mp.NewIterator() 124 count := ctr.bat.Length() 125 for i := 0; i < count; i += hashmap.UnitLimit { 126 n := count - i 127 if n > hashmap.UnitLimit { 128 n = hashmap.UnitLimit 129 } 130 rows := ctr.mp.GroupCount() 131 vals, zvals, err := itr.Insert(i, n, ctr.vecs) 132 if err != nil { 133 return err 134 } 135 for k, v := range vals[:n] { 136 if zvals[k] == 0 { 137 ctr.hasNull = true 138 continue 139 } 140 if v == 0 { 141 continue 142 } 143 if v > rows { 144 ctr.sels = append(ctr.sels, make([]int32, 0)) 145 } 146 ai := int64(v) - 1 147 ctr.sels[ai] = append(ctr.sels[ai], int32(i+k)) 148 } 149 } 150 return nil 151 } 152 153 func (ctr *container) indexBuild() error { 154 // e.g. original data = ["a", "b", "a", "c", "b", "c", "a", "a"] 155 // => dictionary = ["a"->1, "b"->2, "c"->3] 156 // => poses = [1, 2, 1, 3, 2, 3, 1, 1] 157 // sels = [[0, 2, 6, 7], [1, 4], [3, 5]] 158 ctr.sels = make([][]int32, index.MaxLowCardinality) 159 poses := vector.MustTCols[uint16](ctr.idx.GetPoses()) 160 for k, v := range poses { 161 if v == 0 { 162 continue 163 } 164 bucket := int(v) - 1 165 if len(ctr.sels[bucket]) == 0 { 166 ctr.sels[bucket] = make([]int32, 0, 64) 167 } 168 ctr.sels[bucket] = append(ctr.sels[bucket], int32(k)) 169 } 170 return nil 171 } 172 173 func (ctr *container) evalJoinCondition(bat *batch.Batch, conds []*plan.Expr, proc *process.Process, analyze process.Analyze) error { 174 for i, cond := range conds { 175 vec, err := colexec.EvalExpr(bat, proc, cond) 176 if err != nil || vec.ConstExpand(false, proc.Mp()) == nil { 177 ctr.cleanEvalVectors(proc.Mp()) 178 return err 179 } 180 ctr.vecs[i] = vec 181 ctr.evecs[i].vec = vec 182 ctr.evecs[i].needFree = true 183 for j := range bat.Vecs { 184 if bat.Vecs[j] == vec { 185 ctr.evecs[i].needFree = false 186 break 187 } 188 } 189 if ctr.evecs[i].needFree && vec != nil { 190 analyze.Alloc(int64(vec.Size())) 191 } 192 193 // 1. multiple equivalent conditions are not considered currently 194 // 2. do not want the condition to be an expression 195 if len(conds) == 1 && !ctr.evecs[i].needFree { 196 if idx, ok := ctr.vecs[i].Index().(*index.LowCardinalityIndex); ok { 197 ctr.idx = idx.Dup() 198 } 199 } 200 } 201 return nil 202 }