github.com/matrixorigin/matrixone@v0.7.0/pkg/sql/colexec/hashbuild/build.go (about)

     1  // Copyright 2021 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package hashbuild
    16  
import (
	"bytes"
	"errors"
	"time"

	"github.com/matrixorigin/matrixone/pkg/common/hashmap"
	"github.com/matrixorigin/matrixone/pkg/container/batch"
	"github.com/matrixorigin/matrixone/pkg/container/index"
	"github.com/matrixorigin/matrixone/pkg/container/vector"
	"github.com/matrixorigin/matrixone/pkg/sql/colexec"
	"github.com/matrixorigin/matrixone/pkg/sql/plan"
	"github.com/matrixorigin/matrixone/pkg/vm/process"
)
    29  
    30  func String(_ any, buf *bytes.Buffer) {
    31  	buf.WriteString(" hash build ")
    32  }
    33  
    34  func Prepare(proc *process.Process, arg any) error {
    35  	var err error
    36  
    37  	ap := arg.(*Argument)
    38  	ap.ctr = new(container)
    39  	if ap.NeedHashMap {
    40  		if ap.ctr.mp, err = hashmap.NewStrMap(false, ap.Ibucket, ap.Nbucket, proc.Mp()); err != nil {
    41  			return err
    42  		}
    43  		ap.ctr.vecs = make([]*vector.Vector, len(ap.Conditions))
    44  		ap.ctr.evecs = make([]evalVector, len(ap.Conditions))
    45  	}
    46  	ap.ctr.bat = batch.NewWithSize(len(ap.Typs))
    47  	ap.ctr.bat.Zs = proc.Mp().GetSels()
    48  	for i, typ := range ap.Typs {
    49  		ap.ctr.bat.Vecs[i] = vector.New(typ)
    50  	}
    51  
    52  	return nil
    53  }
    54  
    55  func Call(idx int, proc *process.Process, arg any, isFirst bool, _ bool) (bool, error) {
    56  	anal := proc.GetAnalyze(idx)
    57  	anal.Start()
    58  	defer anal.Stop()
    59  	ap := arg.(*Argument)
    60  	ctr := ap.ctr
    61  	for {
    62  		switch ctr.state {
    63  		case Build:
    64  			if err := ctr.build(ap, proc, anal, isFirst); err != nil {
    65  				ap.Free(proc, true)
    66  				return false, err
    67  			}
    68  			if ap.ctr.mp != nil {
    69  				anal.Alloc(ap.ctr.mp.Size())
    70  			}
    71  			ctr.state = End
    72  		default:
    73  			if ctr.bat != nil {
    74  				if ap.NeedHashMap {
    75  					ctr.bat.Ht = hashmap.NewJoinMap(ctr.sels, nil, ctr.mp, ctr.hasNull, ctr.idx)
    76  				}
    77  				proc.SetInputBatch(ctr.bat)
    78  				ctr.mp = nil
    79  				ctr.bat = nil
    80  				ctr.sels = nil
    81  			} else {
    82  				proc.SetInputBatch(nil)
    83  			}
    84  			ap.Free(proc, false)
    85  			return true, nil
    86  		}
    87  	}
    88  }
    89  
    90  func (ctr *container) build(ap *Argument, proc *process.Process, anal process.Analyze, isFirst bool) error {
    91  	var err error
    92  
    93  	for {
    94  		start := time.Now()
    95  		bat := <-proc.Reg.MergeReceivers[0].Ch
    96  		anal.WaitStop(start)
    97  
    98  		if bat == nil {
    99  			break
   100  		}
   101  		if bat.Length() == 0 {
   102  			continue
   103  		}
   104  		anal.Input(bat, isFirst)
   105  		anal.Alloc(int64(bat.Size()))
   106  		if ctr.bat, err = ctr.bat.Append(proc.Ctx, proc.Mp(), bat); err != nil {
   107  			return err
   108  		}
   109  		bat.Clean(proc.Mp())
   110  	}
   111  	if ctr.bat == nil || ctr.bat.Length() == 0 || !ap.NeedHashMap {
   112  		return nil
   113  	}
   114  	ctr.cleanEvalVectors(proc.Mp())
   115  	if err = ctr.evalJoinCondition(ctr.bat, ap.Conditions, proc, anal); err != nil {
   116  		return err
   117  	}
   118  
   119  	if ctr.idx != nil {
   120  		return ctr.indexBuild()
   121  	}
   122  
   123  	itr := ctr.mp.NewIterator()
   124  	count := ctr.bat.Length()
   125  	for i := 0; i < count; i += hashmap.UnitLimit {
   126  		n := count - i
   127  		if n > hashmap.UnitLimit {
   128  			n = hashmap.UnitLimit
   129  		}
   130  		rows := ctr.mp.GroupCount()
   131  		vals, zvals, err := itr.Insert(i, n, ctr.vecs)
   132  		if err != nil {
   133  			return err
   134  		}
   135  		for k, v := range vals[:n] {
   136  			if zvals[k] == 0 {
   137  				ctr.hasNull = true
   138  				continue
   139  			}
   140  			if v == 0 {
   141  				continue
   142  			}
   143  			if v > rows {
   144  				ctr.sels = append(ctr.sels, make([]int32, 0))
   145  			}
   146  			ai := int64(v) - 1
   147  			ctr.sels[ai] = append(ctr.sels[ai], int32(i+k))
   148  		}
   149  	}
   150  	return nil
   151  }
   152  
   153  func (ctr *container) indexBuild() error {
   154  	// e.g. original data = ["a", "b", "a", "c", "b", "c", "a", "a"]
   155  	//      => dictionary = ["a"->1, "b"->2, "c"->3]
   156  	//      => poses = [1, 2, 1, 3, 2, 3, 1, 1]
   157  	// sels = [[0, 2, 6, 7], [1, 4], [3, 5]]
   158  	ctr.sels = make([][]int32, index.MaxLowCardinality)
   159  	poses := vector.MustTCols[uint16](ctr.idx.GetPoses())
   160  	for k, v := range poses {
   161  		if v == 0 {
   162  			continue
   163  		}
   164  		bucket := int(v) - 1
   165  		if len(ctr.sels[bucket]) == 0 {
   166  			ctr.sels[bucket] = make([]int32, 0, 64)
   167  		}
   168  		ctr.sels[bucket] = append(ctr.sels[bucket], int32(k))
   169  	}
   170  	return nil
   171  }
   172  
   173  func (ctr *container) evalJoinCondition(bat *batch.Batch, conds []*plan.Expr, proc *process.Process, analyze process.Analyze) error {
   174  	for i, cond := range conds {
   175  		vec, err := colexec.EvalExpr(bat, proc, cond)
   176  		if err != nil || vec.ConstExpand(false, proc.Mp()) == nil {
   177  			ctr.cleanEvalVectors(proc.Mp())
   178  			return err
   179  		}
   180  		ctr.vecs[i] = vec
   181  		ctr.evecs[i].vec = vec
   182  		ctr.evecs[i].needFree = true
   183  		for j := range bat.Vecs {
   184  			if bat.Vecs[j] == vec {
   185  				ctr.evecs[i].needFree = false
   186  				break
   187  			}
   188  		}
   189  		if ctr.evecs[i].needFree && vec != nil {
   190  			analyze.Alloc(int64(vec.Size()))
   191  		}
   192  
   193  		// 1. multiple equivalent conditions are not considered currently
   194  		// 2. do not want the condition to be an expression
   195  		if len(conds) == 1 && !ctr.evecs[i].needFree {
   196  			if idx, ok := ctr.vecs[i].Index().(*index.LowCardinalityIndex); ok {
   197  				ctr.idx = idx.Dup()
   198  			}
   199  		}
   200  	}
   201  	return nil
   202  }