github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/colexec/aggexec/distinct.go

// Copyright 2024 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package aggexec

import (
	"github.com/matrixorigin/matrixone/pkg/common/hashmap"
	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/matrixorigin/matrixone/pkg/common/mpool"
	"github.com/matrixorigin/matrixone/pkg/container/vector"
)

// distinctHash tracks the distinct values seen by each group of a distinct agg,
// one string hash map per group.
type distinctHash struct {
	mp          *mpool.MPool
	maps        []*hashmap.StrHashMap
	hashHasNull bool

	// reusable scratch buffers for bulk and batch insertions.
	bs  []bool
	bs1 []bool
}

func newDistinctHash(mp *mpool.MPool, containNullValue bool) distinctHash {
	return distinctHash{
		mp:          mp,
		maps:        nil,
		hashHasNull: containNullValue,
	}
}

// grows allocates a new string hash map for each of the `more` new groups.
func (d *distinctHash) grows(more int) error {
	oldLen, newLen := len(d.maps), len(d.maps)+more
	d.maps = append(d.maps, make([]*hashmap.StrHashMap, more)...)

	var err error
	for i := oldLen; i < newLen; i++ {
		if d.maps[i], err = hashmap.NewStrMap(
			true, 0, 0, d.mp); err != nil {
			return err
		}
	}
	return nil
}

// fill inserts row `row` of vs into the hash map of the given group.
// It returns true if the row is a new distinct value for that group.
func (d *distinctHash) fill(group int, vs []*vector.Vector, row int) (bool, error) {
	return d.maps[group].Insert(vs, row)
}
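
// The sketch below is illustrative and not part of the original source: it shows
// how a caller might drive the row-by-row path, assuming a single group and
// input vectors that are already built. The helper name fillRowsSketch is hypothetical.
func fillRowsSketch(mp *mpool.MPool, vs []*vector.Vector) (distinctCount int, err error) {
	d := newDistinctHash(mp, true)
	if err = d.grows(1); err != nil { // allocate the hash map for group 0
		return 0, err
	}
	defer d.free()

	for row := 0; row < vs[0].Length(); row++ {
		isNew, e := d.fill(0, vs, row)
		if e != nil {
			return 0, e
		}
		if isNew {
			distinctCount++
		}
	}
	return distinctCount, nil
}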

// bulkFill inserts every row of vs into the hash map of the given group,
// working in chunks of hashmap.UnitLimit rows, and returns a slice that marks
// which rows produced a new distinct value.
func (d *distinctHash) bulkFill(group int, vs []*vector.Vector) ([]bool, error) {
	rowCount := vs[0].Length()

	if cap(d.bs) < rowCount {
		d.bs = make([]bool, rowCount)
	}
	if cap(d.bs1) < hashmap.UnitLimit {
		d.bs1 = make([]bool, hashmap.UnitLimit)
	}
	d.bs = d.bs[:rowCount]
	d.bs1 = d.bs1[:hashmap.UnitLimit]

	iterator := d.maps[group].NewIterator()

	for i := 0; i < rowCount; i += hashmap.UnitLimit {
		n := rowCount - i
		if n > hashmap.UnitLimit {
			n = hashmap.UnitLimit
		}
		for j := 0; j < n; j++ {
			d.bs1[j] = false
		}

		oldLen := d.maps[group].GroupCount()
		indexOffset := oldLen + 1

		values, _, err := iterator.Insert(i, n, vs)
		if err != nil {
			return nil, err
		}

		// a row is new only if its hash-map id is beyond the old group count and
		// no earlier row in this chunk already claimed that id.
		dd := d.bs[i:]
		for k, v := range values {
			if v > oldLen && !d.bs1[v-indexOffset] {
				d.bs1[v-indexOffset] = true
				dd[k] = true
			} else {
				dd[k] = false
			}
		}
	}
	return d.bs, nil
}
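
// Another illustrative sketch (not part of the original source): bulkFill reports,
// per row, whether the row introduced a new distinct value, so counting the distinct
// values of one group reduces to counting the true flags. The helper name
// countBulkSketch is hypothetical.
func countBulkSketch(d *distinctHash, group int, vs []*vector.Vector) (int, error) {
	news, err := d.bulkFill(group, vs)
	if err != nil {
		return 0, err
	}
	count := 0
	for _, isNew := range news {
		if isNew {
			count++
		}
	}
	return count, nil
}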

// batchFill inserts one row per group: row offset+i goes into groups[i].
// It returns a slice marking which rows are new distinct values; rows whose
// group is GroupNotMatched are always marked false.
func (d *distinctHash) batchFill(vs []*vector.Vector, offset int, groups []uint64) ([]bool, error) {
	rowCount := len(groups)

	if cap(d.bs) < rowCount {
		d.bs = make([]bool, rowCount)
	}
	d.bs = d.bs[:0]

	for _, group := range groups {
		if group != GroupNotMatched {
			ok, err := d.fill(int(group-1), vs, offset)
			if err != nil {
				return nil, err
			}
			d.bs = append(d.bs, ok)
		} else {
			d.bs = append(d.bs, false)
		}
		offset++
	}

	return d.bs, nil
}
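
// Illustrative sketch (not part of the original source): batchFill takes one
// target group per row starting at offset, and rows whose group is GroupNotMatched
// are reported as false. The groups slice and the name batchFillSketch are
// hypothetical; it assumes vs has at least len(groups) rows and that two groups
// were already allocated via grows.
func batchFillSketch(d *distinctHash, vs []*vector.Vector) ([]bool, error) {
	// rows 0 and 1 go to group 1, row 2 belongs to no group, row 3 goes to group 2
	// (group ids here are 1-based, matching the group-1 indexing inside batchFill).
	groups := []uint64{1, 1, GroupNotMatched, 2}
	return d.batchFill(vs, 0, groups)
}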

// merge would be the method to merge two groups of a distinct agg,
// but a distinct agg should run on a single node without any parallelism,
// because making a parallel merge correct would require keeping all the source data.
// A simple example:
//
//	select count(distinct a) from t;
//	where `a` is a column with values 1, 2, 3, 3, 5.
//	If we run in parallel and the data is split into two parts, [1, 2, 3] and [3, 5],
//	a naive merge returns 5 (3 + 2), while the correct result is 4 (3 + 1).
//	To get the right answer we would have to re-fill every row of [3, 5] instead of adding 3 + 2.
//
// Storing all the source data for that re-fill is very expensive.
//
// This check makes sure the distinct agg is never used in parallel.
func (d *distinctHash) merge(next *distinctHash) error {
	if len(d.maps) > 0 || len(next.maps) > 0 {
		return moerr.NewInternalErrorNoCtx("distinct agg should be run in only one node and without any parallel")
	}
	return nil
}
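
// Illustrative sketch (not part of the original source): once a distinctHash owns
// any hash map, merging with another instance is rejected, which is exactly the
// parallel case described above. The helper name mergeGuardSketch is hypothetical.
func mergeGuardSketch(mp *mpool.MPool) error {
	a := newDistinctHash(mp, true)
	b := newDistinctHash(mp, true)
	if err := a.grows(1); err != nil {
		return err
	}
	defer a.free()
	// a already holds one map, so this returns the internal error above.
	return a.merge(&b)
}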

// free releases every hash map owned by the distinctHash.
func (d *distinctHash) free() {
	for _, m := range d.maps {
		if m != nil {
			m.Free()
		}
	}
}