vitess.io/vitess@v0.16.2/go/vt/vtgate/engine/distinct.go (about)

     1  /*
     2  Copyright 2020 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package engine
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  
    23  	"vitess.io/vitess/go/mysql/collations"
    24  	"vitess.io/vitess/go/sqltypes"
    25  	querypb "vitess.io/vitess/go/vt/proto/query"
    26  	"vitess.io/vitess/go/vt/vterrors"
    27  	"vitess.io/vitess/go/vt/vtgate/evalengine"
    28  )
    29  
// Compile-time assertion that *Distinct implements the Primitive interface.
    31  var _ Primitive = (*Distinct)(nil)
    32  
    33  type (
    34  	// Distinct Primitive is used to uniqueify results
    35  	Distinct struct {
    36  		Source    Primitive
    37  		CheckCols []CheckCol
    38  		Truncate  bool
    39  	}
    40  	CheckCol struct {
    41  		Col       int
    42  		WsCol     *int
    43  		Collation collations.ID
    44  	}
    45  	probeTable struct {
    46  		seenRows  map[evalengine.HashCode][]sqltypes.Row
    47  		checkCols []CheckCol
    48  	}
    49  )
    50  
    51  func (pt *probeTable) exists(inputRow sqltypes.Row) (bool, error) {
    52  	// the two prime numbers used here (17 and 31) are used to
    53  	// calculate hashcode from all column values in the input sqltypes.Row
    54  	code, err := pt.hashCodeForRow(inputRow)
    55  	if err != nil {
    56  		return false, err
    57  	}
    58  
    59  	existingRows, found := pt.seenRows[code]
    60  	if !found {
    61  		// nothing with this hash code found, we can be sure it's a not seen sqltypes.Row
    62  		pt.seenRows[code] = []sqltypes.Row{inputRow}
    63  		return false, nil
    64  	}
    65  
    66  	// we found something in the map - still need to check all individual values
    67  	// so we don't just fall for a hash collision
    68  	for _, existingRow := range existingRows {
    69  		exists, err := pt.equal(existingRow, inputRow)
    70  		if err != nil {
    71  			return false, err
    72  		}
    73  		if exists {
    74  			return true, nil
    75  		}
    76  	}
    77  
    78  	pt.seenRows[code] = append(existingRows, inputRow)
    79  
    80  	return false, nil
    81  }
    82  
    83  func (pt *probeTable) hashCodeForRow(inputRow sqltypes.Row) (evalengine.HashCode, error) {
    84  	// Why use 17 and 31 in this method?
    85  	// Copied from an old usenet discussion on the topic:
    86  	// https://groups.google.com/g/comp.programming/c/HSurZEyrZ1E?pli=1#d887b5bdb2dac99d
    87  	// > It's a mixture of superstition and good sense.
    88  	// > Suppose the multiplier were 26, and consider
    89  	// > hashing a hundred-character string. How much influence does
    90  	// > the string's first character have on the final value of `h',
    91  	// > just before the mod operation? The first character's value
    92  	// > will have been multiplied by MULT 99 times, so if the arithmetic
    93  	// > were done in infinite precision the value would consist of some
    94  	// > jumble of bits followed by 99 low-order zero bits -- each time
    95  	// > you multiply by MULT you introduce another low-order zero, right?
    96  	// > The computer's finite arithmetic just chops away all the excess
    97  	// > high-order bits, so the first character's actual contribution to
    98  	// > `h' is ... precisely zero! The `h' value depends only on the
    99  	// > rightmost 32 string characters (assuming a 32-bit int), and even
   100  	// > then things are not wonderful: the first of those final 32 bytes
   101  	// > influences only the leftmost bit of `h' and has no effect on
   102  	// > the remaining 31. Clearly, an even-valued MULT is a poor idea.
   103  	// >
   104  	// > Need MULT be prime? Not as far as I know (I don't know
   105  	// > everything); any odd value ought to suffice. 31 may be attractive
   106  	// > because it is close to a power of two, and it may be easier for
   107  	// > the compiler to replace a possibly slow multiply instruction with
   108  	// > a shift and subtract (31*x == (x << 5) - x) on machines where it
   109  	// > makes a difference. Setting MULT one greater than a power of two
   110  	// > (e.g., 33) would also be easy to optimize, but might produce too
   111  	// > "simple" an arrangement: mostly a juxtaposition of two copies
   112  	// > of the original set of bits, with a little mixing in the middle.
   113  	// > So you want an odd MULT that has plenty of one-bits.
   114  
   115  	code := evalengine.HashCode(17)
   116  	for i, checkCol := range pt.checkCols {
   117  		if i >= len(inputRow) {
   118  			return 0, vterrors.VT13001("index out of range in row when creating the DISTINCT hash code")
   119  		}
   120  		col := inputRow[checkCol.Col]
   121  		hashcode, err := evalengine.NullsafeHashcode(col, checkCol.Collation, col.Type())
   122  		if err != nil {
   123  			if err != evalengine.UnsupportedCollationHashError || checkCol.WsCol == nil {
   124  				return 0, err
   125  			}
   126  			checkCol = checkCol.SwitchToWeightString()
   127  			pt.checkCols[i] = checkCol
   128  			hashcode, err = evalengine.NullsafeHashcode(inputRow[checkCol.Col], checkCol.Collation, col.Type())
   129  			if err != nil {
   130  				return 0, err
   131  			}
   132  		}
   133  		code = code*31 + hashcode
   134  	}
   135  	return code, nil
   136  }
   137  
   138  func (pt *probeTable) equal(a, b sqltypes.Row) (bool, error) {
   139  	for i, checkCol := range pt.checkCols {
   140  		cmp, err := evalengine.NullsafeCompare(a[i], b[i], checkCol.Collation)
   141  		if err != nil {
   142  			_, isComparisonErr := err.(evalengine.UnsupportedComparisonError)
   143  			if !isComparisonErr || checkCol.WsCol == nil {
   144  				return false, err
   145  			}
   146  			checkCol = checkCol.SwitchToWeightString()
   147  			pt.checkCols[i] = checkCol
   148  			cmp, err = evalengine.NullsafeCompare(a[i], b[i], checkCol.Collation)
   149  			if err != nil {
   150  				return false, err
   151  			}
   152  		}
   153  		if cmp != 0 {
   154  			return false, nil
   155  		}
   156  	}
   157  	return true, nil
   158  }
   159  
   160  func newProbeTable(checkCols []CheckCol) *probeTable {
   161  	cols := make([]CheckCol, len(checkCols))
   162  	copy(cols, checkCols)
   163  	return &probeTable{
   164  		seenRows:  map[uintptr][]sqltypes.Row{},
   165  		checkCols: cols,
   166  	}
   167  }
   168  
   169  // TryExecute implements the Primitive interface
   170  func (d *Distinct) TryExecute(ctx context.Context, vcursor VCursor, bindVars map[string]*querypb.BindVariable, wantfields bool) (*sqltypes.Result, error) {
   171  	input, err := vcursor.ExecutePrimitive(ctx, d.Source, bindVars, wantfields)
   172  	if err != nil {
   173  		return nil, err
   174  	}
   175  
   176  	result := &sqltypes.Result{
   177  		Fields:   input.Fields,
   178  		InsertID: input.InsertID,
   179  	}
   180  
   181  	pt := newProbeTable(d.CheckCols)
   182  
   183  	for _, row := range input.Rows {
   184  		exists, err := pt.exists(row)
   185  		if err != nil {
   186  			return nil, err
   187  		}
   188  		if !exists {
   189  			result.Rows = append(result.Rows, row)
   190  		}
   191  	}
   192  	if d.Truncate {
   193  		return result.Truncate(len(d.CheckCols)), nil
   194  	}
   195  	return result, err
   196  }
   197  
   198  // TryStreamExecute implements the Primitive interface
   199  func (d *Distinct) TryStreamExecute(ctx context.Context, vcursor VCursor, bindVars map[string]*querypb.BindVariable, wantfields bool, callback func(*sqltypes.Result) error) error {
   200  	pt := newProbeTable(d.CheckCols)
   201  
   202  	err := vcursor.StreamExecutePrimitive(ctx, d.Source, bindVars, wantfields, func(input *sqltypes.Result) error {
   203  		result := &sqltypes.Result{
   204  			Fields:   input.Fields,
   205  			InsertID: input.InsertID,
   206  		}
   207  		for _, row := range input.Rows {
   208  			exists, err := pt.exists(row)
   209  			if err != nil {
   210  				return err
   211  			}
   212  			if !exists {
   213  				result.Rows = append(result.Rows, row)
   214  			}
   215  		}
   216  		return callback(result.Truncate(len(d.CheckCols)))
   217  	})
   218  
   219  	return err
   220  }
   221  
   222  // RouteType implements the Primitive interface
   223  func (d *Distinct) RouteType() string {
   224  	return d.Source.RouteType()
   225  }
   226  
   227  // GetKeyspaceName implements the Primitive interface
   228  func (d *Distinct) GetKeyspaceName() string {
   229  	return d.Source.GetKeyspaceName()
   230  }
   231  
   232  // GetTableName implements the Primitive interface
   233  func (d *Distinct) GetTableName() string {
   234  	return d.Source.GetTableName()
   235  }
   236  
   237  // GetFields implements the Primitive interface
   238  func (d *Distinct) GetFields(ctx context.Context, vcursor VCursor, bindVars map[string]*querypb.BindVariable) (*sqltypes.Result, error) {
   239  	return d.Source.GetFields(ctx, vcursor, bindVars)
   240  }
   241  
   242  // NeedsTransaction implements the Primitive interface
   243  func (d *Distinct) NeedsTransaction() bool {
   244  	return d.Source.NeedsTransaction()
   245  }
   246  
   247  // Inputs implements the Primitive interface
   248  func (d *Distinct) Inputs() []Primitive {
   249  	return []Primitive{d.Source}
   250  }
   251  
   252  func (d *Distinct) description() PrimitiveDescription {
   253  	other := map[string]any{}
   254  
   255  	var colls []string
   256  	for _, checkCol := range d.CheckCols {
   257  		colls = append(colls, checkCol.String())
   258  	}
   259  	if colls != nil {
   260  		other["Collations"] = colls
   261  	}
   262  
   263  	if d.Truncate {
   264  		other["ResultColumns"] = len(d.CheckCols)
   265  	}
   266  	return PrimitiveDescription{
   267  		Other:        other,
   268  		OperatorType: "Distinct",
   269  	}
   270  }
   271  
   272  // SwitchToWeightString returns a new CheckCol that works on the weight string column instead
   273  func (cc CheckCol) SwitchToWeightString() CheckCol {
   274  	return CheckCol{
   275  		Col:       *cc.WsCol,
   276  		WsCol:     nil,
   277  		Collation: collations.CollationBinaryID,
   278  	}
   279  }
   280  
   281  func (cc CheckCol) String() string {
   282  	coll := collations.Local().LookupByID(cc.Collation)
   283  	var collation string
   284  	if coll != nil {
   285  		collation = ": " + coll.Name()
   286  	}
   287  
   288  	var column string
   289  	if cc.WsCol == nil {
   290  		column = fmt.Sprintf("%d", cc.Col)
   291  	} else {
   292  		column = fmt.Sprintf("(%d:%d)", cc.Col, *cc.WsCol)
   293  	}
   294  	return column + collation
   295  }