vitess.io/vitess@v0.16.2/go/vt/vtgate/engine/distinct.go (about) 1 /* 2 Copyright 2020 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package engine 18 19 import ( 20 "context" 21 "fmt" 22 23 "vitess.io/vitess/go/mysql/collations" 24 "vitess.io/vitess/go/sqltypes" 25 querypb "vitess.io/vitess/go/vt/proto/query" 26 "vitess.io/vitess/go/vt/vterrors" 27 "vitess.io/vitess/go/vt/vtgate/evalengine" 28 ) 29 30 // Distinct Primitive is used to uniqueify results 31 var _ Primitive = (*Distinct)(nil) 32 33 type ( 34 // Distinct Primitive is used to uniqueify results 35 Distinct struct { 36 Source Primitive 37 CheckCols []CheckCol 38 Truncate bool 39 } 40 CheckCol struct { 41 Col int 42 WsCol *int 43 Collation collations.ID 44 } 45 probeTable struct { 46 seenRows map[evalengine.HashCode][]sqltypes.Row 47 checkCols []CheckCol 48 } 49 ) 50 51 func (pt *probeTable) exists(inputRow sqltypes.Row) (bool, error) { 52 // the two prime numbers used here (17 and 31) are used to 53 // calculate hashcode from all column values in the input sqltypes.Row 54 code, err := pt.hashCodeForRow(inputRow) 55 if err != nil { 56 return false, err 57 } 58 59 existingRows, found := pt.seenRows[code] 60 if !found { 61 // nothing with this hash code found, we can be sure it's a not seen sqltypes.Row 62 pt.seenRows[code] = []sqltypes.Row{inputRow} 63 return false, nil 64 } 65 66 // we found something in the map - still need to check all individual values 67 // so we don't just fall for a hash collision 68 for _, existingRow := range existingRows { 69 exists, err := pt.equal(existingRow, inputRow) 70 if err != nil { 71 return false, err 72 } 73 if exists { 74 return true, nil 75 } 76 } 77 78 pt.seenRows[code] = append(existingRows, inputRow) 79 80 return false, nil 81 } 82 83 func (pt *probeTable) hashCodeForRow(inputRow sqltypes.Row) (evalengine.HashCode, error) { 84 // Why use 17 and 31 in this method? 85 // Copied from an old usenet discussion on the topic: 86 // https://groups.google.com/g/comp.programming/c/HSurZEyrZ1E?pli=1#d887b5bdb2dac99d 87 // > It's a mixture of superstition and good sense. 88 // > Suppose the multiplier were 26, and consider 89 // > hashing a hundred-character string. How much influence does 90 // > the string's first character have on the final value of `h', 91 // > just before the mod operation? The first character's value 92 // > will have been multiplied by MULT 99 times, so if the arithmetic 93 // > were done in infinite precision the value would consist of some 94 // > jumble of bits followed by 99 low-order zero bits -- each time 95 // > you multiply by MULT you introduce another low-order zero, right? 96 // > The computer's finite arithmetic just chops away all the excess 97 // > high-order bits, so the first character's actual contribution to 98 // > `h' is ... precisely zero! The `h' value depends only on the 99 // > rightmost 32 string characters (assuming a 32-bit int), and even 100 // > then things are not wonderful: the first of those final 32 bytes 101 // > influences only the leftmost bit of `h' and has no effect on 102 // > the remaining 31. Clearly, an even-valued MULT is a poor idea. 103 // > 104 // > Need MULT be prime? Not as far as I know (I don't know 105 // > everything); any odd value ought to suffice. 31 may be attractive 106 // > because it is close to a power of two, and it may be easier for 107 // > the compiler to replace a possibly slow multiply instruction with 108 // > a shift and subtract (31*x == (x << 5) - x) on machines where it 109 // > makes a difference. Setting MULT one greater than a power of two 110 // > (e.g., 33) would also be easy to optimize, but might produce too 111 // > "simple" an arrangement: mostly a juxtaposition of two copies 112 // > of the original set of bits, with a little mixing in the middle. 113 // > So you want an odd MULT that has plenty of one-bits. 114 115 code := evalengine.HashCode(17) 116 for i, checkCol := range pt.checkCols { 117 if i >= len(inputRow) { 118 return 0, vterrors.VT13001("index out of range in row when creating the DISTINCT hash code") 119 } 120 col := inputRow[checkCol.Col] 121 hashcode, err := evalengine.NullsafeHashcode(col, checkCol.Collation, col.Type()) 122 if err != nil { 123 if err != evalengine.UnsupportedCollationHashError || checkCol.WsCol == nil { 124 return 0, err 125 } 126 checkCol = checkCol.SwitchToWeightString() 127 pt.checkCols[i] = checkCol 128 hashcode, err = evalengine.NullsafeHashcode(inputRow[checkCol.Col], checkCol.Collation, col.Type()) 129 if err != nil { 130 return 0, err 131 } 132 } 133 code = code*31 + hashcode 134 } 135 return code, nil 136 } 137 138 func (pt *probeTable) equal(a, b sqltypes.Row) (bool, error) { 139 for i, checkCol := range pt.checkCols { 140 cmp, err := evalengine.NullsafeCompare(a[i], b[i], checkCol.Collation) 141 if err != nil { 142 _, isComparisonErr := err.(evalengine.UnsupportedComparisonError) 143 if !isComparisonErr || checkCol.WsCol == nil { 144 return false, err 145 } 146 checkCol = checkCol.SwitchToWeightString() 147 pt.checkCols[i] = checkCol 148 cmp, err = evalengine.NullsafeCompare(a[i], b[i], checkCol.Collation) 149 if err != nil { 150 return false, err 151 } 152 } 153 if cmp != 0 { 154 return false, nil 155 } 156 } 157 return true, nil 158 } 159 160 func newProbeTable(checkCols []CheckCol) *probeTable { 161 cols := make([]CheckCol, len(checkCols)) 162 copy(cols, checkCols) 163 return &probeTable{ 164 seenRows: map[uintptr][]sqltypes.Row{}, 165 checkCols: cols, 166 } 167 } 168 169 // TryExecute implements the Primitive interface 170 func (d *Distinct) TryExecute(ctx context.Context, vcursor VCursor, bindVars map[string]*querypb.BindVariable, wantfields bool) (*sqltypes.Result, error) { 171 input, err := vcursor.ExecutePrimitive(ctx, d.Source, bindVars, wantfields) 172 if err != nil { 173 return nil, err 174 } 175 176 result := &sqltypes.Result{ 177 Fields: input.Fields, 178 InsertID: input.InsertID, 179 } 180 181 pt := newProbeTable(d.CheckCols) 182 183 for _, row := range input.Rows { 184 exists, err := pt.exists(row) 185 if err != nil { 186 return nil, err 187 } 188 if !exists { 189 result.Rows = append(result.Rows, row) 190 } 191 } 192 if d.Truncate { 193 return result.Truncate(len(d.CheckCols)), nil 194 } 195 return result, err 196 } 197 198 // TryStreamExecute implements the Primitive interface 199 func (d *Distinct) TryStreamExecute(ctx context.Context, vcursor VCursor, bindVars map[string]*querypb.BindVariable, wantfields bool, callback func(*sqltypes.Result) error) error { 200 pt := newProbeTable(d.CheckCols) 201 202 err := vcursor.StreamExecutePrimitive(ctx, d.Source, bindVars, wantfields, func(input *sqltypes.Result) error { 203 result := &sqltypes.Result{ 204 Fields: input.Fields, 205 InsertID: input.InsertID, 206 } 207 for _, row := range input.Rows { 208 exists, err := pt.exists(row) 209 if err != nil { 210 return err 211 } 212 if !exists { 213 result.Rows = append(result.Rows, row) 214 } 215 } 216 return callback(result.Truncate(len(d.CheckCols))) 217 }) 218 219 return err 220 } 221 222 // RouteType implements the Primitive interface 223 func (d *Distinct) RouteType() string { 224 return d.Source.RouteType() 225 } 226 227 // GetKeyspaceName implements the Primitive interface 228 func (d *Distinct) GetKeyspaceName() string { 229 return d.Source.GetKeyspaceName() 230 } 231 232 // GetTableName implements the Primitive interface 233 func (d *Distinct) GetTableName() string { 234 return d.Source.GetTableName() 235 } 236 237 // GetFields implements the Primitive interface 238 func (d *Distinct) GetFields(ctx context.Context, vcursor VCursor, bindVars map[string]*querypb.BindVariable) (*sqltypes.Result, error) { 239 return d.Source.GetFields(ctx, vcursor, bindVars) 240 } 241 242 // NeedsTransaction implements the Primitive interface 243 func (d *Distinct) NeedsTransaction() bool { 244 return d.Source.NeedsTransaction() 245 } 246 247 // Inputs implements the Primitive interface 248 func (d *Distinct) Inputs() []Primitive { 249 return []Primitive{d.Source} 250 } 251 252 func (d *Distinct) description() PrimitiveDescription { 253 other := map[string]any{} 254 255 var colls []string 256 for _, checkCol := range d.CheckCols { 257 colls = append(colls, checkCol.String()) 258 } 259 if colls != nil { 260 other["Collations"] = colls 261 } 262 263 if d.Truncate { 264 other["ResultColumns"] = len(d.CheckCols) 265 } 266 return PrimitiveDescription{ 267 Other: other, 268 OperatorType: "Distinct", 269 } 270 } 271 272 // SwitchToWeightString returns a new CheckCol that works on the weight string column instead 273 func (cc CheckCol) SwitchToWeightString() CheckCol { 274 return CheckCol{ 275 Col: *cc.WsCol, 276 WsCol: nil, 277 Collation: collations.CollationBinaryID, 278 } 279 } 280 281 func (cc CheckCol) String() string { 282 coll := collations.Local().LookupByID(cc.Collation) 283 var collation string 284 if coll != nil { 285 collation = ": " + coll.Name() 286 } 287 288 var column string 289 if cc.WsCol == nil { 290 column = fmt.Sprintf("%d", cc.Col) 291 } else { 292 column = fmt.Sprintf("(%d:%d)", cc.Col, *cc.WsCol) 293 } 294 return column + collation 295 }