github.com/dolthub/go-mysql-server@v0.18.0/sql/expression/matchagainst.go (about) 1 // Copyright 2023 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package expression 16 17 import ( 18 "fmt" 19 "math" 20 "strings" 21 "sync" 22 23 "github.com/dolthub/go-mysql-server/sql" 24 "github.com/dolthub/go-mysql-server/sql/fulltext" 25 "github.com/dolthub/go-mysql-server/sql/types" 26 ) 27 28 // MatchAgainst reads from the tables that create a Full-Text index, and returns a relevancy for each row that is passed 29 // into it. Within the context of a filter, these relevancy values will be used to filter out rows, as a relevancy > 0 30 // is a match. Within the context of a SELECT expression, the relevancy value is returned as-is. An index may use the 31 // tables provided by the expression to reduce the searchable set of tables, however this is performed as a separate step 32 // that is not directly tied to this expression. This expression's purpose is solely to calculate relevancy values. 33 type MatchAgainst struct { 34 Columns []sql.Expression 35 Expr sql.Expression 36 SearchModifier fulltext.SearchModifier 37 38 ftIndex fulltext.Index 39 KeyCols fulltext.KeyColumns 40 ParentTable sql.IndexAddressableTable 41 ConfigTable sql.IndexAddressableTable 42 PositionTable sql.IndexAddressableTable 43 DocCountTable sql.IndexAddressableTable 44 GlobalCountTable sql.IndexAddressableTable 45 RowCountTable sql.IndexAddressableTable 46 47 once sync.Once 48 expectedRowLen int 49 evaluatedString string 50 parser fulltext.DefaultParser 51 docCountIndex sql.Index 52 globalCountIndex sql.Index 53 rowCountIndex sql.Index 54 parentRowCount uint64 55 } 56 57 var _ sql.Expression = (*MatchAgainst)(nil) 58 59 // NewMatchAgainst creates a new *MatchAgainst expression. 60 func NewMatchAgainst(columns []sql.Expression, expr sql.Expression, searchModifier fulltext.SearchModifier) *MatchAgainst { 61 return &MatchAgainst{ 62 Columns: columns, 63 Expr: expr, 64 SearchModifier: searchModifier, 65 ftIndex: nil, 66 KeyCols: fulltext.KeyColumns{}, 67 ParentTable: nil, 68 ConfigTable: nil, 69 PositionTable: nil, 70 DocCountTable: nil, 71 GlobalCountTable: nil, 72 RowCountTable: nil, 73 expectedRowLen: 0, 74 } 75 } 76 77 // Children implements sql.Expression 78 func (expr *MatchAgainst) Children() []sql.Expression { 79 exprs := make([]sql.Expression, len(expr.Columns)+1) 80 copy(exprs, expr.Columns) 81 exprs[len(exprs)-1] = expr.Expr 82 return exprs 83 } 84 85 // Eval implements sql.Expression 86 func (expr *MatchAgainst) Eval(ctx *sql.Context, row sql.Row) (interface{}, error) { 87 row = row[:expr.expectedRowLen] 88 switch expr.SearchModifier { 89 case fulltext.SearchModifier_NaturalLanguage: 90 return expr.inNaturalLanguageMode(ctx, row) 91 case fulltext.SearchModifier_NaturalLangaugeQueryExpansion: 92 return expr.inNaturalLanguageModeWithQueryExpansion(ctx, row) 93 case fulltext.SearchModifier_Boolean: 94 return expr.inBooleanMode(ctx, row) 95 case fulltext.SearchModifier_QueryExpansion: 96 return expr.withQueryExpansion(ctx, row) 97 default: 98 panic("invalid MATCH...AGAINST search modifier") 99 } 100 } 101 102 // IsNullable implements sql.Expression 103 func (expr *MatchAgainst) IsNullable() bool { 104 return false 105 } 106 107 // Resolved implements sql.Expression 108 func (expr *MatchAgainst) Resolved() bool { 109 for _, col := range expr.Columns { 110 if !col.Resolved() { 111 return false 112 } 113 } 114 return expr.Expr.Resolved() 115 } 116 117 // String implements sql.Expression 118 func (expr *MatchAgainst) String() string { 119 var searchModifierStr string 120 switch expr.SearchModifier { 121 case fulltext.SearchModifier_NaturalLanguage: 122 searchModifierStr = "IN NATURAL LANGUAGE MODE" 123 case fulltext.SearchModifier_NaturalLangaugeQueryExpansion: 124 searchModifierStr = "IN NATURAL LANGUAGE MODE WITH QUERY EXPANSION" 125 case fulltext.SearchModifier_Boolean: 126 searchModifierStr = "IN BOOLEAN MODE" 127 case fulltext.SearchModifier_QueryExpansion: 128 searchModifierStr = "WITH QUERY EXPANSION" 129 default: 130 panic("invalid MATCH...AGAINST search modifier") 131 } 132 columns := make([]string, len(expr.Columns)) 133 for i := range expr.Columns { 134 columns[i] = expr.Columns[i].String() 135 } 136 return fmt.Sprintf("MATCH (%s) AGAINST (%s %s)", strings.Join(columns, ","), expr.Expr.String(), searchModifierStr) 137 } 138 139 // Type implements sql.Expression 140 func (expr *MatchAgainst) Type() sql.Type { 141 return types.Float32 142 } 143 144 // WithChildren implements sql.Expression 145 func (expr *MatchAgainst) WithChildren(children ...sql.Expression) (sql.Expression, error) { 146 if len(children) != len(expr.Columns)+1 { 147 return nil, sql.ErrInvalidChildrenNumber.New(expr, len(children), len(expr.Columns)+1) 148 } 149 columns := make([]sql.Expression, len(children)-1) 150 copy(columns, children) 151 return &MatchAgainst{ 152 Columns: columns, 153 Expr: children[len(children)-1], 154 SearchModifier: expr.SearchModifier, 155 ftIndex: expr.ftIndex, 156 KeyCols: expr.KeyCols, 157 ParentTable: expr.ParentTable, 158 ConfigTable: expr.ConfigTable, 159 PositionTable: expr.PositionTable, 160 DocCountTable: expr.DocCountTable, 161 GlobalCountTable: expr.GlobalCountTable, 162 RowCountTable: expr.RowCountTable, 163 expectedRowLen: expr.expectedRowLen, 164 }, nil 165 } 166 167 // WithInfo returns a new *MatchAgainst with the given tables and other needed information to perform matching. 168 func (expr *MatchAgainst) WithInfo(parent, config, position, docCount, globalCount, rowCount sql.IndexAddressableTable, keyCols fulltext.KeyColumns) *MatchAgainst { 169 return &MatchAgainst{ 170 Columns: expr.Columns, 171 Expr: expr.Expr, 172 SearchModifier: expr.SearchModifier, 173 ftIndex: expr.ftIndex, 174 KeyCols: keyCols, 175 ParentTable: parent, 176 ConfigTable: config, 177 PositionTable: position, 178 DocCountTable: docCount, 179 GlobalCountTable: globalCount, 180 RowCountTable: rowCount, 181 expectedRowLen: len(parent.Schema()), 182 } 183 } 184 185 // GetIndex returns the relevant Full-Text index for this expression, or nil if it has not yet been set. 186 func (expr *MatchAgainst) GetIndex() fulltext.Index { 187 return expr.ftIndex 188 } 189 190 // SetIndex sets the index for this expression. This does not create and return a new expression, which differs from the 191 // "With" functions. 192 func (expr *MatchAgainst) SetIndex(fulltextIndex fulltext.Index) { 193 if fulltextIndex == nil { 194 return 195 } 196 expr.ftIndex = fulltextIndex 197 } 198 199 // ColumnsAsGetFields returns the columns as *GetField expressions. If the columns have not yet been resolved, then this 200 // returns a nil (empty) slice. 201 func (expr *MatchAgainst) ColumnsAsGetFields() []*GetField { 202 var ok bool 203 fields := make([]*GetField, len(expr.Columns)) 204 for i, col := range expr.Columns { 205 fields[i], ok = col.(*GetField) 206 if !ok { 207 return nil 208 } 209 } 210 return fields 211 } 212 213 // inNaturalLanguageMode calculates the relevancy using "IN NATURAL LANGUAGE MODE" (default mode). The returned float 214 // value is the relevancy. When used under a FILTER node, a non-zero result is interpreted as "true", while a zero result 215 // is interpreted as false. It is assumed that incoming rows will exactly match the schema of the parent table, meaning 216 // that we cannot take projected rows. 217 func (expr *MatchAgainst) inNaturalLanguageMode(ctx *sql.Context, row sql.Row) (float32, error) { 218 // The general flow of this function is as follows: 219 // 1) Perform the one-time setup by evaluating the match expression (string literal) and constructing a parser. 220 // a) Evaluate the match expression, which should be a string literal. 221 // b) Construct a parser over the evaluated literal, so that we may match against multiple words. 222 // c) Cache the indexes that will be used in our searches. 223 // 2) Reset the parser, so that we may iterate over the evaluated literal for each input row. 224 // 3) Iterate over each unique word from our evaluated literal. 225 // 4) Construct a lookup on the document count and global count tables using the word and key (constructed from the input row). 226 // 5) If entries were found in the tables, then calculate the relevancy. We'll loop back to #3 until we've exhausted our words. 227 // 6) Return the sum of all relevancy calculations. 228 var err error 229 expr.once.Do(func() { 230 // Evaluate the expression, which should always result in a string literal 231 words, nErr := expr.Expr.Eval(ctx, nil) 232 if nErr != nil { 233 err = nErr 234 return 235 } 236 wordsStr, ok := words.(string) 237 if !ok { 238 if words != nil { 239 err = fmt.Errorf("expected WORD to be a string, but had type `%T`", words) 240 } 241 } 242 expr.evaluatedString = wordsStr 243 // Grab the index for the doc count table 244 docCountIndexes, nErr := expr.DocCountTable.GetIndexes(ctx) 245 if nErr != nil { 246 err = nErr 247 return 248 } 249 if len(docCountIndexes) != 1 || docCountIndexes[0].ID() != "PRIMARY" { 250 err = fmt.Errorf("expected to find a primary key on the table `%s`", expr.DocCountTable.Name()) 251 } 252 expr.docCountIndex = docCountIndexes[0] 253 // Grab the index for the global count table 254 globalCountIndexes, nErr := expr.GlobalCountTable.GetIndexes(ctx) 255 if nErr != nil { 256 err = nErr 257 return 258 } 259 if len(globalCountIndexes) != 1 || globalCountIndexes[0].ID() != "PRIMARY" { 260 err = fmt.Errorf("expected to find a primary key on the table `%s`", expr.GlobalCountTable.Name()) 261 } 262 expr.globalCountIndex = globalCountIndexes[0] 263 // Grab the index for the row count table 264 rowCountIndexes, nErr := expr.RowCountTable.GetIndexes(ctx) 265 if nErr != nil { 266 err = nErr 267 return 268 } 269 if len(rowCountIndexes) != 1 || rowCountIndexes[0].ID() != "PRIMARY" { 270 err = fmt.Errorf("expected to find a primary key on the table `%s`", expr.RowCountTable.Name()) 271 } 272 expr.rowCountIndex = rowCountIndexes[0] 273 // Create the parser now since it does a lot of preprocessing. We'll reset the iterators every call. 274 expr.parser, nErr = fulltext.NewDefaultParser(ctx, fulltext.GetCollationFromSchema(ctx, expr.DocCountTable.Schema()), wordsStr) 275 if nErr != nil { 276 err = nErr 277 return 278 } 279 // Load the number of rows from the parent table, since it's used in the relevancy calculation 280 expr.parentRowCount, _, nErr = expr.ParentTable.(sql.StatisticsTable).RowCount(ctx) 281 if nErr != nil { 282 err = nErr 283 return 284 } 285 }) 286 if err != nil { 287 return 0, err 288 } 289 290 accumulatedRelevancy := float32(0) 291 hash, err := fulltext.HashRow(row) 292 if err != nil { 293 return 0, err 294 } 295 296 expr.parser.Reset() 297 wordStr, reachedTheEnd, err := expr.parser.NextUnique(ctx) 298 for ; err == nil && !reachedTheEnd; wordStr, reachedTheEnd, err = expr.parser.NextUnique(ctx) { 299 // We'll look for this word within the doc count table, so that we can: 300 // 1) Ensure that there's a match 301 // 2) Grab the count to use in the relevancy calculation 302 var lookup sql.IndexLookup 303 if expr.KeyCols.Type != fulltext.KeyType_None { 304 ranges := make(sql.Range, 1+len(expr.KeyCols.Positions)) 305 ranges[0] = sql.ClosedRangeColumnExpr(wordStr, wordStr, expr.DocCountTable.Schema()[0].Type) 306 for i, keyColPos := range expr.KeyCols.Positions { 307 ranges[i+1] = sql.ClosedRangeColumnExpr(row[keyColPos], row[keyColPos], expr.DocCountTable.Schema()[i+1].Type) 308 } 309 lookup = sql.IndexLookup{Ranges: []sql.Range{ranges}, Index: expr.docCountIndex} 310 } else { 311 lookup = sql.IndexLookup{Ranges: []sql.Range{ 312 { 313 sql.ClosedRangeColumnExpr(wordStr, wordStr, expr.DocCountTable.Schema()[0].Type), 314 sql.ClosedRangeColumnExpr(hash, hash, fulltext.SchemaRowCount[0].Type), 315 }, 316 }, Index: expr.docCountIndex} 317 } 318 319 editorData := expr.DocCountTable.IndexedAccess(lookup) 320 if err != nil { 321 return 0, err 322 } 323 324 partIter, err := editorData.LookupPartitions(ctx, lookup) 325 if err != nil { 326 return 0, err 327 } 328 docCountRows, err := sql.RowIterToRows(ctx, sql.NewTableRowIter(ctx, editorData, partIter)) 329 if err != nil { 330 return 0, err 331 } 332 if len(docCountRows) == 0 { 333 // This did not match, so we continue 334 continue 335 } else if len(docCountRows) > 1 { 336 return 0, fmt.Errorf("somehow there are duplicate entries within the Full-Text doc count table") 337 } 338 docCountRow := docCountRows[0] 339 docCount := float64(docCountRow[len(docCountRow)-1].(uint64)) 340 if docCount == 0 { 341 // We've got an empty document count, so the word does not match (so it should have been deleted) 342 continue 343 } 344 345 // Otherwise, we've found a match, so we'll grab the global count as well 346 lookup = sql.IndexLookup{Ranges: []sql.Range{ 347 { 348 sql.ClosedRangeColumnExpr(wordStr, wordStr, expr.GlobalCountTable.Schema()[0].Type), 349 }, 350 }, Index: expr.globalCountIndex} 351 editorData = expr.GlobalCountTable.IndexedAccess(lookup) 352 if err != nil { 353 return 0, err 354 } 355 356 partIter, err = editorData.LookupPartitions(ctx, lookup) 357 if err != nil { 358 return 0, err 359 } 360 globalCountRows, err := sql.RowIterToRows(ctx, sql.NewTableRowIter(ctx, editorData, partIter)) 361 if err != nil { 362 return 0, err 363 } 364 if len(globalCountRows) == 0 { 365 continue 366 } else if len(globalCountRows) > 1 { 367 return 0, fmt.Errorf("somehow there are duplicate entries within the Full-Text global count table") 368 } 369 globalCountRow := globalCountRows[0] 370 371 // Lastly, grab the number of unique words within this row from the row count 372 lookup = sql.IndexLookup{Ranges: []sql.Range{ 373 { 374 sql.ClosedRangeColumnExpr(hash, hash, expr.RowCountTable.Schema()[0].Type), 375 }, 376 }, Index: expr.rowCountIndex} 377 editorData = expr.RowCountTable.IndexedAccess(lookup) 378 if err != nil { 379 return 0, err 380 } 381 382 partIter, err = editorData.LookupPartitions(ctx, lookup) 383 if err != nil { 384 return 0, err 385 } 386 rowCountRows, err := sql.RowIterToRows(ctx, sql.NewTableRowIter(ctx, editorData, partIter)) 387 if err != nil { 388 return 0, err 389 } 390 if len(rowCountRows) == 0 { 391 continue 392 } else if len(rowCountRows) > 1 { 393 return 0, fmt.Errorf("somehow there are duplicate entries within the Full-Text row count table") 394 } 395 rowCountRow := rowCountRows[0] 396 397 // Calculate the relevancy (partially based on an old MySQL implementation) 398 // https://web.archive.org/web/20220122170304/http://dev.mysql.com/doc/internals/en/full-text-search.html 399 globalCount := float64(globalCountRow[len(globalCountRow)-1].(uint64)) 400 uniqueWords := float64(rowCountRow[2].(uint64)) 401 base := math.Log(docCount) + 1 402 normFactor := uniqueWords / (1 + 0.115*uniqueWords) 403 globalMult := math.Log(float64(expr.parentRowCount)/globalCount) + 1 404 accumulatedRelevancy += float32(base * normFactor * globalMult) 405 } 406 if err != nil { 407 return 0, err 408 } 409 // Due to how we handle floating to bool conversion, we need to add 0.5 if the result is positive 410 if accumulatedRelevancy > 0 { 411 accumulatedRelevancy += 0.5 412 } 413 // Return the accumulated relevancy from all of the parsed words 414 return accumulatedRelevancy, nil 415 } 416 417 // inNaturalLanguageModeWithQueryExpansion calculates the result using "IN NATURAL LANGUAGE MODE WITH QUERY EXPANSION". 418 func (expr *MatchAgainst) inNaturalLanguageModeWithQueryExpansion(ctx *sql.Context, row sql.Row) (float32, error) { 419 return 0, fmt.Errorf("'IN NATURAL LANGUAGE MODE WITH QUERY EXPANSION' has not yet been implemented") 420 } 421 422 // inBooleanMode calculates the result using "IN BOOLEAN MODE". 423 func (expr *MatchAgainst) inBooleanMode(ctx *sql.Context, row sql.Row) (float32, error) { 424 return 0, fmt.Errorf("'IN BOOLEAN MODE' has not yet been implemented") 425 } 426 427 // withQueryExpansion calculates the result using "WITH QUERY EXPANSION". 428 func (expr *MatchAgainst) withQueryExpansion(ctx *sql.Context, row sql.Row) (float32, error) { 429 return 0, fmt.Errorf("'WITH QUERY EXPANSION' has not yet been implemented") 430 }