github.com/dolthub/go-mysql-server@v0.18.0/sql/expression/matchagainst.go (about)

     1  // Copyright 2023 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package expression
    16  
    17  import (
    18  	"fmt"
    19  	"math"
    20  	"strings"
    21  	"sync"
    22  
    23  	"github.com/dolthub/go-mysql-server/sql"
    24  	"github.com/dolthub/go-mysql-server/sql/fulltext"
    25  	"github.com/dolthub/go-mysql-server/sql/types"
    26  )
    27  
// MatchAgainst reads from the tables that create a Full-Text index, and returns a relevancy for each row that is passed
// into it. Within the context of a filter, these relevancy values will be used to filter out rows, as a relevancy > 0
// is a match. Within the context of a SELECT expression, the relevancy value is returned as-is. An index may use the
// tables provided by the expression to reduce the searchable set of tables, however this is performed as a separate step
// that is not directly tied to this expression. This expression's purpose is solely to calculate relevancy values.
type MatchAgainst struct {
	// Columns are the expressions listed inside MATCH (...).
	Columns        []sql.Expression
	// Expr is the expression listed inside AGAINST (...); it is evaluated once, without a row, to obtain the search string.
	Expr           sql.Expression
	// SearchModifier selects which search mode Eval dispatches to.
	SearchModifier fulltext.SearchModifier

	// ftIndex is the Full-Text index assigned through SetIndex; it stays nil until then.
	ftIndex          fulltext.Index
	// KeyCols and the tables below are supplied through WithInfo and are nil/zero before that call.
	KeyCols          fulltext.KeyColumns
	ParentTable      sql.IndexAddressableTable
	ConfigTable      sql.IndexAddressableTable
	PositionTable    sql.IndexAddressableTable
	DocCountTable    sql.IndexAddressableTable
	GlobalCountTable sql.IndexAddressableTable
	RowCountTable    sql.IndexAddressableTable

	// once guards the one-time setup performed by inNaturalLanguageMode, which fills in the cached fields below. Because
	// of this field, a MatchAgainst must not be copied by value.
	once             sync.Once
	// expectedRowLen is the number of columns in the parent table's schema; Eval truncates incoming rows to this length.
	expectedRowLen   int
	// evaluatedString is the result of evaluating Expr.
	evaluatedString  string
	// parser iterates over the words of the evaluated search string and is reset on every evaluation.
	parser           fulltext.DefaultParser
	// docCountIndex, globalCountIndex, and rowCountIndex are the primary key indexes of the matching tables.
	docCountIndex    sql.Index
	globalCountIndex sql.Index
	rowCountIndex    sql.Index
	// parentRowCount is the row count of the parent table, used in the relevancy calculation.
	parentRowCount   uint64
}

// Compile-time check that *MatchAgainst satisfies sql.Expression.
var _ sql.Expression = (*MatchAgainst)(nil)
    58  
    59  // NewMatchAgainst creates a new *MatchAgainst expression.
    60  func NewMatchAgainst(columns []sql.Expression, expr sql.Expression, searchModifier fulltext.SearchModifier) *MatchAgainst {
    61  	return &MatchAgainst{
    62  		Columns:          columns,
    63  		Expr:             expr,
    64  		SearchModifier:   searchModifier,
    65  		ftIndex:          nil,
    66  		KeyCols:          fulltext.KeyColumns{},
    67  		ParentTable:      nil,
    68  		ConfigTable:      nil,
    69  		PositionTable:    nil,
    70  		DocCountTable:    nil,
    71  		GlobalCountTable: nil,
    72  		RowCountTable:    nil,
    73  		expectedRowLen:   0,
    74  	}
    75  }
    76  
    77  // Children implements sql.Expression
    78  func (expr *MatchAgainst) Children() []sql.Expression {
    79  	exprs := make([]sql.Expression, len(expr.Columns)+1)
    80  	copy(exprs, expr.Columns)
    81  	exprs[len(exprs)-1] = expr.Expr
    82  	return exprs
    83  }
    84  
    85  // Eval implements sql.Expression
    86  func (expr *MatchAgainst) Eval(ctx *sql.Context, row sql.Row) (interface{}, error) {
    87  	row = row[:expr.expectedRowLen]
    88  	switch expr.SearchModifier {
    89  	case fulltext.SearchModifier_NaturalLanguage:
    90  		return expr.inNaturalLanguageMode(ctx, row)
    91  	case fulltext.SearchModifier_NaturalLangaugeQueryExpansion:
    92  		return expr.inNaturalLanguageModeWithQueryExpansion(ctx, row)
    93  	case fulltext.SearchModifier_Boolean:
    94  		return expr.inBooleanMode(ctx, row)
    95  	case fulltext.SearchModifier_QueryExpansion:
    96  		return expr.withQueryExpansion(ctx, row)
    97  	default:
    98  		panic("invalid MATCH...AGAINST search modifier")
    99  	}
   100  }
   101  
// IsNullable implements sql.Expression. It always reports false.
func (expr *MatchAgainst) IsNullable() bool {
	return false
}
   106  
   107  // Resolved implements sql.Expression
   108  func (expr *MatchAgainst) Resolved() bool {
   109  	for _, col := range expr.Columns {
   110  		if !col.Resolved() {
   111  			return false
   112  		}
   113  	}
   114  	return expr.Expr.Resolved()
   115  }
   116  
   117  // String implements sql.Expression
   118  func (expr *MatchAgainst) String() string {
   119  	var searchModifierStr string
   120  	switch expr.SearchModifier {
   121  	case fulltext.SearchModifier_NaturalLanguage:
   122  		searchModifierStr = "IN NATURAL LANGUAGE MODE"
   123  	case fulltext.SearchModifier_NaturalLangaugeQueryExpansion:
   124  		searchModifierStr = "IN NATURAL LANGUAGE MODE WITH QUERY EXPANSION"
   125  	case fulltext.SearchModifier_Boolean:
   126  		searchModifierStr = "IN BOOLEAN MODE"
   127  	case fulltext.SearchModifier_QueryExpansion:
   128  		searchModifierStr = "WITH QUERY EXPANSION"
   129  	default:
   130  		panic("invalid MATCH...AGAINST search modifier")
   131  	}
   132  	columns := make([]string, len(expr.Columns))
   133  	for i := range expr.Columns {
   134  		columns[i] = expr.Columns[i].String()
   135  	}
   136  	return fmt.Sprintf("MATCH (%s) AGAINST (%s %s)", strings.Join(columns, ","), expr.Expr.String(), searchModifierStr)
   137  }
   138  
// Type implements sql.Expression. The relevancy produced by this expression is always reported as types.Float32.
func (expr *MatchAgainst) Type() sql.Type {
	return types.Float32
}
   143  
   144  // WithChildren implements sql.Expression
   145  func (expr *MatchAgainst) WithChildren(children ...sql.Expression) (sql.Expression, error) {
   146  	if len(children) != len(expr.Columns)+1 {
   147  		return nil, sql.ErrInvalidChildrenNumber.New(expr, len(children), len(expr.Columns)+1)
   148  	}
   149  	columns := make([]sql.Expression, len(children)-1)
   150  	copy(columns, children)
   151  	return &MatchAgainst{
   152  		Columns:          columns,
   153  		Expr:             children[len(children)-1],
   154  		SearchModifier:   expr.SearchModifier,
   155  		ftIndex:          expr.ftIndex,
   156  		KeyCols:          expr.KeyCols,
   157  		ParentTable:      expr.ParentTable,
   158  		ConfigTable:      expr.ConfigTable,
   159  		PositionTable:    expr.PositionTable,
   160  		DocCountTable:    expr.DocCountTable,
   161  		GlobalCountTable: expr.GlobalCountTable,
   162  		RowCountTable:    expr.RowCountTable,
   163  		expectedRowLen:   expr.expectedRowLen,
   164  	}, nil
   165  }
   166  
   167  // WithInfo returns a new *MatchAgainst with the given tables and other needed information to perform matching.
   168  func (expr *MatchAgainst) WithInfo(parent, config, position, docCount, globalCount, rowCount sql.IndexAddressableTable, keyCols fulltext.KeyColumns) *MatchAgainst {
   169  	return &MatchAgainst{
   170  		Columns:          expr.Columns,
   171  		Expr:             expr.Expr,
   172  		SearchModifier:   expr.SearchModifier,
   173  		ftIndex:          expr.ftIndex,
   174  		KeyCols:          keyCols,
   175  		ParentTable:      parent,
   176  		ConfigTable:      config,
   177  		PositionTable:    position,
   178  		DocCountTable:    docCount,
   179  		GlobalCountTable: globalCount,
   180  		RowCountTable:    rowCount,
   181  		expectedRowLen:   len(parent.Schema()),
   182  	}
   183  }
   184  
// GetIndex returns the relevant Full-Text index for this expression, or nil if it has not yet been set (see
// SetIndex).
func (expr *MatchAgainst) GetIndex() fulltext.Index {
	return expr.ftIndex
}
   189  
   190  // SetIndex sets the index for this expression. This does not create and return a new expression, which differs from the
   191  // "With" functions.
   192  func (expr *MatchAgainst) SetIndex(fulltextIndex fulltext.Index) {
   193  	if fulltextIndex == nil {
   194  		return
   195  	}
   196  	expr.ftIndex = fulltextIndex
   197  }
   198  
   199  // ColumnsAsGetFields returns the columns as *GetField expressions. If the columns have not yet been resolved, then this
   200  // returns a nil (empty) slice.
   201  func (expr *MatchAgainst) ColumnsAsGetFields() []*GetField {
   202  	var ok bool
   203  	fields := make([]*GetField, len(expr.Columns))
   204  	for i, col := range expr.Columns {
   205  		fields[i], ok = col.(*GetField)
   206  		if !ok {
   207  			return nil
   208  		}
   209  	}
   210  	return fields
   211  }
   212  
   213  // inNaturalLanguageMode calculates the relevancy using "IN NATURAL LANGUAGE MODE" (default mode). The returned float
   214  // value is the relevancy. When used under a FILTER node, a non-zero result is interpreted as "true", while a zero result
   215  // is interpreted as false. It is assumed that incoming rows will exactly match the schema of the parent table, meaning
   216  // that we cannot take projected rows.
   217  func (expr *MatchAgainst) inNaturalLanguageMode(ctx *sql.Context, row sql.Row) (float32, error) {
   218  	// The general flow of this function is as follows:
   219  	// 1) Perform the one-time setup by evaluating the match expression (string literal) and constructing a parser.
   220  	//    a) Evaluate the match expression, which should be a string literal.
   221  	//    b) Construct a parser over the evaluated literal, so that we may match against multiple words.
   222  	//    c) Cache the indexes that will be used in our searches.
   223  	// 2) Reset the parser, so that we may iterate over the evaluated literal for each input row.
   224  	// 3) Iterate over each unique word from our evaluated literal.
   225  	// 4) Construct a lookup on the document count and global count tables using the word and key (constructed from the input row).
   226  	// 5) If entries were found in the tables, then calculate the relevancy. We'll loop back to #3 until we've exhausted our words.
   227  	// 6) Return the sum of all relevancy calculations.
   228  	var err error
   229  	expr.once.Do(func() {
   230  		// Evaluate the expression, which should always result in a string literal
   231  		words, nErr := expr.Expr.Eval(ctx, nil)
   232  		if nErr != nil {
   233  			err = nErr
   234  			return
   235  		}
   236  		wordsStr, ok := words.(string)
   237  		if !ok {
   238  			if words != nil {
   239  				err = fmt.Errorf("expected WORD to be a string, but had type `%T`", words)
   240  			}
   241  		}
   242  		expr.evaluatedString = wordsStr
   243  		// Grab the index for the doc count table
   244  		docCountIndexes, nErr := expr.DocCountTable.GetIndexes(ctx)
   245  		if nErr != nil {
   246  			err = nErr
   247  			return
   248  		}
   249  		if len(docCountIndexes) != 1 || docCountIndexes[0].ID() != "PRIMARY" {
   250  			err = fmt.Errorf("expected to find a primary key on the table `%s`", expr.DocCountTable.Name())
   251  		}
   252  		expr.docCountIndex = docCountIndexes[0]
   253  		// Grab the index for the global count table
   254  		globalCountIndexes, nErr := expr.GlobalCountTable.GetIndexes(ctx)
   255  		if nErr != nil {
   256  			err = nErr
   257  			return
   258  		}
   259  		if len(globalCountIndexes) != 1 || globalCountIndexes[0].ID() != "PRIMARY" {
   260  			err = fmt.Errorf("expected to find a primary key on the table `%s`", expr.GlobalCountTable.Name())
   261  		}
   262  		expr.globalCountIndex = globalCountIndexes[0]
   263  		// Grab the index for the row count table
   264  		rowCountIndexes, nErr := expr.RowCountTable.GetIndexes(ctx)
   265  		if nErr != nil {
   266  			err = nErr
   267  			return
   268  		}
   269  		if len(rowCountIndexes) != 1 || rowCountIndexes[0].ID() != "PRIMARY" {
   270  			err = fmt.Errorf("expected to find a primary key on the table `%s`", expr.RowCountTable.Name())
   271  		}
   272  		expr.rowCountIndex = rowCountIndexes[0]
   273  		// Create the parser now since it does a lot of preprocessing. We'll reset the iterators every call.
   274  		expr.parser, nErr = fulltext.NewDefaultParser(ctx, fulltext.GetCollationFromSchema(ctx, expr.DocCountTable.Schema()), wordsStr)
   275  		if nErr != nil {
   276  			err = nErr
   277  			return
   278  		}
   279  		// Load the number of rows from the parent table, since it's used in the relevancy calculation
   280  		expr.parentRowCount, _, nErr = expr.ParentTable.(sql.StatisticsTable).RowCount(ctx)
   281  		if nErr != nil {
   282  			err = nErr
   283  			return
   284  		}
   285  	})
   286  	if err != nil {
   287  		return 0, err
   288  	}
   289  
   290  	accumulatedRelevancy := float32(0)
   291  	hash, err := fulltext.HashRow(row)
   292  	if err != nil {
   293  		return 0, err
   294  	}
   295  
   296  	expr.parser.Reset()
   297  	wordStr, reachedTheEnd, err := expr.parser.NextUnique(ctx)
   298  	for ; err == nil && !reachedTheEnd; wordStr, reachedTheEnd, err = expr.parser.NextUnique(ctx) {
   299  		// We'll look for this word within the doc count table, so that we can:
   300  		// 1) Ensure that there's a match
   301  		// 2) Grab the count to use in the relevancy calculation
   302  		var lookup sql.IndexLookup
   303  		if expr.KeyCols.Type != fulltext.KeyType_None {
   304  			ranges := make(sql.Range, 1+len(expr.KeyCols.Positions))
   305  			ranges[0] = sql.ClosedRangeColumnExpr(wordStr, wordStr, expr.DocCountTable.Schema()[0].Type)
   306  			for i, keyColPos := range expr.KeyCols.Positions {
   307  				ranges[i+1] = sql.ClosedRangeColumnExpr(row[keyColPos], row[keyColPos], expr.DocCountTable.Schema()[i+1].Type)
   308  			}
   309  			lookup = sql.IndexLookup{Ranges: []sql.Range{ranges}, Index: expr.docCountIndex}
   310  		} else {
   311  			lookup = sql.IndexLookup{Ranges: []sql.Range{
   312  				{
   313  					sql.ClosedRangeColumnExpr(wordStr, wordStr, expr.DocCountTable.Schema()[0].Type),
   314  					sql.ClosedRangeColumnExpr(hash, hash, fulltext.SchemaRowCount[0].Type),
   315  				},
   316  			}, Index: expr.docCountIndex}
   317  		}
   318  
   319  		editorData := expr.DocCountTable.IndexedAccess(lookup)
   320  		if err != nil {
   321  			return 0, err
   322  		}
   323  
   324  		partIter, err := editorData.LookupPartitions(ctx, lookup)
   325  		if err != nil {
   326  			return 0, err
   327  		}
   328  		docCountRows, err := sql.RowIterToRows(ctx, sql.NewTableRowIter(ctx, editorData, partIter))
   329  		if err != nil {
   330  			return 0, err
   331  		}
   332  		if len(docCountRows) == 0 {
   333  			// This did not match, so we continue
   334  			continue
   335  		} else if len(docCountRows) > 1 {
   336  			return 0, fmt.Errorf("somehow there are duplicate entries within the Full-Text doc count table")
   337  		}
   338  		docCountRow := docCountRows[0]
   339  		docCount := float64(docCountRow[len(docCountRow)-1].(uint64))
   340  		if docCount == 0 {
   341  			// We've got an empty document count, so the word does not match (so it should have been deleted)
   342  			continue
   343  		}
   344  
   345  		// Otherwise, we've found a match, so we'll grab the global count as well
   346  		lookup = sql.IndexLookup{Ranges: []sql.Range{
   347  			{
   348  				sql.ClosedRangeColumnExpr(wordStr, wordStr, expr.GlobalCountTable.Schema()[0].Type),
   349  			},
   350  		}, Index: expr.globalCountIndex}
   351  		editorData = expr.GlobalCountTable.IndexedAccess(lookup)
   352  		if err != nil {
   353  			return 0, err
   354  		}
   355  
   356  		partIter, err = editorData.LookupPartitions(ctx, lookup)
   357  		if err != nil {
   358  			return 0, err
   359  		}
   360  		globalCountRows, err := sql.RowIterToRows(ctx, sql.NewTableRowIter(ctx, editorData, partIter))
   361  		if err != nil {
   362  			return 0, err
   363  		}
   364  		if len(globalCountRows) == 0 {
   365  			continue
   366  		} else if len(globalCountRows) > 1 {
   367  			return 0, fmt.Errorf("somehow there are duplicate entries within the Full-Text global count table")
   368  		}
   369  		globalCountRow := globalCountRows[0]
   370  
   371  		// Lastly, grab the number of unique words within this row from the row count
   372  		lookup = sql.IndexLookup{Ranges: []sql.Range{
   373  			{
   374  				sql.ClosedRangeColumnExpr(hash, hash, expr.RowCountTable.Schema()[0].Type),
   375  			},
   376  		}, Index: expr.rowCountIndex}
   377  		editorData = expr.RowCountTable.IndexedAccess(lookup)
   378  		if err != nil {
   379  			return 0, err
   380  		}
   381  
   382  		partIter, err = editorData.LookupPartitions(ctx, lookup)
   383  		if err != nil {
   384  			return 0, err
   385  		}
   386  		rowCountRows, err := sql.RowIterToRows(ctx, sql.NewTableRowIter(ctx, editorData, partIter))
   387  		if err != nil {
   388  			return 0, err
   389  		}
   390  		if len(rowCountRows) == 0 {
   391  			continue
   392  		} else if len(rowCountRows) > 1 {
   393  			return 0, fmt.Errorf("somehow there are duplicate entries within the Full-Text row count table")
   394  		}
   395  		rowCountRow := rowCountRows[0]
   396  
   397  		// Calculate the relevancy (partially based on an old MySQL implementation)
   398  		// https://web.archive.org/web/20220122170304/http://dev.mysql.com/doc/internals/en/full-text-search.html
   399  		globalCount := float64(globalCountRow[len(globalCountRow)-1].(uint64))
   400  		uniqueWords := float64(rowCountRow[2].(uint64))
   401  		base := math.Log(docCount) + 1
   402  		normFactor := uniqueWords / (1 + 0.115*uniqueWords)
   403  		globalMult := math.Log(float64(expr.parentRowCount)/globalCount) + 1
   404  		accumulatedRelevancy += float32(base * normFactor * globalMult)
   405  	}
   406  	if err != nil {
   407  		return 0, err
   408  	}
   409  	// Due to how we handle floating to bool conversion, we need to add 0.5 if the result is positive
   410  	if accumulatedRelevancy > 0 {
   411  		accumulatedRelevancy += 0.5
   412  	}
   413  	// Return the accumulated relevancy from all of the parsed words
   414  	return accumulatedRelevancy, nil
   415  }
   416  
// inNaturalLanguageModeWithQueryExpansion calculates the result using "IN NATURAL LANGUAGE MODE WITH QUERY EXPANSION".
// This mode is not implemented: the arguments are ignored and an error is always returned.
func (expr *MatchAgainst) inNaturalLanguageModeWithQueryExpansion(ctx *sql.Context, row sql.Row) (float32, error) {
	return 0, fmt.Errorf("'IN NATURAL LANGUAGE MODE WITH QUERY EXPANSION' has not yet been implemented")
}
   421  
// inBooleanMode calculates the result using "IN BOOLEAN MODE". This mode is not implemented: the arguments are
// ignored and an error is always returned.
func (expr *MatchAgainst) inBooleanMode(ctx *sql.Context, row sql.Row) (float32, error) {
	return 0, fmt.Errorf("'IN BOOLEAN MODE' has not yet been implemented")
}
   426  
// withQueryExpansion calculates the result using "WITH QUERY EXPANSION". This mode is not implemented: the arguments
// are ignored and an error is always returned.
func (expr *MatchAgainst) withQueryExpansion(ctx *sql.Context, row sql.Row) (float32, error) {
	return 0, fmt.Errorf("'WITH QUERY EXPANSION' has not yet been implemented")
}