github.com/dolthub/go-mysql-server@v0.18.0/sql/fulltext/default_parser.go

// Copyright 2023 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fulltext

import (
	"fmt"
	"strings"
	"unicode"

	"github.com/dolthub/go-mysql-server/sql"
)

// parserState represents the state of the parser as it iterates over runes.
type parserState byte

const (
	parserState_Whitespace parserState = iota
	parserState_Word
	parserState_Apostrophe
)

// DefaultParser is the default text parser that is used when parsing Full-Text documents. Its intention is to match
// the expected behavior of MySQL's default Full-Text parser. It normalizes the input document and gathers statistics
// about it, such as the number of occurrences of any given word. Such statistics may later be used when calculating
// the relevancy within a MatchAgainst expression.
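//
// Next and NextUnique maintain independent cursors over the parsed words, and Reset rewinds both so that the same
// parser may be iterated again.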
type DefaultParser struct {
	document  string
	words     []parserWord
	wordsIdx  int
	unique    []string
	uniqueIdx int
	uniqueMap map[uint64]uint32
	collation sql.CollationID
}

// parserWord contains the word and its starting position.
type parserWord struct {
	Word     string
	Position uint64
}

// NewDefaultParser creates a new DefaultParser.
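//
// A rough usage sketch (assumes the *sql.Context and collation are supplied by the caller; error handling elided):
//
//	parser, _ := NewDefaultParser(ctx, sql.Collation_Default, "the quick brown fox")
//	for {
//		word, position, reachedTheEnd, _ := parser.Next(ctx)
//		if reachedTheEnd {
//			break
//		}
//		_ = word     // e.g. "the", "quick", ...
//		_ = position // starting position of the word within the document
//	}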
func NewDefaultParser(ctx *sql.Context, collation sql.CollationID, colVals ...interface{}) (DefaultParser, error) {
	//TODO: implement exact matching using double quotes
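	// Concatenate all of the column values into a single space-separated document.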
	sb := strings.Builder{}
	for i, colVal := range colVals {
		switch v := colVal.(type) {
		case string:
			if i > 0 {
				sb.WriteString(" ")
			}
			sb.WriteString(v)
		case []byte:
			if i > 0 {
				sb.WriteString(" ")
			}
			sb.Write(v)
		case nil:
			continue
		default:
			panic(fmt.Errorf("Full-Text parser has encountered an unexpected type: %T", colVal))
		}
	}
	document := sb.String()

	// We preprocess the document so that it's easier to calculate counts
	var words []parserWord
	var buildingWord []rune
	state := parserState_Whitespace
	position := uint64(0)
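	// Scan the document one rune at a time using a small state machine. A word is a run of letters, numbers, and
	// underscores; an apostrophe is kept only when it appears between word characters (e.g. "don't"). Any other rune
	// ends the current word, and words shorter than three bytes are discarded.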
	for i, r := range document {
		isCharacter := ((unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsDigit(r)) && !unicode.IsPunct(r)) || r == '_'
		isApostrophe := r == '\''

		switch state {
		case parserState_Whitespace:
			if isCharacter {
				buildingWord = append(buildingWord, r)
				state = parserState_Word
			} else {
				position++
			}
		case parserState_Word:
			if !isCharacter {
				if isApostrophe {
					buildingWord = append(buildingWord, r)
					state = parserState_Apostrophe
				} else {
					word := newParserWord(string(buildingWord), position)
					if len(word.Word) >= 3 {
						words = append(words, word)
					}
					buildingWord = buildingWord[:0]
					position = uint64(i)
					state = parserState_Whitespace
				}
			} else {
				buildingWord = append(buildingWord, r)
			}
		case parserState_Apostrophe:
			if !isCharacter {
				word := newParserWord(string(buildingWord), position)
				if len(word.Word) >= 3 {
					words = append(words, word)
				}
				buildingWord = buildingWord[:0]
				position = uint64(i)
				state = parserState_Whitespace
			} else {
				buildingWord = append(buildingWord, r)
				state = parserState_Word
			}
		}
	}
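	// The loop above only emits a word when it encounters a non-word rune, so a document that ends mid-word still
	// has a pending word at this point.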
	{ // Grab the last word if there is one
		word := newParserWord(string(buildingWord), position)
		if len(word.Word) >= 3 {
			words = append(words, word)
		}
	}

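	// Count the occurrences of each word, keyed by its hash under the parser's collation so that words which compare
	// as equal under the collation share a single count.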
	var unique []string
	uniqueMap := make(map[uint64]uint32)
	for _, word := range words {
		hash, err := collation.HashToUint(word.Word)
		if err != nil {
			return DefaultParser{}, err
		}
		if count, ok := uniqueMap[hash]; ok {
			uniqueMap[hash] = count + 1
		} else {
			unique = append(unique, word.Word)
			uniqueMap[hash] = 1
		}
	}
	return DefaultParser{
		document:  document,
		words:     words,
		wordsIdx:  0,
		unique:    unique,
		uniqueIdx: 0,
		uniqueMap: uniqueMap,
		collation: collation,
	}, nil
}

// Next returns the next word and its position. Once no more words can be returned, reachedTheEnd is set to true.
// This iterates through its list separately from NextUnique.
func (dp *DefaultParser) Next(ctx *sql.Context) (word string, wordPosition uint64, reachedTheEnd bool, err error) {
	if dp.wordsIdx >= len(dp.words) {
		return "", 0, true, nil
	}
	pWord := dp.words[dp.wordsIdx]
	dp.wordsIdx++
	return pWord.Word, pWord.Position, false, nil
}

// NextUnique returns the next unique word. Once no more words can be returned, reachedTheEnd is set to true. This
// iterates through its list separately from Next.
func (dp *DefaultParser) NextUnique(ctx *sql.Context) (uniqueWord string, reachedTheEnd bool, err error) {
	if dp.uniqueIdx >= len(dp.unique) {
		return "", true, nil
	}
	uniqueWord = dp.unique[dp.uniqueIdx]
	dp.uniqueIdx++
	return uniqueWord, false, nil
}

// DocumentCount returns the number of times that the given word appears within the document.
func (dp *DefaultParser) DocumentCount(ctx *sql.Context, word string) (count uint64, err error) {
	hash, err := dp.collation.HashToUint(word)
	if err != nil {
		return 0, err
	}
	if count, ok := dp.uniqueMap[hash]; ok {
		return uint64(count), nil
	}
	return 0, nil
}

// UniqueWordCount returns the number of unique words within the document.
func (dp *DefaultParser) UniqueWordCount(ctx *sql.Context) (count uint64) {
	return uint64(len(dp.unique))
}

// Reset will set the progress on both Next and NextUnique to the beginning, allowing the parser to be reused.
func (dp *DefaultParser) Reset() {
	dp.wordsIdx = 0
	dp.uniqueIdx = 0
}

// newParserWord creates a new parserWord from the given string. This also trims any leading and trailing apostrophes,
// shifting the position forward by the number of leading characters that were removed.
func newParserWord(word string, position uint64) parserWord {
	originalWord := word
	word = strings.TrimLeft(word, "'")
	position += uint64(len(originalWord) - len(word))
	return parserWord{
		Word:     strings.TrimRight(word, "'"),
		Position: position,
	}
}