// Copyright 2023 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fulltext

import (
	"fmt"
	"strings"
	"unicode"

	"github.com/dolthub/go-mysql-server/sql"
)

// parserState represents the state of the parser as it iterates over runes.
type parserState byte

const (
	parserState_Whitespace parserState = iota // between words, waiting for a word rune
	parserState_Word                          // currently inside a word
	parserState_Apostrophe                    // just consumed an apostrophe while inside a word
)

// DefaultParser is the default text parser that is used when parsing Full-Text documents. Its intention is to match the
// expected behavior of MySQL's default Full-Text parser. This provides normalization, as well as statistics regarding
// the input document, such as the occurrence of any given word. Such statistics may later be used when calculating the
// relevancy within a MatchAgainst expression.
type DefaultParser struct {
	document  string            // full document text, built from the concatenated column values
	words     []parserWord      // every parsed word, in document order
	wordsIdx  int               // iteration cursor for Next
	unique    []string          // each distinct word (keyed by collation hash), in first-seen order
	uniqueIdx int               // iteration cursor for NextUnique
	uniqueMap map[uint64]uint32 // collation hash of a word -> occurrence count within the document
	collation sql.CollationID   // collation used to hash (and thereby compare) words
}

// parserWord contains the word and its starting position.
type parserWord struct {
	Word     string // normalized (apostrophe-trimmed) word text
	Position uint64 // starting position (offset) of the word within the document
}

// NewDefaultParser creates a new DefaultParser.
55 func NewDefaultParser(ctx *sql.Context, collation sql.CollationID, colVals ...interface{}) (DefaultParser, error) { 56 //TODO: implement exact matching using double quotes 57 sb := strings.Builder{} 58 for i, colVal := range colVals { 59 switch v := colVal.(type) { 60 case string: 61 if i > 0 { 62 sb.WriteString(" ") 63 } 64 sb.WriteString(v) 65 case []byte: 66 if i > 0 { 67 sb.WriteString(" ") 68 } 69 sb.Write(v) 70 case nil: 71 continue 72 default: 73 panic(fmt.Errorf("Full-Text parser has encountered an unexpected type: %T", colVal)) 74 } 75 } 76 document := sb.String() 77 78 // We preprocess the document so that it's easier to calculate counts 79 var words []parserWord 80 var buildingWord []rune 81 state := parserState_Whitespace 82 position := uint64(0) 83 for i, r := range document { 84 isCharacter := ((unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsDigit(r)) && !unicode.IsPunct(r)) || r == '_' 85 isApostrophe := r == '\'' 86 87 switch state { 88 case parserState_Whitespace: 89 if isCharacter { 90 buildingWord = append(buildingWord, r) 91 state = parserState_Word 92 } else { 93 position++ 94 } 95 case parserState_Word: 96 if !isCharacter { 97 if isApostrophe { 98 buildingWord = append(buildingWord, r) 99 state = parserState_Apostrophe 100 } else { 101 word := newParserWord(string(buildingWord), position) 102 if len(word.Word) >= 3 { 103 words = append(words, word) 104 } 105 buildingWord = buildingWord[:0] 106 position = uint64(i) 107 state = parserState_Whitespace 108 } 109 } else { 110 buildingWord = append(buildingWord, r) 111 } 112 case parserState_Apostrophe: 113 if !isCharacter { 114 word := newParserWord(string(buildingWord), position) 115 if len(word.Word) >= 3 { 116 words = append(words, word) 117 } 118 buildingWord = buildingWord[:0] 119 position = uint64(i) 120 state = parserState_Whitespace 121 } else { 122 buildingWord = append(buildingWord, r) 123 state = parserState_Word 124 } 125 } 126 } 127 { // Grab the last word if there is one 
128 word := newParserWord(string(buildingWord), position) 129 if len(word.Word) >= 3 { 130 words = append(words, word) 131 } 132 } 133 134 var unique []string 135 uniqueMap := make(map[uint64]uint32) 136 for _, word := range words { 137 hash, err := collation.HashToUint(word.Word) 138 if err != nil { 139 return DefaultParser{}, err 140 } 141 if count, ok := uniqueMap[hash]; ok { 142 uniqueMap[hash] = count + 1 143 } else { 144 unique = append(unique, word.Word) 145 uniqueMap[hash] = 1 146 } 147 } 148 return DefaultParser{ 149 document: document, 150 words: words, 151 wordsIdx: 0, 152 unique: unique, 153 uniqueIdx: 0, 154 uniqueMap: uniqueMap, 155 collation: collation, 156 }, nil 157 } 158 159 // Next returns the next word and its position. Once no more words can be returned, then we've reached the end. 160 // This iterates through its list separately from NextUnique. 161 func (dp *DefaultParser) Next(ctx *sql.Context) (word string, wordPosition uint64, reachedTheEnd bool, err error) { 162 if dp.wordsIdx >= len(dp.words) { 163 return "", 0, true, nil 164 } 165 pWord := dp.words[dp.wordsIdx] 166 dp.wordsIdx++ 167 return pWord.Word, pWord.Position, false, nil 168 } 169 170 // NextUnique returns the next unique word. Once no more words can be returned, then we've reached the end. This 171 // iterates through its list separately from Next. 172 func (dp *DefaultParser) NextUnique(ctx *sql.Context) (uniqueWord string, reachedTheEnd bool, err error) { 173 if dp.uniqueIdx >= len(dp.unique) { 174 return "", true, nil 175 } 176 uniqueWord = dp.unique[dp.uniqueIdx] 177 dp.uniqueIdx++ 178 return uniqueWord, false, nil 179 } 180 181 // DocumentCount returns the count of the given word within the document. 
182 func (dp *DefaultParser) DocumentCount(ctx *sql.Context, word string) (count uint64, err error) { 183 hash, err := dp.collation.HashToUint(word) 184 if err != nil { 185 return 0, err 186 } 187 if count, ok := dp.uniqueMap[hash]; ok { 188 return uint64(count), nil 189 } 190 return 0, nil 191 } 192 193 // UniqueWordCount returns the number of unique words within the document. 194 func (dp *DefaultParser) UniqueWordCount(ctx *sql.Context) (count uint64) { 195 return uint64(len(dp.unique)) 196 } 197 198 // Reset will set the progress on both Next and NextUnique to the beginning, allowing the parser to be reused. 199 func (dp *DefaultParser) Reset() { 200 dp.wordsIdx = 0 201 dp.uniqueIdx = 0 202 } 203 204 // newParserWord creates a new parserWord from the given string. This also takes care of trimming. 205 func newParserWord(word string, position uint64) parserWord { 206 originalWord := word 207 word = strings.TrimLeft(word, "'") 208 position += uint64(len(originalWord) - len(word)) 209 return parserWord{ 210 Word: strings.TrimRight(word, "'"), 211 Position: position, 212 } 213 }