github.com/cockroachdb/cockroachdb-parser@v0.23.3-0.20240213214944-911057d40c9a/pkg/util/tsearch/lex.go (about) 1 // Copyright 2022 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package tsearch 12 13 import ( 14 "sort" 15 "strconv" 16 "unicode" 17 "unicode/utf8" 18 19 "github.com/cockroachdb/cockroachdb-parser/pkg/sql/pgwire/pgcode" 20 "github.com/cockroachdb/cockroachdb-parser/pkg/sql/pgwire/pgerror" 21 ) 22 23 type tsVectorParseState int 24 25 const ( 26 // Waiting for term (whitespace, ', or any other char) 27 expectingTerm tsVectorParseState = iota 28 // Inside of a normal term (single quotes are processed as normal chars) 29 insideNormalTerm 30 // Inside of a ' term 31 insideQuoteTerm 32 // Finished with ' term (waiting for : or space) 33 finishedQuoteTerm 34 // Found a colon (or comma) and expecting a position 35 expectingPosList 36 // Finished parsing a position, expecting a comma or whitespace 37 expectingPosDelimiter 38 ) 39 40 // tsVectorLexer is a lexing state machine for the TSVector and TSQuery input 41 // formats. See the comment above lex() for more details. 42 type tsVectorLexer struct { 43 input string 44 lastLen int 45 pos int 46 state tsVectorParseState 47 48 // If true, we're in "TSQuery lexing mode" 49 tsQuery bool 50 } 51 52 func (p *tsVectorLexer) back() { 53 p.pos -= p.lastLen 54 p.lastLen = 0 55 } 56 57 func (p *tsVectorLexer) advance() rune { 58 r, n := utf8.DecodeRuneInString(p.input[p.pos:]) 59 p.pos += n 60 p.lastLen = n 61 return r 62 } 63 64 const ( 65 // The maximum number of bytes in a TSVector. 66 maxTSVectorLen = (1 << 20) - 1 67 // The maximum number of positions in a TSVector position list. 68 maxTSVectorPositions = 256 69 // The maximum number within a <> followed-by declaration. 70 maxTSVectorFollowedBy = 1 << 14 71 // The maximum size of a TSVector lexeme. 72 maxTSVectorLexemeLen = (1 << 14) - 1 73 // The maximum position within a TSVector position list. 74 maxTSVectorPosition = (1 << 14) - 1 75 ) 76 77 // lex lexes the input in the receiver according to the TSVector "grammar", or 78 // according the TSQuery "grammar" if tsQuery is set to true. 79 // 80 // A simple TSVector input could look like this: 81 // 82 // foo bar:3 baz:3A 'blah :blah' 83 // 84 // A TSVector is a list of terms. 85 // 86 // Each term is a word and an optional "position list". 87 // 88 // A word may be single-quote wrapped, in which case the next term may begin 89 // without any whitespace in between (if there is no position list on the word). 90 // In a single-quote wrapped word, the word must terminate with a single quote. 91 // All other characters are treated as literals. Backlashes can be used to 92 // escape single quotes, and are otherwise skipped, allowing the following 93 // character to be included as a literal (such as the backslash character itself). 94 // 95 // If a word is not single-quote wrapped, the next term will begin if there is 96 // whitespace after the word. Whitespace and colons may be entered by escaping 97 // them with backslashes. All other uses of backslashes are skipped, allowing 98 // the following character to be included as a literal. 99 // 100 // A word is delimited from its position list with a colon. 101 // 102 // A position list is made up of a comma-delimited list of numbers, each 103 // of which may have an optional "strength" which is a letter from A-D. 104 // 105 // In TSQuery mode, there are a few differences: 106 // - Terms must be separated with tsOperators (!, <->, |, &), not just spaces. 107 // - Terms may be surrounded by the ( ) grouping tokens. 108 // - Terms cannot include multiple positions. 109 // - Terms can include more than one "strength", as well as the * prefix search 110 // operator. For example, foo:3AC* 111 // 112 // See examples in tsvector_test.go and tsquery_test.go, and see the 113 // documentation in tsvector.go for more information and a link to the Postgres 114 // documentation that is the spec for all of this behavior. 115 func (p tsVectorLexer) lex() (TSVector, error) { 116 // termBuf will be reused as a temporary buffer to assemble each term before 117 // copying into the vector. 118 termBuf := make([]rune, 0, 32) 119 ret := TSVector{} 120 121 if len(p.input) >= maxTSVectorLen { 122 typ := "tsvector" 123 if p.tsQuery { 124 typ = "tsquery" 125 } 126 return nil, pgerror.Newf(pgcode.ProgramLimitExceeded, 127 "string is too long for %s (%d bytes, max %d bytes)", 128 typ, len(p.input), maxTSVectorLen) 129 } 130 131 for p.pos < len(p.input) { 132 r := p.advance() 133 switch p.state { 134 case expectingTerm: 135 // Expect either a single quote, a whitespace, or anything else. 136 if r == '\'' { 137 p.state = insideQuoteTerm 138 continue 139 } 140 if unicode.IsSpace(r) { 141 continue 142 } 143 144 if p.tsQuery { 145 // Check for &, |, !, and <-> (or <number>) 146 switch r { 147 case '&': 148 ret = append(ret, tsTerm{operator: and}) 149 continue 150 case '|': 151 ret = append(ret, tsTerm{operator: or}) 152 continue 153 case '!': 154 ret = append(ret, tsTerm{operator: not}) 155 continue 156 case '(': 157 ret = append(ret, tsTerm{operator: lparen}) 158 continue 159 case ')': 160 ret = append(ret, tsTerm{operator: rparen}) 161 continue 162 case '<': 163 r = p.advance() 164 n := 1 165 if r == '-' { 166 r = p.advance() 167 } else { 168 for unicode.IsNumber(r) { 169 termBuf = append(termBuf, r) 170 r = p.advance() 171 } 172 var err error 173 n, err = strconv.Atoi(string(termBuf)) 174 if n > maxTSVectorFollowedBy || n < 0 { 175 return nil, pgerror.Newf(pgcode.InvalidParameterValue, 176 "distance in phrase operator must be an integer value between zero and %d inclusive", maxTSVectorFollowedBy) 177 } 178 termBuf = termBuf[:0] 179 if err != nil { 180 return p.syntaxError() 181 } 182 } 183 if r != '>' { 184 return p.syntaxError() 185 } 186 ret = append(ret, tsTerm{operator: followedby, followedN: uint16(n)}) 187 continue 188 } 189 } 190 191 p.state = insideNormalTerm 192 // Need to consume the rune we just found again. 193 p.back() 194 continue 195 196 case insideQuoteTerm: 197 // If escaped, eat character and continue. 198 switch r { 199 case '\\': 200 r = p.advance() 201 termBuf = append(termBuf, r) 202 continue 203 case '\'': 204 term, err := newLexemeTerm(string(termBuf)) 205 if err != nil { 206 return nil, err 207 } 208 ret = append(ret, term) 209 termBuf = termBuf[:0] 210 p.state = finishedQuoteTerm 211 continue 212 } 213 termBuf = append(termBuf, r) 214 case finishedQuoteTerm: 215 if unicode.IsSpace(r) { 216 p.state = expectingTerm 217 } else if r == ':' { 218 lastTerm := &ret[len(ret)-1] 219 lastTerm.positions = append(lastTerm.positions, tsPosition{}) 220 p.state = expectingPosList 221 } else { 222 p.state = expectingTerm 223 p.back() 224 } 225 case insideNormalTerm: 226 // If escaped, eat character and continue. 227 if r == '\\' { 228 r = p.advance() 229 termBuf = append(termBuf, r) 230 continue 231 } 232 233 if p.tsQuery { 234 switch r { 235 case '&', '!', '|', '<', '(', ')': 236 // These are all "operators" in the TSQuery language. End the current 237 // term and start a new one. 238 term, err := newLexemeTerm(string(termBuf)) 239 if err != nil { 240 return nil, err 241 } 242 ret = append(ret, term) 243 termBuf = termBuf[:0] 244 p.state = expectingTerm 245 p.back() 246 continue 247 } 248 } 249 250 // Colon that comes first is an ordinary character. 251 space := unicode.IsSpace(r) 252 if space || r == ':' && len(termBuf) > 0 { 253 // Found a terminator. 254 // Copy the termBuf into the vector, resize the termBuf, continue on. 255 term, err := newLexemeTerm(string(termBuf)) 256 if err != nil { 257 return nil, err 258 } 259 if r == ':' { 260 term.positions = append(term.positions, tsPosition{}) 261 } 262 ret = append(ret, term) 263 termBuf = termBuf[:0] 264 if space { 265 p.state = expectingTerm 266 } else { 267 p.state = expectingPosList 268 } 269 continue 270 } 271 if p.tsQuery && r == ':' { 272 return p.syntaxError() 273 } 274 termBuf = append(termBuf, r) 275 case expectingPosList: 276 var pos int 277 if !p.tsQuery { 278 // If we have nothing in our termBuf, we need to see at least one number. 279 if unicode.IsNumber(r) { 280 termBuf = append(termBuf, r) 281 continue 282 } 283 if len(termBuf) == 0 { 284 return p.syntaxError() 285 } 286 var err error 287 pos, err = strconv.Atoi(string(termBuf)) 288 if err != nil { 289 return p.syntaxError() 290 } 291 if pos == 0 { 292 return ret, pgerror.Newf(pgcode.Syntax, "wrong position info in TSVector", p.input) 293 } else if pos > maxTSVectorPosition { 294 // Postgres silently truncates positions larger than 16383 to 16383. 295 pos = maxTSVectorPosition 296 } 297 termBuf = termBuf[:0] 298 } 299 lastTerm := &ret[len(ret)-1] 300 lastTermPos := len(lastTerm.positions) - 1 301 lastTerm.positions[lastTermPos].position = uint16(pos) 302 if unicode.IsSpace(r) { 303 // Done with our term. Advance to next term! 304 p.state = expectingTerm 305 continue 306 } 307 switch r { 308 case ',': 309 if p.tsQuery { 310 // Not valid! No , allowed in position lists in tsqueries. 311 return ret, pgerror.Newf(pgcode.Syntax, "syntax error in TSVector: %s", p.input) 312 } 313 lastTerm.positions = append(lastTerm.positions, tsPosition{}) 314 // Expecting another number next. 315 continue 316 case '*': 317 if p.tsQuery { 318 lastTerm.positions[lastTermPos].weight |= weightStar 319 } else { 320 p.state = expectingPosDelimiter 321 lastTerm.positions[lastTermPos].weight |= weightA 322 } 323 case 'a', 'A': 324 if !p.tsQuery { 325 p.state = expectingPosDelimiter 326 } 327 lastTerm.positions[lastTermPos].weight |= weightA 328 case 'b', 'B': 329 if !p.tsQuery { 330 p.state = expectingPosDelimiter 331 } 332 lastTerm.positions[lastTermPos].weight |= weightB 333 case 'c', 'C': 334 if !p.tsQuery { 335 p.state = expectingPosDelimiter 336 } 337 lastTerm.positions[lastTermPos].weight |= weightC 338 case 'd', 'D': 339 // Weight D is handled differently in TSQuery parsing than TSVector. In 340 // TSVector parsing, the default is already D - so we don't record any 341 // weight at all. This matches Postgres behavior - a default D weight is 342 // not printed or stored. In TSQuery, we have to record it explicitly. 343 if p.tsQuery { 344 lastTerm.positions[lastTermPos].weight |= weightD 345 } else { 346 p.state = expectingPosDelimiter 347 } 348 default: 349 return p.syntaxError() 350 } 351 case expectingPosDelimiter: 352 if r == ',' { 353 p.state = expectingPosList 354 lastTerm := &ret[len(ret)-1] 355 lastTerm.positions = append(lastTerm.positions, tsPosition{}) 356 } else if unicode.IsSpace(r) { 357 p.state = expectingTerm 358 } else { 359 return p.syntaxError() 360 } 361 default: 362 panic("invalid TSVector lex state") 363 } 364 } 365 // Reached the end of the string. 366 switch p.state { 367 case insideQuoteTerm: 368 // Unfinished quote term. 369 return p.syntaxError() 370 case insideNormalTerm: 371 // Finish normal term. 372 term, err := newLexemeTerm(string(termBuf)) 373 if err != nil { 374 return nil, err 375 } 376 ret = append(ret, term) 377 case expectingPosList: 378 // Finish number. 379 if !p.tsQuery { 380 if len(termBuf) == 0 { 381 return p.syntaxError() 382 } 383 pos, err := strconv.Atoi(string(termBuf)) 384 if err != nil { 385 return p.syntaxError() 386 } 387 if pos == 0 { 388 return ret, pgerror.Newf(pgcode.Syntax, "wrong position info in TSVector", p.input) 389 } else if pos > maxTSVectorPosition { 390 // Postgres silently truncates positions larger than 16383 to 16383. 391 pos = maxTSVectorPosition 392 } 393 lastTerm := &ret[len(ret)-1] 394 lastTerm.positions[len(lastTerm.positions)-1].position = uint16(pos) 395 } 396 case expectingTerm, finishedQuoteTerm: 397 // We are good to go, we just finished a term and nothing needs to be cleaned up. 398 case expectingPosDelimiter: 399 // We are good to go, we just finished a position and nothing needs to be cleaned up. 400 default: 401 panic("invalid TSVector lex state") 402 } 403 for _, t := range ret { 404 sort.Slice(t.positions, func(i, j int) bool { 405 return t.positions[i].position < t.positions[j].position 406 }) 407 } 408 return ret, nil 409 } 410 411 func (p *tsVectorLexer) syntaxError() (TSVector, error) { 412 typ := "TSVector" 413 if p.tsQuery { 414 typ = "TSQuery" 415 } 416 return TSVector{}, pgerror.Newf(pgcode.Syntax, "syntax error in %s: %s", typ, p.input) 417 }