github.com/cockroachdb/cockroachdb-parser@v0.23.3-0.20240213214944-911057d40c9a/pkg/util/tsearch/tsquery.go (about) 1 // Copyright 2022 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package tsearch 12 13 import ( 14 "fmt" 15 "strings" 16 17 "github.com/cockroachdb/cockroachdb-parser/pkg/keysbase" 18 "github.com/cockroachdb/cockroachdb-parser/pkg/sql/inverted" 19 "github.com/cockroachdb/cockroachdb-parser/pkg/sql/pgwire/pgcode" 20 "github.com/cockroachdb/cockroachdb-parser/pkg/sql/pgwire/pgerror" 21 "github.com/cockroachdb/errors" 22 ) 23 24 // tsOperator is an enum that represents the different operators within a 25 // TSQuery. 26 type tsOperator int 27 28 const ( 29 // Parentheses can be used to control nesting of the TSQuery operators. 30 // Without parentheses, | binds least tightly, 31 // then &, then <->, and ! most tightly. 32 33 invalid tsOperator = iota 34 // and is the & operator, which requires both of its operands to exist in 35 // the searched document. 36 and 37 // or is the | operator, which requires one or more of its operands to exist 38 // in the searched document. 39 or 40 // not is the ! operator, which requires that its single operand doesn't exist 41 // in the searched document. 42 not 43 // followedby is the <-> operator. It can also be specified with a number like 44 // <1> or <2> or <3>. It requires that the left operand is followed by the right 45 // operand. The <-> and <1> forms mean that they should be directly followed 46 // by each other. A number indicates how many terms away the operands should be. 47 followedby 48 // lparen and rparen are grouping operators. They're just used in parsing and 49 // don't appear in the TSQuery tree. 50 lparen 51 rparen 52 ) 53 54 // precedence returns the parsing precedence of the receiver. A higher 55 // precedence means that the operator binds more tightly. 56 func (o tsOperator) precedence() int { 57 switch o { 58 case not: 59 return 4 60 case followedby: 61 return 3 62 case and: 63 return 2 64 case or: 65 return 1 66 } 67 panic(errors.AssertionFailedf("no precedence for operator %d", o)) 68 } 69 70 func (o tsOperator) pgwireEncoding() byte { 71 switch o { 72 case not: 73 return 1 74 case and: 75 return 2 76 case or: 77 return 3 78 case followedby: 79 return 4 80 } 81 panic(errors.AssertionFailedf("no pgwire encoding for operator %d", o)) 82 } 83 84 func (o tsOperator) String() string { 85 switch o { 86 case not: 87 return "!" 88 case and: 89 return "&" 90 case or: 91 return "|" 92 case followedby: 93 return "<->" 94 case lparen: 95 return "(" 96 case rparen: 97 return ")" 98 } 99 panic(errors.AssertionFailedf("no string for operator %d", o)) 100 } 101 102 func tsOperatorFromPgwireEncoding(b byte) (tsOperator, error) { 103 switch b { 104 case 1: 105 return not, nil 106 case 2: 107 return and, nil 108 case 3: 109 return or, nil 110 case 4: 111 return followedby, nil 112 } 113 return invalid, errors.AssertionFailedf("no operator for pgwire byte %d", b) 114 } 115 116 // tsNode represents a single AST node within the tree of a TSQuery. 117 type tsNode struct { 118 // Only one of term or op will be set. 119 // If term is set, this is a leaf node containing a lexeme. 120 term tsTerm 121 // If op is set, this is an operator node: either not, and, or, or followedby. 122 op tsOperator 123 // set only when op is followedby. Indicates the number n within the <n> 124 // operator, which means the number of terms separating the left and the right 125 // argument. 126 // At most 16384. 127 followedN uint16 128 129 // l is the left child of the node if op is set, or the only child if 130 // op is set to "not". 131 l *tsNode 132 // r is the right child of the node if op is set. 133 r *tsNode 134 } 135 136 func (n tsNode) String() string { 137 var buf strings.Builder 138 n.writeInfixString(&buf, 0) 139 return buf.String() 140 } 141 142 func (n tsNode) writeInfixString(buf *strings.Builder, parentPrecedence int) { 143 if n.op == invalid { 144 n.term.writeString(buf) 145 return 146 } 147 prec := n.op.precedence() 148 needParen := prec < parentPrecedence 149 if needParen { 150 buf.WriteString("( ") 151 } 152 switch n.op { 153 case not: 154 buf.WriteString("!") 155 n.l.writeInfixString(buf, prec) 156 default: 157 n.l.writeInfixString(buf, prec) 158 buf.WriteString(" ") 159 tsTerm{operator: n.op, followedN: n.followedN}.writeString(buf) 160 buf.WriteString(" ") 161 n.r.writeInfixString(buf, prec) 162 } 163 if needParen { 164 buf.WriteString(" )") 165 } 166 } 167 168 // UnambiguousString returns a string representation of this tsNode that wraps 169 // all expressions with parentheses. It's just for testing. 170 func (n tsNode) UnambiguousString() string { 171 switch n.op { 172 case invalid: 173 return n.term.lexeme 174 case not: 175 return fmt.Sprintf("!%s", n.l.UnambiguousString()) 176 } 177 var buf strings.Builder 178 tsTerm{operator: n.op, followedN: n.followedN}.writeString(&buf) 179 return fmt.Sprintf("[%s%s%s]", n.l.UnambiguousString(), buf.String(), n.r.UnambiguousString()) 180 } 181 182 // TSQuery represents a tsNode AST root. A TSQuery is a tree of text search 183 // operators that can be run against a TSVector to produce a predicate of 184 // whether the query matched. 185 type TSQuery struct { 186 root *tsNode 187 } 188 189 func (q TSQuery) String() string { 190 if q.root == nil { 191 return "" 192 } 193 return q.root.String() 194 } 195 196 // GetInvertedExpr returns the inverted expression that can be used to search 197 // an index. 198 func (q TSQuery) GetInvertedExpr() (expr inverted.Expression, err error) { 199 return q.root.getInvertedExpr() 200 } 201 202 func (n *tsNode) getInvertedExpr() (inverted.Expression, error) { 203 switch n.op { 204 case invalid: 205 // We're looking at a lexeme match. 206 // There are 3 options: 207 // 1. Normal match. 208 // In this case, we make a tight and unique span. 209 // 2. Prefix match. 210 // In this case, we make a non-unique, tight span that starts with the 211 // prefix. 212 // 3. Weighted match. 213 // In this case, we make the match non-tight, because we don't store the 214 // weights of the lexemes in the index, and are forced to re-check 215 // once we get the result from the inverted index. 216 // Note that options 2 and 3 can both be present. 217 var weight tsWeight 218 if len(n.term.positions) > 0 { 219 weight = n.term.positions[0].weight 220 } 221 key := EncodeInvertedIndexKey(nil /* inKey */, n.term.lexeme) 222 var span inverted.Span 223 224 prefixMatch := weight&weightStar != 0 225 if prefixMatch { 226 span = inverted.Span{ 227 Start: key, 228 End: EncodeInvertedIndexKey(nil /* inKey */, string(keysbase.PrefixEnd([]byte(n.term.lexeme)))), 229 } 230 } else { 231 span = inverted.MakeSingleValSpan(key) 232 } 233 invertedExpr := inverted.ExprForSpan(span, true /* tight */) 234 if !prefixMatch { 235 // If we don't have a prefix match we also can set unique=true. 236 invertedExpr.Unique = true 237 } 238 239 if weight != 0 && weight != weightStar { 240 // Some weights are set. 241 invertedExpr.SetNotTight() 242 } 243 return invertedExpr, nil 244 case followedby: 245 fallthrough 246 case and: 247 l, lErr := n.l.getInvertedExpr() 248 r, rErr := n.r.getInvertedExpr() 249 if lErr != nil && rErr != nil { 250 // We need a positive match on at least one side. 251 return nil, lErr 252 } else if lErr != nil { 253 // An error on one side means we have to re-check that side's condition 254 // later. 255 r.SetNotTight() 256 //nolint:returnerrcheck 257 return r, nil 258 } else if rErr != nil { 259 // Ditto above. 260 l.SetNotTight() 261 //nolint:returnerrcheck 262 return l, nil 263 } 264 expr := inverted.And(l, r) 265 if n.op == followedby { 266 // If we have a followedby match, we have to re-check the results of the 267 // match after we get them from the inverted index - just because both 268 // terms are present doesn't mean they're properly next to each other, 269 // and the index doesn't store position information at all. 270 expr.SetNotTight() 271 } 272 return expr, nil 273 case or: 274 l, lErr := n.l.getInvertedExpr() 275 r, rErr := n.r.getInvertedExpr() 276 if lErr != nil { 277 // We need a positive match on both sides, so we return an error here. 278 // For example, searching for a | !b would require a full scan, since some 279 // documents could match that contain neither a nor b. 280 return nil, lErr 281 } else if rErr != nil { 282 return nil, rErr 283 } 284 return inverted.Or(l, r), nil 285 case not: 286 // A not would require more advanced machinery than we have, so for now 287 // we'll just assume we can't perform an inverted expression search on a 288 // not. Note that a nested not would make it possible, but we are ignoring 289 // this case for now as it seems marginal. 290 return nil, errors.New("unable to create inverted expr for not") 291 } 292 return nil, errors.AssertionFailedf("invalid operator %d", n.op) 293 } 294 295 func lexTSQuery(input string) (TSVector, error) { 296 parser := tsVectorLexer{ 297 input: input, 298 state: expectingTerm, 299 tsQuery: true, 300 } 301 302 return parser.lex() 303 } 304 305 // ParseTSQuery produces a TSQuery from an input string. 306 func ParseTSQuery(input string) (TSQuery, error) { 307 terms, err := lexTSQuery(input) 308 if err != nil { 309 return TSQuery{}, err 310 } 311 312 // Now create the operator tree. 313 queryParser := tsQueryParser{terms: terms, input: input} 314 return queryParser.parse() 315 } 316 317 // tsQueryParser is a parser that operates on a set of lexed tokens, represented 318 // as the tsTerms in a TSVector. 319 type tsQueryParser struct { 320 input string 321 terms TSVector 322 } 323 324 func (p tsQueryParser) peek() (*tsTerm, bool) { 325 if len(p.terms) == 0 { 326 return nil, false 327 } 328 return &p.terms[0], true 329 } 330 331 func (p *tsQueryParser) nextTerm() (*tsTerm, bool) { 332 if len(p.terms) == 0 { 333 return nil, false 334 } 335 ret := &p.terms[0] 336 p.terms = p.terms[1:] 337 return ret, true 338 } 339 340 func (p *tsQueryParser) parse() (TSQuery, error) { 341 expr, err := p.parseTSExpr(0) 342 if err != nil { 343 return TSQuery{}, err 344 } 345 if len(p.terms) > 0 { 346 _, err := p.syntaxError() 347 return TSQuery{}, err 348 } 349 return TSQuery{root: expr}, nil 350 } 351 352 // parseTSExpr is a "Pratt parser" which constructs a query tree out of the 353 // lexed tsTerms, respecting the precedence of the tsOperators. 354 // See this nice article about Pratt parsing, which this parser was adapted from: 355 // https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html 356 func (p *tsQueryParser) parseTSExpr(minBindingPower int) (*tsNode, error) { 357 t, ok := p.nextTerm() 358 if !ok { 359 return nil, pgerror.Newf(pgcode.Syntax, "text-search query doesn't contain lexemes: %s", p.input) 360 } 361 362 // First section: grab either atoms, nots, or parens. 363 var lExpr *tsNode 364 switch t.operator { 365 case invalid: 366 lExpr = &tsNode{term: *t} 367 case lparen: 368 expr, err := p.parseTSExpr(0) 369 if err != nil { 370 return nil, err 371 } 372 nextTerm, ok := p.nextTerm() 373 if !ok || nextTerm.operator != rparen { 374 return p.syntaxError() 375 } 376 lExpr = expr 377 case not: 378 expr, err := p.parseTSExpr(t.operator.precedence()) 379 if err != nil { 380 return nil, err 381 } 382 lExpr = &tsNode{op: not, l: expr} 383 default: 384 return p.syntaxError() 385 } 386 387 // Now we do our "Pratt parser loop". 388 for { 389 next, ok := p.peek() 390 if !ok { 391 return lExpr, nil 392 } 393 switch next.operator { 394 case and, or, followedby: 395 default: 396 return lExpr, nil 397 } 398 precedence := next.operator.precedence() 399 if precedence < minBindingPower { 400 break 401 } 402 p.nextTerm() 403 rExpr, err := p.parseTSExpr(precedence) 404 if err != nil { 405 return nil, err 406 } 407 lExpr = &tsNode{op: next.operator, followedN: next.followedN, l: lExpr, r: rExpr} 408 } 409 return lExpr, nil 410 } 411 412 func (p *tsQueryParser) syntaxError() (*tsNode, error) { 413 return nil, pgerror.Newf(pgcode.Syntax, "syntax error in TSQuery: %s", p.input) 414 } 415 416 // ToTSQuery implements the to_tsquery builtin, which lexes an input, performs 417 // stopwording and normalization on the tokens, and returns a parsed query. 418 func ToTSQuery(config string, input string) (TSQuery, error) { 419 return toTSQuery(config, invalid, input) 420 } 421 422 // PlainToTSQuery implements the plainto_tsquery builtin, which lexes an input, 423 // performs stopwording and normalization on the tokens, and returns a parsed 424 // query, interposing the & operator between each token. 425 func PlainToTSQuery(config string, input string) (TSQuery, error) { 426 return toTSQuery(config, and, input) 427 } 428 429 // PhraseToTSQuery implements the phraseto_tsquery builtin, which lexes an input, 430 // performs stopwording and normalization on the tokens, and returns a parsed 431 // query, interposing the <-> operator between each token. 432 func PhraseToTSQuery(config string, input string) (TSQuery, error) { 433 return toTSQuery(config, followedby, input) 434 } 435 436 // toTSQuery implements the to_tsquery builtin, which lexes an input, 437 // performs stopwording and normalization on the tokens, and returns a parsed 438 // query. If the interpose operator is not invalid, it's interposed between each 439 // token in the input. 440 func toTSQuery(config string, interpose tsOperator, input string) (TSQuery, error) { 441 vector, err := lexTSQuery(input) 442 if err != nil { 443 return TSQuery{}, err 444 } 445 tokens := make(TSVector, 0, len(vector)) 446 foundStopwords := false 447 for i := range vector { 448 tok := vector[i] 449 450 foundOperator := tok.operator != invalid 451 var lexemeTokens []string 452 453 if !foundOperator { 454 // Try parsing the token. 455 lexemeTokens = TSParse(tok.lexeme) 456 } 457 458 // If we found an operator or were able to parse lexemes from the token, 459 // add the interpose operator if there is one. 460 if interpose != invalid && i > 0 && (foundOperator || len(lexemeTokens) > 0) { 461 term := tsTerm{operator: interpose} 462 if interpose == followedby { 463 term.followedN = 1 464 } 465 tokens = append(tokens, term) 466 } 467 468 if foundOperator { 469 tokens = append(tokens, tok) 470 continue 471 } 472 473 if len(lexemeTokens) == 0 { 474 // We ate some whitespace or whitespace-like text with no tokens. 475 continue 476 } 477 478 // When we support more than just the simple configuration, we'll also 479 // want to remove stopwords, which will affect the interposing, but we can 480 // worry about that later. 481 // Additionally, if we're doing phraseto_tsquery, if we remove a stopword, 482 // we need to make sure to increase the "followedN" of the followedby 483 // operator. For example, phraseto_tsquery('hello a deer') will return 484 // 'hello <2> deer', since the a stopword would be removed. 485 486 tokInterpose := interpose 487 if tokInterpose == invalid { 488 tokInterpose = followedby 489 } 490 for j := range lexemeTokens { 491 if j > 0 { 492 // We found more than one lexeme in our token, so we need to add all of them 493 // to the query, connected by our interpose operator. 494 // If we aren't running with an interpose, like in to_tsquery, Postgres 495 // uses the <-> operator to connect multiple lexemes from a single token. 496 term := tsTerm{operator: tokInterpose} 497 if tokInterpose == followedby { 498 term.followedN = 1 499 } 500 tokens = append(tokens, term) 501 } 502 lexeme, stopWord, err := TSLexize(config, lexemeTokens[j]) 503 if err != nil { 504 return TSQuery{}, err 505 } 506 if stopWord { 507 foundStopwords = true 508 } 509 tokens = append(tokens, tsTerm{lexeme: lexeme, positions: tok.positions}) 510 } 511 } 512 513 // Now create the operator tree. 514 queryParser := tsQueryParser{terms: tokens, input: input} 515 query, err := queryParser.parse() 516 if err != nil { 517 return query, err 518 } 519 520 if foundStopwords { 521 query = cleanupStopwords(query) 522 if query.root == nil { 523 return query, pgerror.Newf(pgcode.Syntax, "text-search query doesn't contain lexemes: %s", input) 524 } 525 } 526 return query, err 527 } 528 529 func cleanupStopwords(query TSQuery) TSQuery { 530 query.root, _, _ = cleanupStopword(query.root) 531 if query.root == nil { 532 return TSQuery{} 533 } 534 return query 535 } 536 537 // cleanupStopword cleans up a query tree by removing stop words and adjusting 538 // the width of the followedby operators to account for removed stop words. 539 // It returns the new root of the tree, and the amount to add to a followedBy 540 // distance to the left and right of the input node. 541 // 542 // This function parallels the clean_stopword_intree function in Postgres. 543 // What follows is a reproduction of the explanation of this function in 544 // Postgres. 545 546 // When we remove a phrase operator due to removing one or both of its 547 // arguments, we might need to adjust the distance of a parent phrase 548 // operator. For example, 'a' is a stopword, so: 549 // 550 // (b <-> a) <-> c should become b <2> c 551 // b <-> (a <-> c) should become b <2> c 552 // (b <-> (a <-> a)) <-> c should become b <3> c 553 // b <-> ((a <-> a) <-> c) should become b <3> c 554 // 555 // To handle that, we define two output parameters: 556 // 557 // ladd: amount to add to a phrase distance to the left of this node 558 // radd: amount to add to a phrase distance to the right of this node 559 // 560 // We need two outputs because we could need to bubble up adjustments to two 561 // different parent phrase operators. Consider 562 // 563 // w <-> (((a <-> x) <2> (y <3> a)) <-> z) 564 // 565 // After we've removed the two a's and are considering the <2> node (which is 566 // now just x <2> y), we have an ladd distance of 1 that needs to propagate 567 // up to the topmost (leftmost) <->, and an radd distance of 3 that needs to 568 // propagate to the rightmost <->, so that we'll end up with 569 // 570 // w <2> ((x <2> y) <4> z) 571 // 572 // Near the bottom of the tree, we may have subtrees consisting only of 573 // stopwords. The distances of any phrase operators within such a subtree are 574 // summed and propagated to both ladd and radd, since we don't know which side 575 // of the lowest surviving phrase operator we are in. The rule is that any 576 // subtree that degenerates to NULL must return equal values of ladd and radd, 577 // and the parent node dealing with it should incorporate only one of those. 578 // 579 // Currently, we only implement this adjustment for adjacent phrase operators. 580 // Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which 581 // isn't ideal, but there is no way to represent the really desired semantics 582 // without some redesign of the tsquery structure. Certainly it would not be 583 // any better to convert that to 'x <2> (y | z)'. Since this is such a weird 584 // corner case, let it go for now. But we can fix it in cases where the 585 // intervening non-phrase operator also gets removed, for example 586 // '((x <-> a) | a) <-> y' will become 'x <2> y'. 587 func cleanupStopword(node *tsNode) (ret *tsNode, lAdd int, rAdd int) { 588 if node.op == invalid { 589 if node.term.lexeme == "" { 590 // Found a stop word. 591 return nil, 0, 0 592 } 593 return node, 0, 0 594 } 595 if node.op == not { 596 // Not doesn't change the pattern width, so just report child distances. 597 node.l, lAdd, rAdd = cleanupStopword(node.l) 598 if node.l == nil { 599 return nil, lAdd, rAdd 600 } 601 return node, lAdd, rAdd 602 } 603 604 var llAdd, lrAdd, rlAdd, rrAdd int 605 node.l, llAdd, lrAdd = cleanupStopword(node.l) 606 node.r, rlAdd, rrAdd = cleanupStopword(node.r) 607 isPhrase := node.op == followedby 608 followedN := node.followedN 609 if node.l == nil && node.r == nil { 610 // Removing an entire node. Propagate its distance into both lAdd and rAdd; 611 // it is the responsibility of the parent to count it only once. 612 if isPhrase { 613 // If we're a followed by, sum up the children lengths and propagate. 614 // Distances coming from children are summed and propagated up to the 615 // parent (we assume llAdd == lrAdd and rlAdd == rrAdd, else rule was 616 // broken at a lower level). 617 lAdd = llAdd + int(followedN) + rlAdd 618 rAdd = lAdd 619 } else { 620 // If not, we take the max. This corresponds to the logic in evalWithinFollowedBy. 621 lAdd = llAdd 622 if rlAdd > lAdd { 623 lAdd = rlAdd 624 } 625 rAdd = lAdd 626 } 627 return nil, lAdd, rAdd 628 } else if node.l == nil { 629 // Remove this operator and the left node. 630 if isPhrase { 631 // Operator's own distance must propagate to the left. 632 return node.r, llAdd + int(followedN) + rlAdd, rrAdd 633 } else { 634 // At non-followedby op, just forget the left node entirely. 635 return node.r, rlAdd, rrAdd 636 } 637 } else if node.r == nil { 638 // Remove this operator and the right node. 639 if isPhrase { 640 // Operator's own distance must propagate to the right. 641 return node.l, llAdd, lrAdd + int(followedN) + rrAdd 642 } else { 643 // At non-followedby op, just forget the right node entirely. 644 return node.l, llAdd, lrAdd 645 } 646 } else if isPhrase { 647 // Add the adjusted values to this operator. 648 node.followedN += uint16(lrAdd + rlAdd) 649 // Continue to propagate unaccounted-for adjustments. 650 return node, llAdd, rrAdd 651 } 652 // Otherwise we found a non-phrase operator; keep it as-is. 653 return node, 0, 0 654 }