
     1  // Copyright 2022 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    11  package tsearch
    13  import (
    14  	"fmt"
    15  	"strings"
    17  	""
    18  	""
    19  	""
    20  	""
    21  	""
    22  )
// tsOperator is an enum that represents the different operators within a
// TSQuery. Its zero value, invalid, marks a term or node that is not an
// operator.
type tsOperator int
const (
	// Parentheses can be used to control nesting of the TSQuery operators.
	// Without parentheses, | binds least tightly,
	// then &, then <->, and ! most tightly.

	// invalid is the zero value of tsOperator. A tsTerm or tsNode whose
	// operator is invalid carries a lexeme rather than an operator.
	invalid tsOperator = iota
	// and is the & operator, which requires both of its operands to exist in
	// the searched document.
	and
	// or is the | operator, which requires one or more of its operands to exist
	// in the searched document.
	or
	// not is the ! operator, which requires that its single operand doesn't exist
	// in the searched document.
	not
	// followedby is the <-> operator. It can also be specified with a number like
	// <1> or <2> or <3>. It requires that the left operand is followed by the right
	// operand. The <-> and <1> forms mean that they should be directly followed
	// by each other. A number indicates how many terms away the operands should be.
	followedby
	// lparen and rparen are grouping operators. They're just used in parsing and
	// don't appear in the TSQuery tree.
	lparen
	rparen
)
    54  // precedence returns the parsing precedence of the receiver. A higher
    55  // precedence means that the operator binds more tightly.
    56  func (o tsOperator) precedence() int {
    57  	switch o {
    58  	case not:
    59  		return 4
    60  	case followedby:
    61  		return 3
    62  	case and:
    63  		return 2
    64  	case or:
    65  		return 1
    66  	}
    67  	panic(errors.AssertionFailedf("no precedence for operator %d", o))
    68  }
    70  func (o tsOperator) pgwireEncoding() byte {
    71  	switch o {
    72  	case not:
    73  		return 1
    74  	case and:
    75  		return 2
    76  	case or:
    77  		return 3
    78  	case followedby:
    79  		return 4
    80  	}
    81  	panic(errors.AssertionFailedf("no pgwire encoding for operator %d", o))
    82  }
    84  func (o tsOperator) String() string {
    85  	switch o {
    86  	case not:
    87  		return "!"
    88  	case and:
    89  		return "&"
    90  	case or:
    91  		return "|"
    92  	case followedby:
    93  		return "<->"
    94  	case lparen:
    95  		return "("
    96  	case rparen:
    97  		return ")"
    98  	}
    99  	panic(errors.AssertionFailedf("no string for operator %d", o))
   100  }
   102  func tsOperatorFromPgwireEncoding(b byte) (tsOperator, error) {
   103  	switch b {
   104  	case 1:
   105  		return not, nil
   106  	case 2:
   107  		return and, nil
   108  	case 3:
   109  		return or, nil
   110  	case 4:
   111  		return followedby, nil
   112  	}
   113  	return invalid, errors.AssertionFailedf("no operator for pgwire byte %d", b)
   114  }
// tsNode represents a single AST node within the tree of a TSQuery.
type tsNode struct {
	// Only one of term or op will be set.
	// If term is set, this is a leaf node containing a lexeme.
	term tsTerm
	// If op is set, this is an operator node: either not, and, or, or followedby.
	// A leaf node leaves op at its zero value, invalid.
	op tsOperator
	// followedN is set only when op is followedby. It indicates the number n
	// within the <n> operator, which means the number of terms separating the
	// left and the right argument.
	// At most 16384.
	followedN uint16

	// l is the left child of the node if op is set, or the only child if
	// op is set to "not".
	l *tsNode
	// r is the right child of the node if op is set. It is left nil when op is
	// "not", which has a single operand stored in l.
	r *tsNode
}
   136  func (n tsNode) String() string {
   137  	var buf strings.Builder
   138  	n.writeInfixString(&buf, 0)
   139  	return buf.String()
   140  }
   142  func (n tsNode) writeInfixString(buf *strings.Builder, parentPrecedence int) {
   143  	if n.op == invalid {
   144  		n.term.writeString(buf)
   145  		return
   146  	}
   147  	prec := n.op.precedence()
   148  	needParen := prec < parentPrecedence
   149  	if needParen {
   150  		buf.WriteString("( ")
   151  	}
   152  	switch n.op {
   153  	case not:
   154  		buf.WriteString("!")
   155  		n.l.writeInfixString(buf, prec)
   156  	default:
   157  		n.l.writeInfixString(buf, prec)
   158  		buf.WriteString(" ")
   159  		tsTerm{operator: n.op, followedN: n.followedN}.writeString(buf)
   160  		buf.WriteString(" ")
   161  		n.r.writeInfixString(buf, prec)
   162  	}
   163  	if needParen {
   164  		buf.WriteString(" )")
   165  	}
   166  }
   168  // UnambiguousString returns a string representation of this tsNode that wraps
   169  // all expressions with parentheses. It's just for testing.
   170  func (n tsNode) UnambiguousString() string {
   171  	switch n.op {
   172  	case invalid:
   173  		return n.term.lexeme
   174  	case not:
   175  		return fmt.Sprintf("!%s", n.l.UnambiguousString())
   176  	}
   177  	var buf strings.Builder
   178  	tsTerm{operator: n.op, followedN: n.followedN}.writeString(&buf)
   179  	return fmt.Sprintf("[%s%s%s]", n.l.UnambiguousString(), buf.String(), n.r.UnambiguousString())
   180  }
// TSQuery represents a tsNode AST root. A TSQuery is a tree of text search
// operators that can be run against a TSVector to produce a predicate of
// whether the query matched.
type TSQuery struct {
	// root is the top node of the query tree. It is nil for the zero-value
	// TSQuery, which String renders as the empty string.
	root *tsNode
}
   189  func (q TSQuery) String() string {
   190  	if q.root == nil {
   191  		return ""
   192  	}
   193  	return q.root.String()
   194  }
   196  // GetInvertedExpr returns the inverted expression that can be used to search
   197  // an index.
   198  func (q TSQuery) GetInvertedExpr() (expr inverted.Expression, err error) {
   199  	return q.root.getInvertedExpr()
   200  }
   202  func (n *tsNode) getInvertedExpr() (inverted.Expression, error) {
   203  	switch n.op {
   204  	case invalid:
   205  		// We're looking at a lexeme match.
   206  		// There are 3 options:
   207  		// 1. Normal match.
   208  		//    In this case, we make a tight and unique span.
   209  		// 2. Prefix match.
   210  		//    In this case, we make a non-unique, tight span that starts with the
   211  		//    prefix.
   212  		// 3. Weighted match.
   213  		//    In this case, we make the match non-tight, because we don't store the
   214  		//    weights of the lexemes in the index, and are forced to re-check
   215  		//    once we get the result from the inverted index.
   216  		// Note that options 2 and 3 can both be present.
   217  		var weight tsWeight
   218  		if len(n.term.positions) > 0 {
   219  			weight = n.term.positions[0].weight
   220  		}
   221  		key := EncodeInvertedIndexKey(nil /* inKey */, n.term.lexeme)
   222  		var span inverted.Span
   224  		prefixMatch := weight&weightStar != 0
   225  		if prefixMatch {
   226  			span = inverted.Span{
   227  				Start: key,
   228  				End:   EncodeInvertedIndexKey(nil /* inKey */, string(keysbase.PrefixEnd([]byte(n.term.lexeme)))),
   229  			}
   230  		} else {
   231  			span = inverted.MakeSingleValSpan(key)
   232  		}
   233  		invertedExpr := inverted.ExprForSpan(span, true /* tight */)
   234  		if !prefixMatch {
   235  			// If we don't have a prefix match we also can set unique=true.
   236  			invertedExpr.Unique = true
   237  		}
   239  		if weight != 0 && weight != weightStar {
   240  			// Some weights are set.
   241  			invertedExpr.SetNotTight()
   242  		}
   243  		return invertedExpr, nil
   244  	case followedby:
   245  		fallthrough
   246  	case and:
   247  		l, lErr := n.l.getInvertedExpr()
   248  		r, rErr := n.r.getInvertedExpr()
   249  		if lErr != nil && rErr != nil {
   250  			// We need a positive match on at least one side.
   251  			return nil, lErr
   252  		} else if lErr != nil {
   253  			// An error on one side means we have to re-check that side's condition
   254  			// later.
   255  			r.SetNotTight()
   256  			//nolint:returnerrcheck
   257  			return r, nil
   258  		} else if rErr != nil {
   259  			// Ditto above.
   260  			l.SetNotTight()
   261  			//nolint:returnerrcheck
   262  			return l, nil
   263  		}
   264  		expr := inverted.And(l, r)
   265  		if n.op == followedby {
   266  			// If we have a followedby match, we have to re-check the results of the
   267  			// match after we get them from the inverted index - just because both
   268  			// terms are present doesn't mean they're properly next to each other,
   269  			// and the index doesn't store position information at all.
   270  			expr.SetNotTight()
   271  		}
   272  		return expr, nil
   273  	case or:
   274  		l, lErr := n.l.getInvertedExpr()
   275  		r, rErr := n.r.getInvertedExpr()
   276  		if lErr != nil {
   277  			// We need a positive match on both sides, so we return an error here.
   278  			// For example, searching for a | !b would require a full scan, since some
   279  			// documents could match that contain neither a nor b.
   280  			return nil, lErr
   281  		} else if rErr != nil {
   282  			return nil, rErr
   283  		}
   284  		return inverted.Or(l, r), nil
   285  	case not:
   286  		// A not would require more advanced machinery than we have, so for now
   287  		// we'll just assume we can't perform an inverted expression search on a
   288  		// not. Note that a nested not would make it possible, but we are ignoring
   289  		// this case for now as it seems marginal.
   290  		return nil, errors.New("unable to create inverted expr for not")
   291  	}
   292  	return nil, errors.AssertionFailedf("invalid operator %d", n.op)
   293  }
   295  func lexTSQuery(input string) (TSVector, error) {
   296  	parser := tsVectorLexer{
   297  		input:   input,
   298  		state:   expectingTerm,
   299  		tsQuery: true,
   300  	}
   302  	return parser.lex()
   303  }
   305  // ParseTSQuery produces a TSQuery from an input string.
   306  func ParseTSQuery(input string) (TSQuery, error) {
   307  	terms, err := lexTSQuery(input)
   308  	if err != nil {
   309  		return TSQuery{}, err
   310  	}
   312  	// Now create the operator tree.
   313  	queryParser := tsQueryParser{terms: terms, input: input}
   314  	return queryParser.parse()
   315  }
// tsQueryParser is a parser that operates on a set of lexed tokens, represented
// as the tsTerms in a TSVector.
type tsQueryParser struct {
	// input is the original query text; it is only used in error messages.
	input string
	// terms holds the tokens that have not been consumed yet. nextTerm advances
	// it by re-slicing past the first element.
	terms TSVector
}
   324  func (p tsQueryParser) peek() (*tsTerm, bool) {
   325  	if len(p.terms) == 0 {
   326  		return nil, false
   327  	}
   328  	return &p.terms[0], true
   329  }
   331  func (p *tsQueryParser) nextTerm() (*tsTerm, bool) {
   332  	if len(p.terms) == 0 {
   333  		return nil, false
   334  	}
   335  	ret := &p.terms[0]
   336  	p.terms = p.terms[1:]
   337  	return ret, true
   338  }
   340  func (p *tsQueryParser) parse() (TSQuery, error) {
   341  	expr, err := p.parseTSExpr(0)
   342  	if err != nil {
   343  		return TSQuery{}, err
   344  	}
   345  	if len(p.terms) > 0 {
   346  		_, err := p.syntaxError()
   347  		return TSQuery{}, err
   348  	}
   349  	return TSQuery{root: expr}, nil
   350  }
   352  // parseTSExpr is a "Pratt parser" which constructs a query tree out of the
   353  // lexed tsTerms, respecting the precedence of the tsOperators.
   354  // See this nice article about Pratt parsing, which this parser was adapted from:
   355  //
   356  func (p *tsQueryParser) parseTSExpr(minBindingPower int) (*tsNode, error) {
   357  	t, ok := p.nextTerm()
   358  	if !ok {
   359  		return nil, pgerror.Newf(pgcode.Syntax, "text-search query doesn't contain lexemes: %s", p.input)
   360  	}
   362  	// First section: grab either atoms, nots, or parens.
   363  	var lExpr *tsNode
   364  	switch t.operator {
   365  	case invalid:
   366  		lExpr = &tsNode{term: *t}
   367  	case lparen:
   368  		expr, err := p.parseTSExpr(0)
   369  		if err != nil {
   370  			return nil, err
   371  		}
   372  		nextTerm, ok := p.nextTerm()
   373  		if !ok || nextTerm.operator != rparen {
   374  			return p.syntaxError()
   375  		}
   376  		lExpr = expr
   377  	case not:
   378  		expr, err := p.parseTSExpr(t.operator.precedence())
   379  		if err != nil {
   380  			return nil, err
   381  		}
   382  		lExpr = &tsNode{op: not, l: expr}
   383  	default:
   384  		return p.syntaxError()
   385  	}
   387  	// Now we do our "Pratt parser loop".
   388  	for {
   389  		next, ok := p.peek()
   390  		if !ok {
   391  			return lExpr, nil
   392  		}
   393  		switch next.operator {
   394  		case and, or, followedby:
   395  		default:
   396  			return lExpr, nil
   397  		}
   398  		precedence := next.operator.precedence()
   399  		if precedence < minBindingPower {
   400  			break
   401  		}
   402  		p.nextTerm()
   403  		rExpr, err := p.parseTSExpr(precedence)
   404  		if err != nil {
   405  			return nil, err
   406  		}
   407  		lExpr = &tsNode{op: next.operator, followedN: next.followedN, l: lExpr, r: rExpr}
   408  	}
   409  	return lExpr, nil
   410  }
   412  func (p *tsQueryParser) syntaxError() (*tsNode, error) {
   413  	return nil, pgerror.Newf(pgcode.Syntax, "syntax error in TSQuery: %s", p.input)
   414  }
// ToTSQuery implements the to_tsquery builtin, which lexes an input, performs
// stopwording and normalization on the tokens, and returns a parsed query.
// No operator is interposed between tokens: the input must supply its own
// operators.
func ToTSQuery(config string, input string) (TSQuery, error) {
	return toTSQuery(config, invalid, input)
}
// PlainToTSQuery implements the plainto_tsquery builtin, which lexes an input,
// performs stopwording and normalization on the tokens, and returns a parsed
// query, interposing the & operator between each token.
func PlainToTSQuery(config string, input string) (TSQuery, error) {
	return toTSQuery(config, and, input)
}
// PhraseToTSQuery implements the phraseto_tsquery builtin, which lexes an input,
// performs stopwording and normalization on the tokens, and returns a parsed
// query, interposing the <-> operator between each token.
func PhraseToTSQuery(config string, input string) (TSQuery, error) {
	return toTSQuery(config, followedby, input)
}
   436  // toTSQuery implements the to_tsquery builtin, which lexes an input,
   437  // performs stopwording and normalization on the tokens, and returns a parsed
   438  // query. If the interpose operator is not invalid, it's interposed between each
   439  // token in the input.
   440  func toTSQuery(config string, interpose tsOperator, input string) (TSQuery, error) {
   441  	vector, err := lexTSQuery(input)
   442  	if err != nil {
   443  		return TSQuery{}, err
   444  	}
   445  	tokens := make(TSVector, 0, len(vector))
   446  	foundStopwords := false
   447  	for i := range vector {
   448  		tok := vector[i]
   450  		foundOperator := tok.operator != invalid
   451  		var lexemeTokens []string
   453  		if !foundOperator {
   454  			// Try parsing the token.
   455  			lexemeTokens = TSParse(tok.lexeme)
   456  		}
   458  		// If we found an operator or were able to parse lexemes from the token,
   459  		// add the interpose operator if there is one.
   460  		if interpose != invalid && i > 0 && (foundOperator || len(lexemeTokens) > 0) {
   461  			term := tsTerm{operator: interpose}
   462  			if interpose == followedby {
   463  				term.followedN = 1
   464  			}
   465  			tokens = append(tokens, term)
   466  		}
   468  		if foundOperator {
   469  			tokens = append(tokens, tok)
   470  			continue
   471  		}
   473  		if len(lexemeTokens) == 0 {
   474  			// We ate some whitespace or whitespace-like text with no tokens.
   475  			continue
   476  		}
   478  		// When we support more than just the simple configuration, we'll also
   479  		// want to remove stopwords, which will affect the interposing, but we can
   480  		// worry about that later.
   481  		// Additionally, if we're doing phraseto_tsquery, if we remove a stopword,
   482  		// we need to make sure to increase the "followedN" of the followedby
   483  		// operator. For example, phraseto_tsquery('hello a deer') will return
   484  		// 'hello <2> deer', since the a stopword would be removed.
   486  		tokInterpose := interpose
   487  		if tokInterpose == invalid {
   488  			tokInterpose = followedby
   489  		}
   490  		for j := range lexemeTokens {
   491  			if j > 0 {
   492  				// We found more than one lexeme in our token, so we need to add all of them
   493  				// to the query, connected by our interpose operator.
   494  				// If we aren't running with an interpose, like in to_tsquery, Postgres
   495  				// uses the <-> operator to connect multiple lexemes from a single token.
   496  				term := tsTerm{operator: tokInterpose}
   497  				if tokInterpose == followedby {
   498  					term.followedN = 1
   499  				}
   500  				tokens = append(tokens, term)
   501  			}
   502  			lexeme, stopWord, err := TSLexize(config, lexemeTokens[j])
   503  			if err != nil {
   504  				return TSQuery{}, err
   505  			}
   506  			if stopWord {
   507  				foundStopwords = true
   508  			}
   509  			tokens = append(tokens, tsTerm{lexeme: lexeme, positions: tok.positions})
   510  		}
   511  	}
   513  	// Now create the operator tree.
   514  	queryParser := tsQueryParser{terms: tokens, input: input}
   515  	query, err := queryParser.parse()
   516  	if err != nil {
   517  		return query, err
   518  	}
   520  	if foundStopwords {
   521  		query = cleanupStopwords(query)
   522  		if query.root == nil {
   523  			return query, pgerror.Newf(pgcode.Syntax, "text-search query doesn't contain lexemes: %s", input)
   524  		}
   525  	}
   526  	return query, err
   527  }
   529  func cleanupStopwords(query TSQuery) TSQuery {
   530  	query.root, _, _ = cleanupStopword(query.root)
   531  	if query.root == nil {
   532  		return TSQuery{}
   533  	}
   534  	return query
   535  }
   537  // cleanupStopword cleans up a query tree by removing stop words and adjusting
   538  // the width of the followedby operators to account for removed stop words.
   539  // It returns the new root of the tree, and the amount to add to a followedBy
   540  // distance to the left and right of the input node.
   541  //
   542  // This function parallels the clean_stopword_intree function in Postgres.
   543  // What follows is a reproduction of the explanation of this function in
   544  // Postgres.
   546  // When we remove a phrase operator due to removing one or both of its
   547  // arguments, we might need to adjust the distance of a parent phrase
   548  // operator.  For example, 'a' is a stopword, so:
   549  //
   550  //	(b <-> a) <-> c  should become	b <2> c
   551  //	b <-> (a <-> c)  should become	b <2> c
   552  //	(b <-> (a <-> a)) <-> c  should become	b <3> c
   553  //	b <-> ((a <-> a) <-> c)  should become	b <3> c
   554  //
   555  // To handle that, we define two output parameters:
   556  //
   557  //	ladd: amount to add to a phrase distance to the left of this node
   558  //	radd: amount to add to a phrase distance to the right of this node
   559  //
   560  // We need two outputs because we could need to bubble up adjustments to two
   561  // different parent phrase operators. Consider
   562  //
   563  //	w <-> (((a <-> x) <2> (y <3> a)) <-> z)
   564  //
   565  // After we've removed the two a's and are considering the <2> node (which is
   566  // now just x <2> y), we have an ladd distance of 1 that needs to propagate
   567  // up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
   568  // propagate to the rightmost <->, so that we'll end up with
   569  //
   570  //	w <2> ((x <2> y) <4> z)
   571  //
   572  // Near the bottom of the tree, we may have subtrees consisting only of
   573  // stopwords.  The distances of any phrase operators within such a subtree are
   574  // summed and propagated to both ladd and radd, since we don't know which side
   575  // of the lowest surviving phrase operator we are in.  The rule is that any
   576  // subtree that degenerates to NULL must return equal values of ladd and radd,
   577  // and the parent node dealing with it should incorporate only one of those.
   578  //
   579  // Currently, we only implement this adjustment for adjacent phrase operators.
   580  // Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
   581  // isn't ideal, but there is no way to represent the really desired semantics
   582  // without some redesign of the tsquery structure.  Certainly it would not be
   583  // any better to convert that to 'x <2> (y | z)'.  Since this is such a weird
   584  // corner case, let it go for now.  But we can fix it in cases where the
   585  // intervening non-phrase operator also gets removed, for example
   586  // '((x <-> a) | a) <-> y' will become 'x <2> y'.
   587  func cleanupStopword(node *tsNode) (ret *tsNode, lAdd int, rAdd int) {
   588  	if node.op == invalid {
   589  		if node.term.lexeme == "" {
   590  			// Found a stop word.
   591  			return nil, 0, 0
   592  		}
   593  		return node, 0, 0
   594  	}
   595  	if node.op == not {
   596  		// Not doesn't change the pattern width, so just report child distances.
   597  		node.l, lAdd, rAdd = cleanupStopword(node.l)
   598  		if node.l == nil {
   599  			return nil, lAdd, rAdd
   600  		}
   601  		return node, lAdd, rAdd
   602  	}
   604  	var llAdd, lrAdd, rlAdd, rrAdd int
   605  	node.l, llAdd, lrAdd = cleanupStopword(node.l)
   606  	node.r, rlAdd, rrAdd = cleanupStopword(node.r)
   607  	isPhrase := node.op == followedby
   608  	followedN := node.followedN
   609  	if node.l == nil && node.r == nil {
   610  		// Removing an entire node. Propagate its distance into both lAdd and rAdd;
   611  		// it is the responsibility of the parent to count it only once.
   612  		if isPhrase {
   613  			// If we're a followed by, sum up the children lengths and propagate.
   614  			// Distances coming from children are summed and propagated up to the
   615  			// parent (we assume llAdd == lrAdd and rlAdd == rrAdd, else rule was
   616  			// broken at a lower level).
   617  			lAdd = llAdd + int(followedN) + rlAdd
   618  			rAdd = lAdd
   619  		} else {
   620  			// If not, we take the max. This corresponds to the logic in evalWithinFollowedBy.
   621  			lAdd = llAdd
   622  			if rlAdd > lAdd {
   623  				lAdd = rlAdd
   624  			}
   625  			rAdd = lAdd
   626  		}
   627  		return nil, lAdd, rAdd
   628  	} else if node.l == nil {
   629  		// Remove this operator and the left node.
   630  		if isPhrase {
   631  			// Operator's own distance must propagate to the left.
   632  			return node.r, llAdd + int(followedN) + rlAdd, rrAdd
   633  		} else {
   634  			// At non-followedby op, just forget the left node entirely.
   635  			return node.r, rlAdd, rrAdd
   636  		}
   637  	} else if node.r == nil {
   638  		// Remove this operator and the right node.
   639  		if isPhrase {
   640  			// Operator's own distance must propagate to the right.
   641  			return node.l, llAdd, lrAdd + int(followedN) + rrAdd
   642  		} else {
   643  			// At non-followedby op, just forget the right node entirely.
   644  			return node.l, llAdd, lrAdd
   645  		}
   646  	} else if isPhrase {
   647  		// Add the adjusted values to this operator.
   648  		node.followedN += uint16(lrAdd + rlAdd)
   649  		// Continue to propagate unaccounted-for adjustments.
   650  		return node, llAdd, rrAdd
   651  	}
   652  	// Otherwise we found a non-phrase operator; keep it as-is.
   653  	return node, 0, 0
   654  }