github.com/cockroachdb/cockroachdb-parser@v0.23.3-0.20240213214944-911057d40c9a/pkg/util/tsearch/lex.go (about)

     1  // Copyright 2022 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package tsearch
    12  
    13  import (
    14  	"sort"
    15  	"strconv"
    16  	"unicode"
    17  	"unicode/utf8"
    18  
    19  	"github.com/cockroachdb/cockroachdb-parser/pkg/sql/pgwire/pgcode"
    20  	"github.com/cockroachdb/cockroachdb-parser/pkg/sql/pgwire/pgerror"
    21  )
    22  
    23  type tsVectorParseState int
    24  
    25  const (
    26  	// Waiting for term (whitespace, ', or any other char)
    27  	expectingTerm tsVectorParseState = iota
    28  	// Inside of a normal term (single quotes are processed as normal chars)
    29  	insideNormalTerm
    30  	// Inside of a ' term
    31  	insideQuoteTerm
    32  	// Finished with ' term (waiting for : or space)
    33  	finishedQuoteTerm
    34  	// Found a colon (or comma) and expecting a position
    35  	expectingPosList
    36  	// Finished parsing a position, expecting a comma or whitespace
    37  	expectingPosDelimiter
    38  )
    39  
    40  // tsVectorLexer is a lexing state machine for the TSVector and TSQuery input
    41  // formats. See the comment above lex() for more details.
    42  type tsVectorLexer struct {
    43  	input   string
    44  	lastLen int
    45  	pos     int
    46  	state   tsVectorParseState
    47  
    48  	// If true, we're in "TSQuery lexing mode"
    49  	tsQuery bool
    50  }
    51  
    52  func (p *tsVectorLexer) back() {
    53  	p.pos -= p.lastLen
    54  	p.lastLen = 0
    55  }
    56  
    57  func (p *tsVectorLexer) advance() rune {
    58  	r, n := utf8.DecodeRuneInString(p.input[p.pos:])
    59  	p.pos += n
    60  	p.lastLen = n
    61  	return r
    62  }
    63  
    64  const (
    65  	// The maximum number of bytes in a TSVector.
    66  	maxTSVectorLen = (1 << 20) - 1
    67  	// The maximum number of positions in a TSVector position list.
    68  	maxTSVectorPositions = 256
    69  	// The maximum number within a <> followed-by declaration.
    70  	maxTSVectorFollowedBy = 1 << 14
    71  	// The maximum size of a TSVector lexeme.
    72  	maxTSVectorLexemeLen = (1 << 14) - 1
    73  	// The maximum position within a TSVector position list.
    74  	maxTSVectorPosition = (1 << 14) - 1
    75  )
    76  
    77  // lex lexes the input in the receiver according to the TSVector "grammar", or
    78  // according the TSQuery "grammar" if tsQuery is set to true.
    79  //
    80  // A simple TSVector input could look like this:
    81  //
    82  // foo bar:3 baz:3A 'blah :blah'
    83  //
    84  // A TSVector is a list of terms.
    85  //
    86  // Each term is a word and an optional "position list".
    87  //
    88  // A word may be single-quote wrapped, in which case the next term may begin
    89  // without any whitespace in between (if there is no position list on the word).
    90  // In a single-quote wrapped word, the word must terminate with a single quote.
    91  // All other characters are treated as literals. Backlashes can be used to
    92  // escape single quotes, and are otherwise skipped, allowing the following
    93  // character to be included as a literal (such as the backslash character itself).
    94  //
    95  // If a word is not single-quote wrapped, the next term will begin if there is
    96  // whitespace after the word. Whitespace and colons may be entered by escaping
    97  // them with backslashes. All other uses of backslashes are skipped, allowing
    98  // the following character to be included as a literal.
    99  //
   100  // A word is delimited from its position list with a colon.
   101  //
   102  // A position list is made up of a comma-delimited list of numbers, each
   103  // of which may have an optional "strength" which is a letter from A-D.
   104  //
   105  // In TSQuery mode, there are a few differences:
   106  //   - Terms must be separated with tsOperators (!, <->, |, &), not just spaces.
   107  //   - Terms may be surrounded by the ( ) grouping tokens.
   108  //   - Terms cannot include multiple positions.
   109  //   - Terms can include more than one "strength", as well as the * prefix search
   110  //     operator. For example, foo:3AC*
   111  //
   112  // See examples in tsvector_test.go and tsquery_test.go, and see the
   113  // documentation in tsvector.go for more information and a link to the Postgres
   114  // documentation that is the spec for all of this behavior.
   115  func (p tsVectorLexer) lex() (TSVector, error) {
   116  	// termBuf will be reused as a temporary buffer to assemble each term before
   117  	// copying into the vector.
   118  	termBuf := make([]rune, 0, 32)
   119  	ret := TSVector{}
   120  
   121  	if len(p.input) >= maxTSVectorLen {
   122  		typ := "tsvector"
   123  		if p.tsQuery {
   124  			typ = "tsquery"
   125  		}
   126  		return nil, pgerror.Newf(pgcode.ProgramLimitExceeded,
   127  			"string is too long for %s (%d bytes, max %d bytes)",
   128  			typ, len(p.input), maxTSVectorLen)
   129  	}
   130  
   131  	for p.pos < len(p.input) {
   132  		r := p.advance()
   133  		switch p.state {
   134  		case expectingTerm:
   135  			// Expect either a single quote, a whitespace, or anything else.
   136  			if r == '\'' {
   137  				p.state = insideQuoteTerm
   138  				continue
   139  			}
   140  			if unicode.IsSpace(r) {
   141  				continue
   142  			}
   143  
   144  			if p.tsQuery {
   145  				// Check for &, |, !, and <-> (or <number>)
   146  				switch r {
   147  				case '&':
   148  					ret = append(ret, tsTerm{operator: and})
   149  					continue
   150  				case '|':
   151  					ret = append(ret, tsTerm{operator: or})
   152  					continue
   153  				case '!':
   154  					ret = append(ret, tsTerm{operator: not})
   155  					continue
   156  				case '(':
   157  					ret = append(ret, tsTerm{operator: lparen})
   158  					continue
   159  				case ')':
   160  					ret = append(ret, tsTerm{operator: rparen})
   161  					continue
   162  				case '<':
   163  					r = p.advance()
   164  					n := 1
   165  					if r == '-' {
   166  						r = p.advance()
   167  					} else {
   168  						for unicode.IsNumber(r) {
   169  							termBuf = append(termBuf, r)
   170  							r = p.advance()
   171  						}
   172  						var err error
   173  						n, err = strconv.Atoi(string(termBuf))
   174  						if n > maxTSVectorFollowedBy || n < 0 {
   175  							return nil, pgerror.Newf(pgcode.InvalidParameterValue,
   176  								"distance in phrase operator must be an integer value between zero and %d inclusive", maxTSVectorFollowedBy)
   177  						}
   178  						termBuf = termBuf[:0]
   179  						if err != nil {
   180  							return p.syntaxError()
   181  						}
   182  					}
   183  					if r != '>' {
   184  						return p.syntaxError()
   185  					}
   186  					ret = append(ret, tsTerm{operator: followedby, followedN: uint16(n)})
   187  					continue
   188  				}
   189  			}
   190  
   191  			p.state = insideNormalTerm
   192  			// Need to consume the rune we just found again.
   193  			p.back()
   194  			continue
   195  
   196  		case insideQuoteTerm:
   197  			// If escaped, eat character and continue.
   198  			switch r {
   199  			case '\\':
   200  				r = p.advance()
   201  				termBuf = append(termBuf, r)
   202  				continue
   203  			case '\'':
   204  				term, err := newLexemeTerm(string(termBuf))
   205  				if err != nil {
   206  					return nil, err
   207  				}
   208  				ret = append(ret, term)
   209  				termBuf = termBuf[:0]
   210  				p.state = finishedQuoteTerm
   211  				continue
   212  			}
   213  			termBuf = append(termBuf, r)
   214  		case finishedQuoteTerm:
   215  			if unicode.IsSpace(r) {
   216  				p.state = expectingTerm
   217  			} else if r == ':' {
   218  				lastTerm := &ret[len(ret)-1]
   219  				lastTerm.positions = append(lastTerm.positions, tsPosition{})
   220  				p.state = expectingPosList
   221  			} else {
   222  				p.state = expectingTerm
   223  				p.back()
   224  			}
   225  		case insideNormalTerm:
   226  			// If escaped, eat character and continue.
   227  			if r == '\\' {
   228  				r = p.advance()
   229  				termBuf = append(termBuf, r)
   230  				continue
   231  			}
   232  
   233  			if p.tsQuery {
   234  				switch r {
   235  				case '&', '!', '|', '<', '(', ')':
   236  					// These are all "operators" in the TSQuery language. End the current
   237  					// term and start a new one.
   238  					term, err := newLexemeTerm(string(termBuf))
   239  					if err != nil {
   240  						return nil, err
   241  					}
   242  					ret = append(ret, term)
   243  					termBuf = termBuf[:0]
   244  					p.state = expectingTerm
   245  					p.back()
   246  					continue
   247  				}
   248  			}
   249  
   250  			// Colon that comes first is an ordinary character.
   251  			space := unicode.IsSpace(r)
   252  			if space || r == ':' && len(termBuf) > 0 {
   253  				// Found a terminator.
   254  				// Copy the termBuf into the vector, resize the termBuf, continue on.
   255  				term, err := newLexemeTerm(string(termBuf))
   256  				if err != nil {
   257  					return nil, err
   258  				}
   259  				if r == ':' {
   260  					term.positions = append(term.positions, tsPosition{})
   261  				}
   262  				ret = append(ret, term)
   263  				termBuf = termBuf[:0]
   264  				if space {
   265  					p.state = expectingTerm
   266  				} else {
   267  					p.state = expectingPosList
   268  				}
   269  				continue
   270  			}
   271  			if p.tsQuery && r == ':' {
   272  				return p.syntaxError()
   273  			}
   274  			termBuf = append(termBuf, r)
   275  		case expectingPosList:
   276  			var pos int
   277  			if !p.tsQuery {
   278  				// If we have nothing in our termBuf, we need to see at least one number.
   279  				if unicode.IsNumber(r) {
   280  					termBuf = append(termBuf, r)
   281  					continue
   282  				}
   283  				if len(termBuf) == 0 {
   284  					return p.syntaxError()
   285  				}
   286  				var err error
   287  				pos, err = strconv.Atoi(string(termBuf))
   288  				if err != nil {
   289  					return p.syntaxError()
   290  				}
   291  				if pos == 0 {
   292  					return ret, pgerror.Newf(pgcode.Syntax, "wrong position info in TSVector", p.input)
   293  				} else if pos > maxTSVectorPosition {
   294  					// Postgres silently truncates positions larger than 16383 to 16383.
   295  					pos = maxTSVectorPosition
   296  				}
   297  				termBuf = termBuf[:0]
   298  			}
   299  			lastTerm := &ret[len(ret)-1]
   300  			lastTermPos := len(lastTerm.positions) - 1
   301  			lastTerm.positions[lastTermPos].position = uint16(pos)
   302  			if unicode.IsSpace(r) {
   303  				// Done with our term. Advance to next term!
   304  				p.state = expectingTerm
   305  				continue
   306  			}
   307  			switch r {
   308  			case ',':
   309  				if p.tsQuery {
   310  					// Not valid! No , allowed in position lists in tsqueries.
   311  					return ret, pgerror.Newf(pgcode.Syntax, "syntax error in TSVector: %s", p.input)
   312  				}
   313  				lastTerm.positions = append(lastTerm.positions, tsPosition{})
   314  				// Expecting another number next.
   315  				continue
   316  			case '*':
   317  				if p.tsQuery {
   318  					lastTerm.positions[lastTermPos].weight |= weightStar
   319  				} else {
   320  					p.state = expectingPosDelimiter
   321  					lastTerm.positions[lastTermPos].weight |= weightA
   322  				}
   323  			case 'a', 'A':
   324  				if !p.tsQuery {
   325  					p.state = expectingPosDelimiter
   326  				}
   327  				lastTerm.positions[lastTermPos].weight |= weightA
   328  			case 'b', 'B':
   329  				if !p.tsQuery {
   330  					p.state = expectingPosDelimiter
   331  				}
   332  				lastTerm.positions[lastTermPos].weight |= weightB
   333  			case 'c', 'C':
   334  				if !p.tsQuery {
   335  					p.state = expectingPosDelimiter
   336  				}
   337  				lastTerm.positions[lastTermPos].weight |= weightC
   338  			case 'd', 'D':
   339  				// Weight D is handled differently in TSQuery parsing than TSVector. In
   340  				// TSVector parsing, the default is already D - so we don't record any
   341  				// weight at all. This matches Postgres behavior - a default D weight is
   342  				// not printed or stored. In TSQuery, we have to record it explicitly.
   343  				if p.tsQuery {
   344  					lastTerm.positions[lastTermPos].weight |= weightD
   345  				} else {
   346  					p.state = expectingPosDelimiter
   347  				}
   348  			default:
   349  				return p.syntaxError()
   350  			}
   351  		case expectingPosDelimiter:
   352  			if r == ',' {
   353  				p.state = expectingPosList
   354  				lastTerm := &ret[len(ret)-1]
   355  				lastTerm.positions = append(lastTerm.positions, tsPosition{})
   356  			} else if unicode.IsSpace(r) {
   357  				p.state = expectingTerm
   358  			} else {
   359  				return p.syntaxError()
   360  			}
   361  		default:
   362  			panic("invalid TSVector lex state")
   363  		}
   364  	}
   365  	// Reached the end of the string.
   366  	switch p.state {
   367  	case insideQuoteTerm:
   368  		// Unfinished quote term.
   369  		return p.syntaxError()
   370  	case insideNormalTerm:
   371  		// Finish normal term.
   372  		term, err := newLexemeTerm(string(termBuf))
   373  		if err != nil {
   374  			return nil, err
   375  		}
   376  		ret = append(ret, term)
   377  	case expectingPosList:
   378  		// Finish number.
   379  		if !p.tsQuery {
   380  			if len(termBuf) == 0 {
   381  				return p.syntaxError()
   382  			}
   383  			pos, err := strconv.Atoi(string(termBuf))
   384  			if err != nil {
   385  				return p.syntaxError()
   386  			}
   387  			if pos == 0 {
   388  				return ret, pgerror.Newf(pgcode.Syntax, "wrong position info in TSVector", p.input)
   389  			} else if pos > maxTSVectorPosition {
   390  				// Postgres silently truncates positions larger than 16383 to 16383.
   391  				pos = maxTSVectorPosition
   392  			}
   393  			lastTerm := &ret[len(ret)-1]
   394  			lastTerm.positions[len(lastTerm.positions)-1].position = uint16(pos)
   395  		}
   396  	case expectingTerm, finishedQuoteTerm:
   397  		// We are good to go, we just finished a term and nothing needs to be cleaned up.
   398  	case expectingPosDelimiter:
   399  		// We are good to go, we just finished a position and nothing needs to be cleaned up.
   400  	default:
   401  		panic("invalid TSVector lex state")
   402  	}
   403  	for _, t := range ret {
   404  		sort.Slice(t.positions, func(i, j int) bool {
   405  			return t.positions[i].position < t.positions[j].position
   406  		})
   407  	}
   408  	return ret, nil
   409  }
   410  
   411  func (p *tsVectorLexer) syntaxError() (TSVector, error) {
   412  	typ := "TSVector"
   413  	if p.tsQuery {
   414  		typ = "TSQuery"
   415  	}
   416  	return TSVector{}, pgerror.Newf(pgcode.Syntax, "syntax error in %s: %s", typ, p.input)
   417  }