github.com/jgbaldwinbrown/perf@v0.1.1/benchproc/internal/parse/tok.go

// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package parse

import (
	"errors"
	"fmt"
	"regexp"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
)

// A SyntaxError is an error produced by parsing a malformed expression.
type SyntaxError struct {
	Query string // The original query string
	Off   int    // Byte offset of the error in Query
	Msg   string // Error message
}

func (e *SyntaxError) Error() string {
	// Translate the byte offset to a column by counting the
	// graphic runes before it.
	pos := 0
	for i, r := range e.Query {
		if i >= e.Off {
			break
		}
		if unicode.IsGraphic(r) {
			pos++
		}
	}
	return fmt.Sprintf("syntax error: %s\n\t%s\n\t%*s^", e.Msg, e.Query, pos, "")
}
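
// For example, the malformed query `a: "b` produces a SyntaxError
// whose Error method renders a caret under the unterminated quote:
//
//	syntax error: missing end quote
//		a: "b
//		   ^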

// An errorTracker records the first syntax error encountered while
// tokenizing a query.
type errorTracker struct {
	qOrig string
	err   *SyntaxError
}

// error records a syntax error at the position implied by q, the
// remaining untokenized suffix of qOrig.
func (t *errorTracker) error(q string, msg string) {
	off := len(t.qOrig) - len(q)
	if t.err == nil {
		t.err = &SyntaxError{t.qOrig, off, msg}
	}
}

// A tok is a single token in the filter/projection lexical syntax.
type tok struct {
	// Kind specifies the category of this token. It is either 'w'
	// or 'q' for an unquoted or quoted word, respectively, 'r'
	// for a regexp, 'A' or 'O' for the keywords AND and OR, an
	// operator character, or 0 for the end-of-string token.
	Kind   byte
	Off    int    // Byte offset of the beginning of this token
	Tok    string // Literal token contents; quoted words are unescaped
	Regexp *regexp.Regexp
}

type tokenizer struct {
	q    string
	errt *errorTracker
}

func newTokenizer(q string) tokenizer {
	return tokenizer{q, &errorTracker{q, nil}}
}

func isOp(ch rune) bool {
	return ch == '(' || ch == ')' || ch == ':' || ch == '@' || ch == ','
}

// At the beginning of a word, we accept "-" and "*" as operators,
// but in the middle of words we treat them as part of the word.
func isStartOp(ch rune) bool {
	return isOp(ch) || ch == '-' || ch == '*'
}

// isSpace returns the byte length of the space rune at the start of
// q, or 0 if q does not start with a space. q must be non-empty.
func isSpace(q string) int {
	if q[0] == ' ' {
		return 1
	}
	r, size := utf8.DecodeRuneInString(q)
	if unicode.IsSpace(r) {
		return size
	}
	return 0
}

// keyOrOp returns the next key or operator token.
// A key may be a bare word or a quoted word.
func (t *tokenizer) keyOrOp() (tok, tokenizer) {
	return t.next(false)
}

// valueOrOp returns the next value or operator token.
// A value may be a bare word, a quoted word, or a regexp.
func (t *tokenizer) valueOrOp() (tok, tokenizer) {
	return t.next(true)
}

// end asserts that t has reached the end of the token stream. If it
// has not, it returns a tokenizer that reports an error.
func (t *tokenizer) end() tokenizer {
	if tok, _ := t.keyOrOp(); tok.Kind != 0 {
		_, t2 := t.error("unexpected " + strconv.Quote(tok.Tok))
		return t2
	}
	return *t
}

// next scans and returns the next token, along with a tokenizer for
// the remaining input. If allowRegexp is set, a leading "/" starts a
// regexp token.
func (t *tokenizer) next(allowRegexp bool) (tok, tokenizer) {
	for len(t.q) > 0 {
		if isStartOp(rune(t.q[0])) {
			return t.tok(t.q[0], t.q[:1], t.q[1:])
		} else if n := isSpace(t.q); n > 0 {
			t.q = t.q[n:]
		} else if allowRegexp && t.q[0] == '/' {
			return t.regexp()
		} else if t.q[0] == '"' {
			return t.quotedWord()
		} else {
			return t.bareWord()
		}
	}
	// Add an EOF token. This eliminates the need for lots of
	// bounds checks in the parser and gives the EOF a position.
	return t.tok(0, "", "")
}
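
// For example, the query `.name:/alloc/ OR unit:ns` tokenizes as the
// word ".name", the operator ':', the regexp "alloc", the keyword OR,
// the word "unit", the operator ':', the word "ns", and finally the
// Kind-0 end-of-string token.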

// tok constructs a token of the given kind at the current offset and
// returns it along with a tokenizer for rest, the remaining input.
func (t *tokenizer) tok(kind byte, token string, rest string) (tok, tokenizer) {
	off := len(t.errt.qOrig) - len(t.q)
	return tok{kind, off, token, nil}, tokenizer{rest, t.errt}
}

// error records a syntax error at the current position and returns an
// EOF token so the caller stops consuming input.
func (t *tokenizer) error(msg string) (tok, tokenizer) {
	t.errt.error(t.q, msg)
	// Move to the end.
	return t.tok(0, "", "")
}

// quotedWord scans a double-quoted word and unescapes its contents.
func (t *tokenizer) quotedWord() (tok, tokenizer) {
	pos := 1 // Skip initial "
	for pos < len(t.q) && (t.q[pos] != '"' || t.q[pos-1] == '\\') {
		pos++
	}
	if pos == len(t.q) {
		return t.error("missing end quote")
	}
	// Parse the quoted string.
	word, err := strconv.Unquote(t.q[:pos+1])
	if err != nil {
		return t.error("bad escape sequence")
	}
	return t.tok('q', word, t.q[pos+1:])
}
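
// For example, the input `"a b" rest` yields a 'q' token whose Tok is
// `a b`, leaving the tokenizer positioned at ` rest`.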

// bareWord scans an unquoted word, which may also be one of the
// keywords AND and OR.
func (t *tokenizer) bareWord() (tok, tokenizer) {
	// Consume until a space or operator. We only take "-"
	// as an operator immediately following a space or another
	// operator so things like "foo-bar" work as expected.
	end := len(t.q)
	for i, r := range t.q {
		if unicode.IsSpace(r) || isOp(r) {
			end = i
			break
		}
	}
	word := t.q[:end]
	if word == "AND" {
		return t.tok('A', word, t.q[end:])
	} else if word == "OR" {
		return t.tok('O', word, t.q[end:])
	}
	return t.tok('w', word, t.q[end:])
}
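
// For example, `foo-bar` scans as a single word because "-" in the
// middle of a word is not an operator, while `-foo` scans as the
// operator '-' followed by the word "foo".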

// quoteWord returns a string that tokenizes as the word s.
func quoteWord(s string) string {
	if len(s) == 0 {
		return `""`
	}
	for i, r := range s {
		switch r {
		case '"', ' ', '\a', '\b':
			return strconv.Quote(s)
		}
		if isOp(r) || unicode.IsSpace(r) || (i == 0 && (r == '-' || r == '*')) {
			return strconv.Quote(s)
		}
	}
	// No quoting necessary.
	return s
}
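
// For example, quoteWord("abc") and quoteWord("foo-bar") return their
// arguments unchanged, while quoteWord("a b") returns `"a b"` and
// quoteWord("-x") returns `"-x"`, since "-" is an operator only at
// the start of a word.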

// regexp scans a "/"-delimited regular expression and compiles it.
func (t *tokenizer) regexp() (tok, tokenizer) {
	expr, rest, err := regexpParseUntil(t.q[1:], "/")
	if err == errNoDelim {
		return t.error("missing close \"/\"")
	} else if err != nil {
		return t.error(err.Error())
	}

	r, err := regexp.Compile(expr)
	if err != nil {
		return t.error(err.Error())
	}

	// To avoid confusion when "/" appears in the regexp itself,
	// we require space or an operator after the close "/".
	q2 := rest[1:]
	if !(q2 == "" || unicode.IsSpace(rune(q2[0])) || isStartOp(rune(q2[0]))) {
		t.q = q2
		return t.error("regexp must be followed by space or an operator (unescaped \"/\"?)")
	}

	tok, next := t.tok('r', expr, q2)
	tok.Regexp = r
	return tok, next
}
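
// For example, `/alloc/ ` scans as the regexp "alloc", while `/a/b/`
// is rejected because the first close "/" is followed by "b" rather
// than a space or an operator.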

var errNoDelim = errors.New("unterminated regexp")

// regexpParseUntil parses a regular expression from the beginning of str
// until the string delim appears at the top level of the expression.
// It returns the regular expression prefix of str and the remainder of str.
// If successful, rest will always begin with delim.
// If delim does not appear at the top level of str, it returns str, "", errNoDelim.
//
// TODO: There are corner cases this doesn't get right. Replace it
// with a standard library call if #44254 is implemented.
func regexpParseUntil(str, delim string) (expr, rest string, err error) {
	cs := 0 // number of unclosed '[' (character classes)
	cp := 0 // number of unclosed '(' (groups)
	for i := 0; i < len(str); {
		if cs == 0 && cp == 0 && strings.HasPrefix(str[i:], delim) {
			return str[:i], str[i:], nil
		}
		switch str[i] {
		case '[':
			cs++
		case ']':
			if cs--; cs < 0 { // An unmatched ']' is legal.
				cs = 0
			}
		case '(':
			if cs == 0 {
				cp++
			}
		case ')':
			if cs == 0 {
				cp--
			}
		case '\\':
			i++ // Skip the escaped byte.
		}
		i++
	}
	return str, "", errNoDelim
}
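
// For example (a sketch of its behavior):
//
//	expr, rest, err := regexpParseUntil("(a/b)/ x", "/")
//	// expr == "(a/b)", rest == "/ x", err == nil
//
// The first "/" is inside a parenthesized group, so only the second
// one terminates the expression.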