github.com/jgbaldwinbrown/perf@v0.1.1/benchproc/internal/parse/tok.go (about) 1 // Copyright 2022 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package parse 6 7 import ( 8 "errors" 9 "fmt" 10 "regexp" 11 "strconv" 12 "strings" 13 "unicode" 14 "unicode/utf8" 15 ) 16 17 // A SyntaxError is an error produced by parsing a malformed expression. 18 type SyntaxError struct { 19 Query string // The original query string 20 Off int // Byte offset of the error in Query 21 Msg string // Error message 22 } 23 24 func (e *SyntaxError) Error() string { 25 // Translate byte offset to a rune offset. 26 pos := 0 27 for i, r := range e.Query { 28 if i >= e.Off { 29 break 30 } 31 if unicode.IsGraphic(r) { 32 pos++ 33 } 34 } 35 return fmt.Sprintf("syntax error: %s\n\t%s\n\t%*s^", e.Msg, e.Query, pos, "") 36 } 37 38 type errorTracker struct { 39 qOrig string 40 err *SyntaxError 41 } 42 43 func (t *errorTracker) error(q string, msg string) { 44 off := len(t.qOrig) - len(q) 45 if t.err == nil { 46 t.err = &SyntaxError{t.qOrig, off, msg} 47 } 48 } 49 50 // A tok is a single token in the filter/projection lexical syntax. 51 type tok struct { 52 // Kind specifies the category of this token. It is either 'w' 53 // or 'q' for an unquoted or quoted word, respectively, 'r' 54 // for a regexp, an operator character, or 0 for the 55 // end-of-string token. 56 Kind byte 57 Off int // Byte offset of the beginning of this token 58 Tok string // Literal token contents; quoted words are unescaped 59 Regexp *regexp.Regexp 60 } 61 62 type tokenizer struct { 63 q string 64 errt *errorTracker 65 } 66 67 func newTokenizer(q string) tokenizer { 68 return tokenizer{q, &errorTracker{q, nil}} 69 } 70 71 func isOp(ch rune) bool { 72 return ch == '(' || ch == ')' || ch == ':' || ch == '@' || ch == ',' 73 } 74 75 // At the beginning of a word, we accept "-" and "*" as operators, 76 // but in the middle of words we treat them as part of the word. 77 func isStartOp(ch rune) bool { 78 return isOp(ch) || ch == '-' || ch == '*' 79 } 80 81 func isSpace(q string) int { 82 if q[0] == ' ' { 83 return 1 84 } 85 r, size := utf8.DecodeRuneInString(q) 86 if unicode.IsSpace(r) { 87 return size 88 } 89 return 0 90 } 91 92 // keyOrOp returns the next key or operator token. 93 // A key may be a bare word or a quoted word. 94 func (t *tokenizer) keyOrOp() (tok, tokenizer) { 95 return t.next(false) 96 } 97 98 // valueOrOp returns the next value or operator token. 99 // A value may be a bare word, a quoted word, or a regexp. 100 func (t *tokenizer) valueOrOp() (tok, tokenizer) { 101 return t.next(true) 102 } 103 104 // end asserts that t has reached the end of the token stream. If it 105 // has not, it returns a tokenizer the reports an error. 106 func (t *tokenizer) end() tokenizer { 107 if tok, _ := t.keyOrOp(); tok.Kind != 0 { 108 _, t2 := t.error("unexpected " + strconv.Quote(tok.Tok)) 109 return t2 110 } 111 return *t 112 } 113 114 func (t *tokenizer) next(allowRegexp bool) (tok, tokenizer) { 115 for len(t.q) > 0 { 116 if isStartOp(rune(t.q[0])) { 117 return t.tok(t.q[0], t.q[:1], t.q[1:]) 118 } else if n := isSpace(t.q); n > 0 { 119 t.q = t.q[n:] 120 } else if allowRegexp && t.q[0] == '/' { 121 return t.regexp() 122 } else if t.q[0] == '"' { 123 return t.quotedWord() 124 } else { 125 return t.bareWord() 126 } 127 } 128 // Add an EOF token. This eliminates the need for lots of 129 // bounds checks in the parser and gives the EOF a position. 130 return t.tok(0, "", "") 131 } 132 133 func (t *tokenizer) tok(kind byte, token string, rest string) (tok, tokenizer) { 134 off := len(t.errt.qOrig) - len(t.q) 135 return tok{kind, off, token, nil}, tokenizer{rest, t.errt} 136 } 137 138 func (t *tokenizer) error(msg string) (tok, tokenizer) { 139 t.errt.error(t.q, msg) 140 // Move to the end. 141 return t.tok(0, "", "") 142 } 143 144 func (t *tokenizer) quotedWord() (tok, tokenizer) { 145 pos := 1 // Skip initial " 146 for pos < len(t.q) && (t.q[pos] != '"' || t.q[pos-1] == '\\') { 147 pos++ 148 } 149 if pos == len(t.q) { 150 return t.error("missing end quote") 151 } 152 // Parse the quoted string. 153 word, err := strconv.Unquote(t.q[:pos+1]) 154 if err != nil { 155 return t.error("bad escape sequence") 156 } 157 return t.tok('q', word, t.q[pos+1:]) 158 } 159 160 func (t *tokenizer) bareWord() (tok, tokenizer) { 161 // Consume until a space or operator. We only take "-" 162 // as an operator immediately following another space 163 // or operator so things like "foo-bar" work as 164 // expected. 165 end := len(t.q) 166 for i, r := range t.q { 167 if unicode.IsSpace(r) || isOp(r) { 168 end = i 169 break 170 } 171 } 172 word := t.q[:end] 173 if word == "AND" { 174 return t.tok('A', word, t.q[end:]) 175 } else if word == "OR" { 176 return t.tok('O', word, t.q[end:]) 177 } 178 return t.tok('w', word, t.q[end:]) 179 } 180 181 // quoteWord returns a string that tokenizes as the word s. 182 func quoteWord(s string) string { 183 if len(s) == 0 { 184 return `""` 185 } 186 for i, r := range s { 187 switch r { 188 case '"', ' ', '\a', '\b': 189 return strconv.Quote(s) 190 } 191 if isOp(r) || unicode.IsSpace(r) || (i == 0 && (r == '-' || r == '*')) { 192 return strconv.Quote(s) 193 } 194 } 195 // No quoting necessary. 196 return s 197 } 198 199 func (t *tokenizer) regexp() (tok, tokenizer) { 200 expr, rest, err := regexpParseUntil(t.q[1:], "/") 201 if err == errNoDelim { 202 return t.error("missing close \"/\"") 203 } else if err != nil { 204 return t.error(err.Error()) 205 } 206 207 r, err := regexp.Compile(expr) 208 if err != nil { 209 return t.error(err.Error()) 210 } 211 212 // To avoid confusion when "/" appears in the regexp itself, 213 // we require space or an operator after the close "/". 214 q2 := rest[1:] 215 if !(q2 == "" || unicode.IsSpace(rune(q2[0])) || isStartOp(rune(q2[0]))) { 216 t.q = q2 217 return t.error("regexp must be followed by space or an operator (unescaped \"/\"?)") 218 } 219 220 tok, next := t.tok('r', expr, q2) 221 tok.Regexp = r 222 return tok, next 223 } 224 225 var errNoDelim = errors.New("unterminated regexp") 226 227 // regexpParseUntil parses a regular expression from the beginning of str 228 // until the string delim appears at the top level of the expression. 229 // It returns the regular expression prefix of str and the remainder of str. 230 // If successful, rest will always begin with delim. 231 // If delim does not appear at the top level of str, it returns str, "", errNoDelim. 232 // 233 // TODO: There are corner cases this doesn't get right. Replace it 234 // with a standard library call if #44254 is implemented. 235 func regexpParseUntil(str, delim string) (expr, rest string, err error) { 236 cs := 0 237 cp := 0 238 for i := 0; i < len(str); { 239 if cs == 0 && cp == 0 && strings.HasPrefix(str[i:], delim) { 240 return str[:i], str[i:], nil 241 } 242 switch str[i] { 243 case '[': 244 cs++ 245 case ']': 246 if cs--; cs < 0 { // An unmatched ']' is legal. 247 cs = 0 248 } 249 case '(': 250 if cs == 0 { 251 cp++ 252 } 253 case ')': 254 if cs == 0 { 255 cp-- 256 } 257 case '\\': 258 i++ 259 } 260 i++ 261 } 262 return str, "", errNoDelim 263 }