/*
Copyright 2012 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

/*
Package shlex implements a simple lexer which splits input into tokens using
shell-style rules for quoting and commenting.

The basic use case uses the default ASCII lexer to split a string into sub-strings:

  shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}

To process a stream of strings:

  l := NewLexer(os.Stdin)
  for {
      token, err := l.Next()
      if err != nil {
          break
      }
      // process token
  }

To access the raw token stream (which includes tokens for comments):

  t := NewTokenizer(os.Stdin)
  for {
      token, err := t.Next()
      if err != nil {
          break
      }
      // process token
  }

*/
package shlex

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

// TokenType is a top-level token classification: A word, space, comment, unknown.
type TokenType int

// runeTokenClass is the type of a UTF-8 character classification: A quote, space, escape.
type runeTokenClass int

// the internal state used by the lexer state machine
type lexerState int

// Token is a (type, value) pair representing a lexicographical token.
type Token struct {
	tokenType TokenType
	value     string
}

// Equal reports whether tokens a, and b, are equal.
// Two tokens are equal if both their types and values are equal. A nil token can
// never be equal to another token.
func (a *Token) Equal(b *Token) bool {
	if a == nil || b == nil {
		return false
	}
	if a.tokenType != b.tokenType {
		return false
	}
	return a.value == b.value
}

// Named classes of UTF-8 runes
const (
	spaceRunes            = " \t\r\n"
	escapingQuoteRunes    = `"`
	nonEscapingQuoteRunes = "'"
	escapeRunes           = `\`
	commentRunes          = "#"
)

// Classes of rune token
const (
	unknownRuneClass runeTokenClass = iota
	spaceRuneClass
	escapingQuoteRuneClass
	nonEscapingQuoteRuneClass
	escapeRuneClass
	commentRuneClass
	eofRuneClass
)

// Classes of lexicographic token
const (
	UnknownToken TokenType = iota
	WordToken
	SpaceToken
	CommentToken
)

// Lexer state machine states
const (
	startState           lexerState = iota // no runes have been seen
	inWordState                            // processing regular runes in a word
	escapingState                          // we have just consumed an escape rune; the next rune is literal
	escapingQuotedState                    // we have just consumed an escape rune within a quoted string
	quotingEscapingState                   // we are within a quoted string that supports escaping ("...")
	quotingState                           // we are within a string that does not support escaping ('...')
	commentState                           // we are within a comment (everything following an unquoted or unescaped #)
)

// tokenClassifier is used for classifying rune characters.
type tokenClassifier map[rune]runeTokenClass

// addRuneClass registers every rune in runes under the given token class.
func (typeMap tokenClassifier) addRuneClass(runes string, tokenType runeTokenClass) {
	for _, runeChar := range runes {
		typeMap[runeChar] = tokenType
	}
}

// newDefaultClassifier creates a new classifier for ASCII characters.
126 func newDefaultClassifier() tokenClassifier { 127 t := tokenClassifier{} 128 t.addRuneClass(spaceRunes, spaceRuneClass) 129 t.addRuneClass(escapingQuoteRunes, escapingQuoteRuneClass) 130 t.addRuneClass(nonEscapingQuoteRunes, nonEscapingQuoteRuneClass) 131 t.addRuneClass(escapeRunes, escapeRuneClass) 132 t.addRuneClass(commentRunes, commentRuneClass) 133 return t 134 } 135 136 // ClassifyRune classifiees a rune 137 func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass { 138 return t[runeVal] 139 } 140 141 // Lexer turns an input stream into a sequence of tokens. Whitespace and comments are skipped. 142 type Lexer Tokenizer 143 144 // NewLexer creates a new lexer from an input stream. 145 func NewLexer(r io.Reader) *Lexer { 146 147 return (*Lexer)(NewTokenizer(r)) 148 } 149 150 // Next returns the next word, or an error. If there are no more words, 151 // the error will be io.EOF. 152 func (l *Lexer) Next() (string, error) { 153 for { 154 token, err := (*Tokenizer)(l).Next() 155 if err != nil { 156 return "", err 157 } 158 switch token.tokenType { 159 case WordToken: 160 return token.value, nil 161 case CommentToken: 162 // skip comments 163 default: 164 return "", fmt.Errorf("Unknown token type: %v", token.tokenType) 165 } 166 } 167 } 168 169 // Tokenizer turns an input stream into a sequence of typed tokens 170 type Tokenizer struct { 171 input bufio.Reader 172 classifier tokenClassifier 173 } 174 175 // NewTokenizer creates a new tokenizer from an input stream. 176 func NewTokenizer(r io.Reader) *Tokenizer { 177 input := bufio.NewReader(r) 178 classifier := newDefaultClassifier() 179 return &Tokenizer{ 180 input: *input, 181 classifier: classifier} 182 } 183 184 // scanStream scans the stream for the next token using the internal state machine. 185 // It will panic if it encounters a rune which it does not know how to handle. 
186 func (t *Tokenizer) scanStream() (*Token, error) { 187 state := startState 188 var tokenType TokenType 189 var value []rune 190 var nextRune rune 191 var nextRuneType runeTokenClass 192 var err error 193 194 for { 195 nextRune, _, err = t.input.ReadRune() 196 nextRuneType = t.classifier.ClassifyRune(nextRune) 197 198 if err == io.EOF { 199 nextRuneType = eofRuneClass 200 err = nil 201 } else if err != nil { 202 return nil, err 203 } 204 205 switch state { 206 case startState: // no runes read yet 207 { 208 switch nextRuneType { 209 case eofRuneClass: 210 { 211 return nil, io.EOF 212 } 213 case spaceRuneClass: 214 { 215 } 216 case escapingQuoteRuneClass: 217 { 218 tokenType = WordToken 219 state = quotingEscapingState 220 } 221 case nonEscapingQuoteRuneClass: 222 { 223 tokenType = WordToken 224 state = quotingState 225 } 226 case escapeRuneClass: 227 { 228 tokenType = WordToken 229 state = escapingState 230 } 231 case commentRuneClass: 232 { 233 tokenType = CommentToken 234 state = commentState 235 } 236 default: 237 { 238 tokenType = WordToken 239 value = append(value, nextRune) 240 state = inWordState 241 } 242 } 243 } 244 case inWordState: // in a regular word 245 { 246 switch nextRuneType { 247 case eofRuneClass: 248 { 249 token := &Token{ 250 tokenType: tokenType, 251 value: string(value)} 252 return token, err 253 } 254 case spaceRuneClass: 255 { 256 t.input.UnreadRune() 257 token := &Token{ 258 tokenType: tokenType, 259 value: string(value)} 260 return token, err 261 } 262 case escapingQuoteRuneClass: 263 { 264 state = quotingEscapingState 265 } 266 case nonEscapingQuoteRuneClass: 267 { 268 state = quotingState 269 } 270 case escapeRuneClass: 271 { 272 state = escapingState 273 } 274 default: 275 { 276 value = append(value, nextRune) 277 } 278 } 279 } 280 case escapingState: // the rune after an escape character 281 { 282 switch nextRuneType { 283 case eofRuneClass: 284 { 285 err = fmt.Errorf("EOF found after escape character") 286 token := &Token{ 287 
tokenType: tokenType, 288 value: string(value)} 289 return token, err 290 } 291 default: 292 { 293 state = inWordState 294 value = append(value, nextRune) 295 } 296 } 297 } 298 case escapingQuotedState: // the next rune after an escape character, in double quotes 299 { 300 switch nextRuneType { 301 case eofRuneClass: 302 { 303 err = fmt.Errorf("EOF found after escape character") 304 token := &Token{ 305 tokenType: tokenType, 306 value: string(value)} 307 return token, err 308 } 309 default: 310 { 311 state = quotingEscapingState 312 value = append(value, nextRune) 313 } 314 } 315 } 316 case quotingEscapingState: // in escaping double quotes 317 { 318 switch nextRuneType { 319 case eofRuneClass: 320 { 321 err = fmt.Errorf("EOF found when expecting closing quote") 322 token := &Token{ 323 tokenType: tokenType, 324 value: string(value)} 325 return token, err 326 } 327 case escapingQuoteRuneClass: 328 { 329 state = inWordState 330 } 331 case escapeRuneClass: 332 { 333 state = escapingQuotedState 334 } 335 default: 336 { 337 value = append(value, nextRune) 338 } 339 } 340 } 341 case quotingState: // in non-escaping single quotes 342 { 343 switch nextRuneType { 344 case eofRuneClass: 345 { 346 err = fmt.Errorf("EOF found when expecting closing quote") 347 token := &Token{ 348 tokenType: tokenType, 349 value: string(value)} 350 return token, err 351 } 352 case nonEscapingQuoteRuneClass: 353 { 354 state = inWordState 355 } 356 default: 357 { 358 value = append(value, nextRune) 359 } 360 } 361 } 362 case commentState: // in a comment 363 { 364 switch nextRuneType { 365 case eofRuneClass: 366 { 367 token := &Token{ 368 tokenType: tokenType, 369 value: string(value)} 370 return token, err 371 } 372 case spaceRuneClass: 373 { 374 if nextRune == '\n' { 375 token := &Token{ 376 tokenType: tokenType, 377 value: string(value)} 378 return token, err 379 } else { 380 value = append(value, nextRune) 381 } 382 } 383 default: 384 { 385 value = append(value, nextRune) 386 } 387 } 388 } 
389 default: 390 { 391 return nil, fmt.Errorf("Unexpected state: %v", state) 392 } 393 } 394 } 395 } 396 397 // Next returns the next token in the stream. 398 func (t *Tokenizer) Next() (*Token, error) { 399 return t.scanStream() 400 } 401 402 // Split partitions a string into a slice of strings. 403 func Split(s string) ([]string, error) { 404 l := NewLexer(strings.NewReader(s)) 405 subStrings := make([]string, 0) 406 for { 407 word, err := l.Next() 408 if err != nil { 409 if err == io.EOF { 410 return subStrings, nil 411 } 412 return subStrings, err 413 } 414 subStrings = append(subStrings, word) 415 } 416 }