github.com/bugraaydogar/snapd@v0.0.0-20210315170335-8c70bb858939/strutil/shlex/shlex.go

/*
Copyright 2012 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

/*
Package shlex implements a simple lexer which splits input into tokens using
shell-style rules for quoting and commenting.

The basic use case uses the default ASCII lexer to split a string into sub-strings:

  shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}

To process a stream of strings:

  l := NewLexer(os.Stdin)
  for {
  	token, err := l.Next()
  	if err != nil {
  		break // Next returns io.EOF once the input is exhausted
  	}
  	// process token
  }

To access the raw token stream (which includes tokens for comments):

  t := NewTokenizer(os.Stdin)
  for {
  	token, err := t.Next()
  	if err != nil {
  		break // Next returns io.EOF once the input is exhausted
  	}
  	// process token
  }
*/
package shlex

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

// TokenType is a top-level token classification: a word, space, comment, or unknown.
type TokenType int

// runeTokenClass is the type of a UTF-8 character classification: a quote, space, escape.
type runeTokenClass int

// the internal state used by the lexer state machine
type lexerState int

// Token is a (type, value) pair representing a lexical token.
type Token struct {
	tokenType TokenType
	value     string
}

// Equal reports whether tokens a and b are equal.
// Two tokens are equal if both their types and values are equal. A nil token can
// never be equal to another token.
func (a *Token) Equal(b *Token) bool {
	if a == nil || b == nil {
		return false
	}
	if a.tokenType != b.tokenType {
		return false
	}
	return a.value == b.value
}
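
// For illustration only: two tokens compare equal when both fields match,
// and a nil token never compares equal to anything:
//
//	(&Token{WordToken, "a"}).Equal(&Token{WordToken, "a"})  // true
//	(&Token{WordToken, "a"}).Equal(&Token{SpaceToken, "a"}) // false
//	(*Token)(nil).Equal(&Token{WordToken, "a"})             // false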

// Named classes of UTF-8 runes
const (
	spaceRunes            = " \t\r\n"
	escapingQuoteRunes    = `"`
	nonEscapingQuoteRunes = "'"
	escapeRunes           = `\`
	commentRunes          = "#"
)

// Classes of rune token
const (
	unknownRuneClass runeTokenClass = iota
	spaceRuneClass
	escapingQuoteRuneClass
	nonEscapingQuoteRuneClass
	escapeRuneClass
	commentRuneClass
	eofRuneClass
)

// Classes of lexical token
const (
	UnknownToken TokenType = iota
	WordToken
	SpaceToken
	CommentToken
)

// Lexer state machine states
const (
	startState           lexerState = iota // no runes have been seen
	inWordState                            // processing regular runes in a word
	escapingState                          // we have just consumed an escape rune; the next rune is literal
	escapingQuotedState                    // we have just consumed an escape rune within a quoted string
	quotingEscapingState                   // we are within a quoted string that supports escaping ("...")
	quotingState                           // we are within a string that does not support escaping ('...')
	commentState                           // we are within a comment (everything following an unquoted, unescaped #)
)
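
// For illustration, tokenizing `say "hi there" # greet` proceeds as follows:
// `say` is consumed in inWordState and terminated by the space; the quoted
// `"hi there"` is consumed in quotingEscapingState and returns to inWordState
// at the closing quote; the unquoted `#` then enters commentState, which runs
// to the end of the line.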

// tokenClassifier is used for classifying rune characters.
type tokenClassifier map[rune]runeTokenClass

func (typeMap tokenClassifier) addRuneClass(runes string, tokenType runeTokenClass) {
	for _, runeChar := range runes {
		typeMap[runeChar] = tokenType
	}
}

// newDefaultClassifier creates a new classifier for ASCII characters.
func newDefaultClassifier() tokenClassifier {
	t := tokenClassifier{}
	t.addRuneClass(spaceRunes, spaceRuneClass)
	t.addRuneClass(escapingQuoteRunes, escapingQuoteRuneClass)
	t.addRuneClass(nonEscapingQuoteRunes, nonEscapingQuoteRuneClass)
	t.addRuneClass(escapeRunes, escapeRuneClass)
	t.addRuneClass(commentRunes, commentRuneClass)
	return t
}

// ClassifyRune classifies a rune.
func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass {
	return t[runeVal]
}
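
// Note that runes absent from the map fall back to the zero value of
// runeTokenClass, which is unknownRuneClass; ordinary word characters are
// therefore "unknown" and handled by the default branches of the state
// machine.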

// Lexer turns an input stream into a sequence of tokens. Whitespace and comments are skipped.
type Lexer Tokenizer

// NewLexer creates a new lexer from an input stream.
func NewLexer(r io.Reader) *Lexer {
	return (*Lexer)(NewTokenizer(r))
}

// Next returns the next word, or an error. If there are no more words,
// the error will be io.EOF.
func (l *Lexer) Next() (string, error) {
	for {
		token, err := (*Tokenizer)(l).Next()
		if err != nil {
			return "", err
		}
		switch token.tokenType {
		case WordToken:
			return token.value, nil
		case CommentToken:
			// skip comments
		default:
			return "", fmt.Errorf("unknown token type: %v", token.tokenType)
		}
	}
}
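
// For illustration, given the input `a # trailing comment`, successive calls
// to Next return "a" and then io.EOF: the comment token is consumed and
// skipped rather than surfaced to the caller.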

// Tokenizer turns an input stream into a sequence of typed tokens.
type Tokenizer struct {
	input      *bufio.Reader
	classifier tokenClassifier
}

// NewTokenizer creates a new tokenizer from an input stream.
func NewTokenizer(r io.Reader) *Tokenizer {
	return &Tokenizer{
		input:      bufio.NewReader(r),
		classifier: newDefaultClassifier()}
}
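
// For illustration, a Tokenizer over the input `ls # list` yields a WordToken
// with value "ls" followed by a CommentToken with value " list" (everything
// after the unquoted #, excluding the # itself).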

// scanStream scans the stream for the next token using the internal state machine.
// It returns an error if the state machine reaches an unexpected state.
func (t *Tokenizer) scanStream() (*Token, error) {
	state := startState
	var tokenType TokenType
	var value []rune
	var nextRune rune
	var nextRuneType runeTokenClass
	var err error

	for {
		nextRune, _, err = t.input.ReadRune()
		nextRuneType = t.classifier.ClassifyRune(nextRune)

		if err == io.EOF {
			nextRuneType = eofRuneClass
			err = nil
		} else if err != nil {
			return nil, err
		}

		switch state {
		case startState: // no runes read yet
			switch nextRuneType {
			case eofRuneClass:
				return nil, io.EOF
			case spaceRuneClass:
				// skip leading whitespace
			case escapingQuoteRuneClass:
				tokenType = WordToken
				state = quotingEscapingState
			case nonEscapingQuoteRuneClass:
				tokenType = WordToken
				state = quotingState
			case escapeRuneClass:
				tokenType = WordToken
				state = escapingState
			case commentRuneClass:
				tokenType = CommentToken
				state = commentState
			default:
				tokenType = WordToken
				value = append(value, nextRune)
				state = inWordState
			}
		case inWordState: // in a regular word
			switch nextRuneType {
			case eofRuneClass:
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case spaceRuneClass:
				t.input.UnreadRune()
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case escapingQuoteRuneClass:
				state = quotingEscapingState
			case nonEscapingQuoteRuneClass:
				state = quotingState
			case escapeRuneClass:
				state = escapingState
			default:
				value = append(value, nextRune)
			}
		case escapingState: // the rune after an escape character
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found after escape character")
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			default:
				state = inWordState
				value = append(value, nextRune)
			}
		case escapingQuotedState: // the next rune after an escape character, in double quotes
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found after escape character")
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			default:
				state = quotingEscapingState
				value = append(value, nextRune)
			}
		case quotingEscapingState: // in escaping double quotes
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found when expecting closing quote")
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case escapingQuoteRuneClass:
				state = inWordState
			case escapeRuneClass:
				state = escapingQuotedState
			default:
				value = append(value, nextRune)
			}
		case quotingState: // in non-escaping single quotes
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found when expecting closing quote")
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case nonEscapingQuoteRuneClass:
				state = inWordState
			default:
				value = append(value, nextRune)
			}
		case commentState: // in a comment
			switch nextRuneType {
			case eofRuneClass:
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case spaceRuneClass:
				if nextRune == '\n' {
					token := &Token{
						tokenType: tokenType,
						value:     string(value)}
					return token, err
				}
				value = append(value, nextRune)
			default:
				value = append(value, nextRune)
			}
		default:
			return nil, fmt.Errorf("unexpected state: %v", state)
		}
	}
}

// Next returns the next token in the stream.
func (t *Tokenizer) Next() (*Token, error) {
	return t.scanStream()
}

// Split splits a string into a slice of words, applying shell-style
// quoting, escaping, and commenting rules.
func Split(s string) ([]string, error) {
	l := NewLexer(strings.NewReader(s))
	subStrings := make([]string, 0)
	for {
		word, err := l.Next()
		if err != nil {
			if err == io.EOF {
				return subStrings, nil
			}
			return subStrings, err
		}
		subStrings = append(subStrings, word)
	}
}
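
// For illustration, Split applies the quoting and comment rules end to end:
//
//	words, err := shlex.Split(`cp "my file" /tmp # backup`)
//	// words == []string{"cp", "my file", "/tmp"}, err == nil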