gitee.com/mysnapcore/mysnapd@v0.1.0/strutil/shlex/shlex.go

/*
Copyright 2012 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

/*
Package shlex implements a simple lexer which splits input into tokens using
shell-style rules for quoting and commenting.

The basic use case uses the default ASCII lexer to split a string into sub-strings:

  shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}

To process a stream of strings:

  l := NewLexer(os.Stdin)
  for {
  	word, err := l.Next()
  	if err != nil {
  		break
  	}
  	// process word
  }

To access the raw token stream (which includes tokens for comments):

  t := NewTokenizer(os.Stdin)
  for {
  	token, err := t.Next()
  	if err != nil {
  		break
  	}
  	// process token
  }
*/
package shlex

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

// TokenType is a top-level token classification: a word, space, comment, or unknown.
type TokenType int

// runeTokenClass is the type of a UTF-8 character classification: a quote, space, escape.
type runeTokenClass int

// the internal state used by the lexer state machine
type lexerState int

// Token is a (type, value) pair representing a lexical token.
type Token struct {
	tokenType TokenType
	value     string
}

// Equal reports whether tokens a and b are equal.
// Two tokens are equal if both their types and values are equal. A nil token
// can never be equal to another token.
func (a *Token) Equal(b *Token) bool {
	if a == nil || b == nil {
		return false
	}
	if a.tokenType != b.tokenType {
		return false
	}
	return a.value == b.value
}

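// A few illustrative cases:
//
//	(&Token{WordToken, "x"}).Equal(&Token{WordToken, "x"})    // true
//	(&Token{WordToken, "x"}).Equal(&Token{CommentToken, "x"}) // false: types differ
//	(&Token{WordToken, "x"}).Equal(nil)                       // false: nil is never equal
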
// Named classes of UTF-8 runes
const (
	spaceRunes            = " \t\r\n"
	escapingQuoteRunes    = `"`
	nonEscapingQuoteRunes = "'"
	escapeRunes           = `\`
	commentRunes          = "#"
)

// Classes of rune token
const (
	//nolint:deadcode
	unknownRuneClass runeTokenClass = iota
	spaceRuneClass
	escapingQuoteRuneClass
	nonEscapingQuoteRuneClass
	escapeRuneClass
	commentRuneClass
	eofRuneClass
)

// Classes of lexical token
const (
	UnknownToken TokenType = iota
	WordToken
	SpaceToken
	CommentToken
)

// Lexer state machine states
const (
	startState           lexerState = iota // no runes have been seen
	inWordState                            // processing regular runes in a word
	escapingState                          // we have just consumed an escape rune; the next rune is literal
	escapingQuotedState                    // we have just consumed an escape rune within a quoted string
	quotingEscapingState                   // we are within a quoted string that supports escaping ("...")
	quotingState                           // we are within a string that does not support escaping ('...')
	commentState                           // we are within a comment (everything following an unquoted or unescaped #)
)

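// A rough transition sketch (derived from scanStream below, which is
// authoritative):
//
//	startState:           '"' -> quotingEscapingState, '\'' -> quotingState,
//	                      '\\' -> escapingState, '#' -> commentState,
//	                      other -> inWordState
//	quotingEscapingState: '"' -> inWordState, '\\' -> escapingQuotedState
//	quotingState:         '\'' -> inWordState
//	escapingState:        any -> inWordState
//	escapingQuotedState:  any -> quotingEscapingState
//
// A space in inWordState ends the current token; a newline in commentState
// ends the comment token.
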
// tokenClassifier is used for classifying rune characters.
type tokenClassifier map[rune]runeTokenClass

// addRuneClass registers every rune in the given string as belonging to the
// given token class.
func (typeMap tokenClassifier) addRuneClass(runes string, tokenType runeTokenClass) {
	for _, runeChar := range runes {
		typeMap[runeChar] = tokenType
	}
}

// newDefaultClassifier creates a new classifier for ASCII characters.
func newDefaultClassifier() tokenClassifier {
	t := tokenClassifier{}
	t.addRuneClass(spaceRunes, spaceRuneClass)
	t.addRuneClass(escapingQuoteRunes, escapingQuoteRuneClass)
	t.addRuneClass(nonEscapingQuoteRunes, nonEscapingQuoteRuneClass)
	t.addRuneClass(escapeRunes, escapeRuneClass)
	t.addRuneClass(commentRunes, commentRuneClass)
	return t
}

// ClassifyRune classifies a rune. Runes with no registered class map to the
// zero value, unknownRuneClass.
func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass {
	return t[runeVal]
}

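// For example, with the default classifier:
//
//	c := newDefaultClassifier()
//	c.ClassifyRune('"')  // escapingQuoteRuneClass
//	c.ClassifyRune('\t') // spaceRuneClass
//	c.ClassifyRune('a')  // unknownRuneClass (no entry in the map)
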
// Lexer turns an input stream into a sequence of tokens. Whitespace and comments are skipped.
type Lexer Tokenizer

// NewLexer creates a new lexer from an input stream.
func NewLexer(r io.Reader) *Lexer {
	return (*Lexer)(NewTokenizer(r))
}

// Next returns the next word, or an error. If there are no more words,
// the error will be io.EOF.
func (l *Lexer) Next() (string, error) {
	for {
		token, err := (*Tokenizer)(l).Next()
		if err != nil {
			return "", err
		}
		switch token.tokenType {
		case WordToken:
			return token.value, nil
		case CommentToken:
			// skip comments
		default:
			return "", fmt.Errorf("unknown token type: %v", token.tokenType)
		}
	}
}

// Tokenizer turns an input stream into a sequence of typed tokens.
type Tokenizer struct {
	input      bufio.Reader
	classifier tokenClassifier
}

// NewTokenizer creates a new tokenizer from an input stream.
func NewTokenizer(r io.Reader) *Tokenizer {
	input := bufio.NewReader(r)
	classifier := newDefaultClassifier()
	return &Tokenizer{
		input:      *input,
		classifier: classifier}
}

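// For example (an illustrative sketch; note that comments appear in the raw
// token stream):
//
//	t := NewTokenizer(strings.NewReader("ls # list"))
//	// t.Next() yields &Token{WordToken, "ls"},
//	// then &Token{CommentToken, " list"}, then the error io.EOF.
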
// scanStream scans the stream for the next token using the internal state
// machine. It returns an error if it reaches a state which it does not know
// how to handle.
func (t *Tokenizer) scanStream() (*Token, error) {
	state := startState
	var tokenType TokenType
	var value []rune
	var nextRune rune
	var nextRuneType runeTokenClass
	var err error

	for {
		nextRune, _, err = t.input.ReadRune()
		nextRuneType = t.classifier.ClassifyRune(nextRune)

		if err == io.EOF {
			nextRuneType = eofRuneClass
			err = nil
		} else if err != nil {
			return nil, err
		}

		switch state {
		case startState: // no runes read yet
			switch nextRuneType {
			case eofRuneClass:
				return nil, io.EOF
			case spaceRuneClass:
				// skip leading whitespace
			case escapingQuoteRuneClass:
				tokenType = WordToken
				state = quotingEscapingState
			case nonEscapingQuoteRuneClass:
				tokenType = WordToken
				state = quotingState
			case escapeRuneClass:
				tokenType = WordToken
				state = escapingState
			case commentRuneClass:
				tokenType = CommentToken
				state = commentState
			default:
				tokenType = WordToken
				value = append(value, nextRune)
				state = inWordState
			}
		case inWordState: // in a regular word
			switch nextRuneType {
			case eofRuneClass:
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case spaceRuneClass:
				// the space ends the word; leave it for the next scan
				t.input.UnreadRune()
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case escapingQuoteRuneClass:
				state = quotingEscapingState
			case nonEscapingQuoteRuneClass:
				state = quotingState
			case escapeRuneClass:
				state = escapingState
			default:
				value = append(value, nextRune)
			}
		case escapingState: // the rune after an escape character
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found after escape character")
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			default:
				state = inWordState
				value = append(value, nextRune)
			}
		case escapingQuotedState: // the rune after an escape character, in double quotes
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found after escape character")
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			default:
				state = quotingEscapingState
				value = append(value, nextRune)
			}
		case quotingEscapingState: // in escaping double quotes
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found when expecting closing quote")
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case escapingQuoteRuneClass:
				state = inWordState
			case escapeRuneClass:
				state = escapingQuotedState
			default:
				value = append(value, nextRune)
			}
		case quotingState: // in non-escaping single quotes
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found when expecting closing quote")
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case nonEscapingQuoteRuneClass:
				state = inWordState
			default:
				value = append(value, nextRune)
			}
		case commentState: // in a comment
			switch nextRuneType {
			case eofRuneClass:
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case spaceRuneClass:
				if nextRune == '\n' {
					// a newline ends the comment
					token := &Token{
						tokenType: tokenType,
						value:     string(value)}
					return token, err
				}
				value = append(value, nextRune)
			default:
				value = append(value, nextRune)
			}
		default:
			return nil, fmt.Errorf("unexpected state: %v", state)
		}
	}
}

// Next returns the next token in the stream.
func (t *Tokenizer) Next() (*Token, error) {
	return t.scanStream()
}

// Split partitions a string into a slice of strings using shell-style rules
// for quoting and commenting.
func Split(s string) ([]string, error) {
	l := NewLexer(strings.NewReader(s))
	subStrings := make([]string, 0)
	for {
		word, err := l.Next()
		if err != nil {
			if err == io.EOF {
				return subStrings, nil
			}
			return subStrings, err
		}
		subStrings = append(subStrings, word)
	}
}
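
// For example (an illustrative sketch of the rules implemented above):
//
//	words, err := Split(`cp "src dir"/file 'dest dir' # trailing comment`)
//	// err == nil
//	// words == []string{"cp", "src dir/file", "dest dir"}
//
//	words, err = Split(`one\ word`)
//	// err == nil
//	// words == []string{"one word"} (the escaped space does not split the word)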