github.com/u-root/u-root@v7.0.1-0.20200915234505-ad7babab0a8e+incompatible/pkg/pogosh/lexer.go (about)

     1  // Copyright 2020 the u-root Authors. All rights reserved
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package pogosh
     6  
     7  import (
     8  	"fmt"
     9  	"strings"
    10  )
    11  
// token is a single lexical unit produced by tokenize: the literal
// text of the token plus its classification.
type token struct {
	value string    // TODO: make []byte
	ttype tokenType // classification; one of the tt* constants
}
    16  
    17  type tokenType uint8
    18  
    19  // Token types
    20  const (
    21  	ttError = iota // TODO: is this used?
    22  	ttEOF
    23  	ttWord
    24  	ttAssignmentWord
    25  	ttName
    26  	ttNewLine
    27  	ttIONumber
    28  	ttAndIf     // &&
    29  	ttOrIf      // ||
    30  	ttDSemi     // ;;
    31  	ttDLess     // <<
    32  	ttDGreat    // >>
    33  	ttLessAnd   // <&
    34  	ttGreatAnd  // >&
    35  	ttLessGreat // <>
    36  	ttDLessDash // <<-
    37  	ttClobber   // >|
    38  	ttIf        // if
    39  	ttThen      // then
    40  	ttElse      // else
    41  	ttElif      // elif
    42  	ttFi        // fi
    43  	ttDo        // do
    44  	ttDone      // done
    45  	ttCase      // case
    46  	ttEsac      // esac
    47  	ttWhile     // while
    48  	ttUntil     // until
    49  	ttFor       // for
    50  	ttLBrace    // {
    51  	ttRBrace    // }
    52  	ttBang      // !
    53  	ttIn        // in
    54  )
    55  
    56  var operators = map[string]tokenType{
    57  	"&&":  ttAndIf,
    58  	"||":  ttOrIf,
    59  	";;":  ttDSemi,
    60  	"<<":  ttDLess,
    61  	">>":  ttDGreat,
    62  	"<&":  ttLessAnd,
    63  	">&":  ttGreatAnd,
    64  	"<>":  ttLessGreat,
    65  	"<<-": ttDLessDash,
    66  	">|":  ttClobber,
    67  }
    68  
    69  var reservedWords = map[string]tokenType{
    70  	"if":    ttIf,
    71  	"then":  ttThen,
    72  	"else":  ttElse,
    73  	"elif":  ttElif,
    74  	"fi":    ttFi,
    75  	"do":    ttDo,
    76  	"done":  ttDone,
    77  	"case":  ttCase,
    78  	"esac":  ttEsac,
    79  	"while": ttWhile,
    80  	"until": ttUntil,
    81  	"for":   ttFor,
    82  	"{":     ttLBrace,
    83  	"}":     ttRBrace,
    84  	"!":     ttBang,
    85  	"in":    ttIn,
    86  }
    87  
    88  var portableCharSet = "\x00\a\b\t\n\v\f\r !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxy{|}~"
    89  
    90  // tokenize splits the input into an array of tokens.
    91  // TODO: memoize?
    92  func tokenize(script string) ([]token, error) {
    93  	ts := []token{}
    94  	b := 0 // Beginning of current token
    95  	i := 0 // Index of current character
    96  
    97  	// Tokenizer states
    98  	const (
    99  		sStart = iota
   100  		sEscape
   101  		sOperator
   102  		sWord
   103  		sWordEscape
   104  		sSingleQuote
   105  		sDoubleQuote
   106  		sDoubleQuoteEscape
   107  		sLineComment
   108  		sLineCommentEscape
   109  	)
   110  	state := sStart
   111  
   112  	// Iterate over each character + an imaginary blank character.
   113  	for {
   114  		// Current character being processed
   115  		var c byte
   116  
   117  		// Check for EOF
   118  		if i == len(script) {
   119  			switch state {
   120  			case sStart, sOperator, sWord, sLineComment:
   121  				// Use an imaginary blank character to delimit the last token.
   122  				c = ' '
   123  			default:
   124  				return ts, fmt.Errorf("INCOMPLETE") // TODO
   125  			}
   126  		} else {
   127  			c = script[i]
   128  		}
   129  
   130  		// The scanner is implemented with a DFA:
   131  		// * outer switch -- selects state
   132  		// * inner switch -- selects transition
   133  		switch state {
   134  
   135  		// Start state
   136  		case sStart:
   137  			b = i
   138  			// TODO: \r
   139  			switch c {
   140  			default:
   141  				state = sWord
   142  			case ' ', '\t':
   143  				state = sStart
   144  			case '\n':
   145  				ts = append(ts, token{script[i : i+1], ttNewLine})
   146  			case '\\':
   147  				state = sEscape
   148  			case '\'':
   149  				state = sSingleQuote
   150  			case '"':
   151  				state = sDoubleQuote
   152  			case '#':
   153  				state = sLineComment
   154  			case '&', '|', ';', '<', '>':
   155  				state = sOperator
   156  			}
   157  
   158  		// Escape
   159  		case sEscape:
   160  			switch c {
   161  			case '\n':
   162  				state = sStart
   163  			default:
   164  				state = sWord
   165  			}
   166  
   167  		// Words
   168  		case sWord:
   169  			switch c {
   170  			case ' ', '\t', '\n', '#', '&', '|', ';', '<', '>':
   171  				// The token may contain a line escape. This is cleaned up
   172  				// during variable expansion.
   173  				ts = append(ts, token{script[b:i], ttWord})
   174  				state = sStart
   175  				i--
   176  			case '\\':
   177  				state = sWordEscape
   178  			}
   179  		case sWordEscape:
   180  			state = sWord
   181  
   182  		// Single quotes
   183  		case sSingleQuote:
   184  			// This optimization iterates quicker.
   185  			for script[i] != '\'' {
   186  				i++
   187  			}
   188  			state = sWord
   189  
   190  		// Double quotes
   191  		case sDoubleQuote:
   192  			switch c {
   193  			case '"':
   194  				state = sWord
   195  			case '\\':
   196  				state = sDoubleQuoteEscape
   197  			}
   198  		case sDoubleQuoteEscape:
   199  			state = sDoubleQuote
   200  
   201  		// Line comment
   202  		case sLineComment:
   203  			switch c {
   204  			case '\n':
   205  				ts = append(ts, token{script[i : i+1], ttNewLine})
   206  				state = sStart
   207  			case '\\':
   208  				state = sLineCommentEscape
   209  			}
   210  		case sLineCommentEscape:
   211  			state = sLineComment
   212  
   213  		// Operators
   214  		case sOperator:
   215  			switch c {
   216  			case '&', '|', ';', '<', '>', '-':
   217  				_, ok := operators[script[b:i+1]]
   218  				if ok {
   219  					break
   220  				}
   221  				fallthrough
   222  			default:
   223  				word := strings.ReplaceAll(script[b:i], "\\\n", "")
   224  				op, ok := operators[word]
   225  				if ok {
   226  					ts = append(ts, token{word, op})
   227  				} else {
   228  					ts = append(ts, token{word, ttWord})
   229  				}
   230  				state = sStart
   231  				i--
   232  			}
   233  		}
   234  
   235  		if i == len(script) {
   236  			break
   237  		}
   238  		i++
   239  	}
   240  
   241  	return ts, nil
   242  }