github.com/u-root/u-root@v7.0.1-0.20200915234505-ad7babab0a8e+incompatible/pkg/pogosh/lexer.go

// Copyright 2020 the u-root Authors. All rights reserved
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package pogosh

import (
	"fmt"
	"strings"
)

type token struct {
	value string // TODO: make []byte
	ttype tokenType
}

type tokenType uint8

// Token types
const (
	ttError = iota // TODO: is this used?
	ttEOF
	ttWord
	ttAssignmentWord
	ttName
	ttNewLine
	ttIONumber
	ttAndIf     // &&
	ttOrIf      // ||
	ttDSemi     // ;;
	ttDLess     // <<
	ttDGreat    // >>
	ttLessAnd   // <&
	ttGreatAnd  // >&
	ttLessGreat // <>
	ttDLessDash // <<-
	ttClobber   // >|
	ttIf        // if
	ttThen      // then
	ttElse      // else
	ttElif      // elif
	ttFi        // fi
	ttDo        // do
	ttDone      // done
	ttCase      // case
	ttEsac      // esac
	ttWhile     // while
	ttUntil     // until
	ttFor       // for
	ttLBrace    // {
	ttRBrace    // }
	ttBang      // !
	ttIn        // in
)

var operators = map[string]tokenType{
	"&&":  ttAndIf,
	"||":  ttOrIf,
	";;":  ttDSemi,
	"<<":  ttDLess,
	">>":  ttDGreat,
	"<&":  ttLessAnd,
	">&":  ttGreatAnd,
	"<>":  ttLessGreat,
	"<<-": ttDLessDash,
	">|":  ttClobber,
}

var reservedWords = map[string]tokenType{
	"if":    ttIf,
	"then":  ttThen,
	"else":  ttElse,
	"elif":  ttElif,
	"fi":    ttFi,
	"do":    ttDo,
	"done":  ttDone,
	"case":  ttCase,
	"esac":  ttEsac,
	"while": ttWhile,
	"until": ttUntil,
	"for":   ttFor,
	"{":     ttLBrace,
	"}":     ttRBrace,
	"!":     ttBang,
	"in":    ttIn,
}

var portableCharSet = "\x00\a\b\t\n\v\f\r !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"

// tokenize splits the input into an array of tokens.
// TODO: memoize?
func tokenize(script string) ([]token, error) {
	ts := []token{}
	b := 0 // Beginning of current token
	i := 0 // Index of current character

	// Tokenizer states
	const (
		sStart = iota
		sEscape
		sOperator
		sWord
		sWordEscape
		sSingleQuote
		sDoubleQuote
		sDoubleQuoteEscape
		sLineComment
		sLineCommentEscape
	)
	state := sStart

	// Iterate over each character + an imaginary blank character.
	for {
		// Current character being processed
		var c byte

		// Check for EOF
		if i == len(script) {
			switch state {
			case sStart, sOperator, sWord, sLineComment:
				// Use an imaginary blank character to delimit the last token.
				c = ' '
			default:
				return ts, fmt.Errorf("INCOMPLETE") // TODO
			}
		} else {
			c = script[i]
		}

		// The scanner is implemented with a DFA:
		// * outer switch -- selects state
		// * inner switch -- selects transition
		switch state {

		// Start state
		case sStart:
			b = i
			// TODO: \r
			switch c {
			default:
				state = sWord
			case ' ', '\t':
				state = sStart
			case '\n':
				ts = append(ts, token{script[i : i+1], ttNewLine})
			case '\\':
				state = sEscape
			case '\'':
				state = sSingleQuote
			case '"':
				state = sDoubleQuote
			case '#':
				state = sLineComment
			case '&', '|', ';', '<', '>':
				state = sOperator
			}

		// Escape
		case sEscape:
			switch c {
			case '\n':
				state = sStart
			default:
				state = sWord
			}

		// Words
		case sWord:
			switch c {
			case ' ', '\t', '\n', '#', '&', '|', ';', '<', '>':
				// The token may contain a line escape. This is cleaned up
				// during variable expansion.
				ts = append(ts, token{script[b:i], ttWord})
				state = sStart
				i--
			case '\\':
				state = sWordEscape
			}
		case sWordEscape:
			state = sWord

		// Single quotes
		case sSingleQuote:
			// This optimization iterates quicker.
			for i < len(script) && script[i] != '\'' {
				i++
			}
			if i == len(script) {
				// Unterminated single quote.
				return ts, fmt.Errorf("INCOMPLETE") // TODO
			}
			state = sWord

		// Double quotes
		case sDoubleQuote:
			switch c {
			case '"':
				state = sWord
			case '\\':
				state = sDoubleQuoteEscape
			}
		case sDoubleQuoteEscape:
			state = sDoubleQuote

		// Line comment
		case sLineComment:
			switch c {
			case '\n':
				ts = append(ts, token{script[i : i+1], ttNewLine})
				state = sStart
			case '\\':
				state = sLineCommentEscape
			}
		case sLineCommentEscape:
			state = sLineComment

		// Operators
		case sOperator:
			switch c {
			case '&', '|', ';', '<', '>', '-':
				// Keep extending the operator only while the longer string
				// is itself a valid operator.
				_, ok := operators[script[b:i+1]]
				if ok {
					break
				}
				fallthrough
			default:
				// Remove line escapes before looking up the operator.
				word := strings.ReplaceAll(script[b:i], "\\\n", "")
				op, ok := operators[word]
				if ok {
					ts = append(ts, token{word, op})
				} else {
					ts = append(ts, token{word, ttWord})
				}
				state = sStart
				i--
			}
		}

		if i == len(script) {
			break
		}
		i++
	}

	return ts, nil
}
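
A minimal usage sketch, not part of the original file: it assumes a hypothetical package-level test file (say, lexer_sketch_test.go) next to lexer.go, since tokenize and token are unexported. The test name and the example script are illustrative only; the comments describe what the DFA above produces for this input.

package pogosh

import "testing"

// TestTokenizeSketch is an illustrative example of the token stream that
// tokenize produces. Reserved words such as "if" and "fi" are emitted as
// plain ttWord tokens here; the reservedWords map is presumably consulted
// later, outside this file.
func TestTokenizeSketch(t *testing.T) {
	ts, err := tokenize("if true; then echo hi >> log; fi\n")
	if err != nil {
		t.Fatalf("tokenize: %v", err)
	}
	for _, tok := range ts {
		// ">>" comes out as ttDGreat; a lone ";" is not in the operators
		// map and is therefore emitted as a ttWord.
		t.Logf("%q (type %d)", tok.value, tok.ttype)
	}
}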