github.com/DQNEO/babygo@v0.0.3/scanner.go (about)

     1  package main
     2  
     3  import (
     4  	"github.com/DQNEO/babygo/lib/mylib"
     5  	"github.com/DQNEO/babygo/lib/strconv"
     6  )
     7  
// scanner is a byte-oriented lexer state over a single source buffer.
type scanner struct {
	// src is the input being tokenized, as handed to Init.
	src        []uint8
	// ch is the current byte; next() sets it to 1 to mark end of input.
	ch         uint8
	// offset is the index of ch within src, or len(src) once the input is
	// exhausted.
	offset     int
	// nextOffset is the index of the byte that the next call to next() reads.
	nextOffset int
	// insertSemi records whether the previously scanned token may end a
	// statement. While it is set, skipWhitespace leaves a newline in place and
	// Scan reports that newline as a ";" token.
	insertSemi bool
}
    15  
    16  func (s *scanner) next() {
    17  	if s.nextOffset < len(s.src) {
    18  		s.offset = s.nextOffset
    19  		s.ch = s.src[s.offset]
    20  		s.nextOffset++
    21  	} else {
    22  		s.offset = len(s.src)
    23  		s.ch = 1 //EOF
    24  	}
    25  }
    26  
// keywords is the list of reserved words that Scan checks scanned identifiers
// against. It has no initializer here; (*scanner).Init assigns it.
var keywords []string
    28  
    29  func (s *scanner) Init(src []uint8) {
    30  	// https://golang.org/ref/spec#Keywords
    31  	keywords = []string{
    32  		"break", "default", "func", "interface", "select",
    33  		"case", "defer", "go", "map", "struct",
    34  		"chan", "else", "goto", "package", "switch",
    35  		"const", "fallthrough", "if", "range", "type",
    36  		"continue", "for", "import", "return", "var",
    37  	}
    38  	s.src = src
    39  	s.offset = 0
    40  	s.ch = ' '
    41  	s.nextOffset = 0
    42  	s.insertSemi = false
    43  	s.next()
    44  }
    45  
    46  func isLetter(ch uint8) bool {
    47  	if ch == '_' {
    48  		return true
    49  	}
    50  	return ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z')
    51  }
    52  
    53  func isDecimal(ch uint8) bool {
    54  	return '0' <= ch && ch <= '9'
    55  }
    56  
    57  func (s *scanner) scanIdentifier() string {
    58  	var offset = s.offset
    59  	for isLetter(s.ch) || isDecimal(s.ch) {
    60  		s.next()
    61  	}
    62  	return string(s.src[offset:s.offset])
    63  }
    64  
    65  func (s *scanner) scanNumber() string {
    66  	var offset = s.offset
    67  	for isDecimal(s.ch) {
    68  		s.next()
    69  	}
    70  	return string(s.src[offset:s.offset])
    71  }
    72  
    73  func (s *scanner) scanString() string {
    74  	var offset = s.offset - 1
    75  	var escaped bool
    76  	for !escaped && s.ch != '"' {
    77  		if s.ch == '\\' {
    78  			escaped = true
    79  			s.next()
    80  			s.next()
    81  			escaped = false
    82  			continue
    83  		}
    84  		s.next()
    85  	}
    86  	s.next() // consume ending '""
    87  	return string(s.src[offset:s.offset])
    88  }
    89  
    90  func (s *scanner) scanChar() string {
    91  	// '\'' opening already consumed
    92  	var offset = s.offset - 1
    93  	var ch uint8
    94  	for {
    95  		ch = s.ch
    96  		s.next()
    97  		if ch == '\'' {
    98  			break
    99  		}
   100  		if ch == '\\' {
   101  			s.next()
   102  		}
   103  	}
   104  
   105  	return string(s.src[offset:s.offset])
   106  }
   107  
   108  func (s *scanner) scanComment() string {
   109  	var offset = s.offset - 1
   110  	for s.ch != '\n' {
   111  		s.next()
   112  	}
   113  	return string(s.src[offset:s.offset])
   114  }
   115  
// TokenContainer is one token as produced by (*scanner).Scan.
type TokenContainer struct {
	// pos is currently always set to 0 by Scan.
	// NOTE(review): presumably intended to hold the source position — confirm.
	pos int    // what's this ?
	// tok is the token kind. Keywords and operators/punctuation use their own
	// spelling (e.g. "func", ":=", ";"); other kinds produced by Scan are
	// "IDENT", "INT", "STRING", "CHAR", "COMMENT" and "EOF".
	tok string // token.Token
	// lit is the raw source text for identifiers, keywords, literals and
	// comments, ";" or "\n" for semicolon tokens, and empty otherwise.
	lit string // raw data
}
   121  
   122  // https://golang.org/ref/spec#Tokens
   123  func (s *scanner) skipWhitespace() {
   124  	for s.ch == ' ' || s.ch == '\t' || (s.ch == '\n' && !s.insertSemi) || s.ch == '\r' {
   125  		s.next()
   126  	}
   127  }
   128  
   129  func (s *scanner) Scan() *TokenContainer {
   130  	s.skipWhitespace()
   131  	var tc = &TokenContainer{}
   132  	var lit string
   133  	var tok string
   134  	var insertSemi bool
   135  	var ch = s.ch
   136  	if isLetter(ch) {
   137  		lit = s.scanIdentifier()
   138  		if mylib.InArray(lit, keywords) {
   139  			tok = lit
   140  			switch tok {
   141  			case "break", "continue", "fallthrough", "return":
   142  				insertSemi = true
   143  			}
   144  		} else {
   145  			insertSemi = true
   146  			tok = "IDENT"
   147  		}
   148  	} else if isDecimal(ch) {
   149  		insertSemi = true
   150  		lit = s.scanNumber()
   151  		tok = "INT"
   152  	} else {
   153  		s.next()
   154  		switch ch {
   155  		case '\n':
   156  			tok = ";"
   157  			lit = "\n"
   158  			insertSemi = false
   159  		case '"': // double quote
   160  			insertSemi = true
   161  			lit = s.scanString()
   162  			tok = "STRING"
   163  		case '\'': // single quote
   164  			insertSemi = true
   165  			lit = s.scanChar()
   166  			tok = "CHAR"
   167  		// https://golang.org/ref/spec#Operators_and_punctuation
   168  		//	+    &     +=    &=     &&    ==    !=    (    )
   169  		//	-    |     -=    |=     ||    <     <=    [    ]
   170  		//  *    ^     *=    ^=     <-    >     >=    {    }
   171  		//	/    <<    /=    <<=    ++    =     :=    ,    ;
   172  		//	%    >>    %=    >>=    --    !     ...   .    :
   173  		//	&^          &^=
   174  		case ':': // :=, :
   175  			if s.ch == '=' {
   176  				s.next()
   177  				tok = ":="
   178  			} else {
   179  				tok = ":"
   180  			}
   181  		case '.': // ..., .
   182  			var peekCh = s.src[s.nextOffset]
   183  			if s.ch == '.' && peekCh == '.' {
   184  				s.next()
   185  				s.next()
   186  				tok = "..."
   187  			} else {
   188  				tok = "."
   189  			}
   190  		case ',':
   191  			tok = ","
   192  		case ';':
   193  			tok = ";"
   194  			lit = ";"
   195  		case '(':
   196  			tok = "("
   197  		case ')':
   198  			insertSemi = true
   199  			tok = ")"
   200  		case '[':
   201  			tok = "["
   202  		case ']':
   203  			insertSemi = true
   204  			tok = "]"
   205  		case '{':
   206  			tok = "{"
   207  		case '}':
   208  			insertSemi = true
   209  			tok = "}"
   210  		case '+': // +=, ++, +
   211  			switch s.ch {
   212  			case '=':
   213  				s.next()
   214  				tok = "+="
   215  			case '+':
   216  				s.next()
   217  				tok = "++"
   218  				insertSemi = true
   219  			default:
   220  				tok = "+"
   221  			}
   222  		case '-': // -= --  -
   223  			switch s.ch {
   224  			case '-':
   225  				s.next()
   226  				tok = "--"
   227  				insertSemi = true
   228  			case '=':
   229  				s.next()
   230  				tok = "-="
   231  			default:
   232  				tok = "-"
   233  			}
   234  		case '*': // *=  *
   235  			if s.ch == '=' {
   236  				s.next()
   237  				tok = "*="
   238  			} else {
   239  				tok = "*"
   240  			}
   241  		case '/':
   242  			if s.ch == '/' {
   243  				// comment
   244  				// @TODO block comment
   245  				if s.insertSemi {
   246  					s.ch = '/'
   247  					s.offset = s.offset - 1
   248  					s.nextOffset = s.offset + 1
   249  					tc.lit = "\n"
   250  					tc.tok = ";"
   251  					s.insertSemi = false
   252  					return tc
   253  				}
   254  				lit = s.scanComment()
   255  				tok = "COMMENT"
   256  			} else if s.ch == '=' {
   257  				tok = "/="
   258  			} else {
   259  				tok = "/"
   260  			}
   261  		case '%': // %= %
   262  			if s.ch == '=' {
   263  				s.next()
   264  				tok = "%="
   265  			} else {
   266  				tok = "%"
   267  			}
   268  		case '^': // ^= ^
   269  			if s.ch == '=' {
   270  				s.next()
   271  				tok = "^="
   272  			} else {
   273  				tok = "^"
   274  			}
   275  		case '<': //  <= <- <<= <<
   276  			switch s.ch {
   277  			case '-':
   278  				s.next()
   279  				tok = "<-"
   280  			case '=':
   281  				s.next()
   282  				tok = "<="
   283  			case '<':
   284  				var peekCh = s.src[s.nextOffset]
   285  				if peekCh == '=' {
   286  					s.next()
   287  					s.next()
   288  					tok = "<<="
   289  				} else {
   290  					s.next()
   291  					tok = "<<"
   292  				}
   293  			default:
   294  				tok = "<"
   295  			}
   296  		case '>': // >= >>= >> >
   297  			switch s.ch {
   298  			case '=':
   299  				s.next()
   300  				tok = ">="
   301  			case '>':
   302  				var peekCh = s.src[s.nextOffset]
   303  				if peekCh == '=' {
   304  					s.next()
   305  					s.next()
   306  					tok = ">>="
   307  				} else {
   308  					s.next()
   309  					tok = ">>"
   310  				}
   311  			default:
   312  				tok = ">"
   313  			}
   314  		case '=': // == =
   315  			if s.ch == '=' {
   316  				s.next()
   317  				tok = "=="
   318  			} else {
   319  				tok = "="
   320  			}
   321  		case '!': // !=, !
   322  			if s.ch == '=' {
   323  				s.next()
   324  				tok = "!="
   325  			} else {
   326  				tok = "!"
   327  			}
   328  		case '&': // & &= && &^ &^=
   329  			switch s.ch {
   330  			case '=':
   331  				s.next()
   332  				tok = "&="
   333  			case '&':
   334  				s.next()
   335  				tok = "&&"
   336  			case '^':
   337  				var peekCh = s.src[s.nextOffset]
   338  				if peekCh == '=' {
   339  					s.next()
   340  					s.next()
   341  					tok = "&^="
   342  				} else {
   343  					s.next()
   344  					tok = "&^"
   345  				}
   346  			default:
   347  				tok = "&"
   348  			}
   349  		case '|': // |= || |
   350  			switch s.ch {
   351  			case '|':
   352  				s.next()
   353  				tok = "||"
   354  			case '=':
   355  				s.next()
   356  				tok = "|="
   357  			default:
   358  				tok = "|"
   359  			}
   360  		case 1:
   361  			tok = "EOF"
   362  		default:
   363  			panic2(__func__, "unknown char:"+string([]uint8{ch})+":"+strconv.Itoa(int(ch)))
   364  			tok = "UNKNOWN"
   365  		}
   366  	}
   367  	tc.lit = lit
   368  	tc.pos = 0
   369  	tc.tok = tok
   370  	s.insertSemi = insertSemi
   371  	return tc
   372  }