github.com/wfusion/gofusion@v1.1.14/common/utils/sqlparser/lexer.go (about)

     1  package sqlparser
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"io"
     7  	"strings"
     8  	"unicode"
     9  )
    10  
    11  type Lexer struct {
    12  	r   io.RuneReader
    13  	buf bytes.Buffer
    14  
    15  	ch   rune
    16  	pos  Pos
    17  	full bool
    18  }
    19  
    20  func NewLexer(r io.Reader) *Lexer {
    21  	return &Lexer{
    22  		r:   bufio.NewReader(r),
    23  		pos: Pos{Offset: -1, Line: 1},
    24  	}
    25  }
    26  
    27  func (l *Lexer) Lex() (pos Pos, token Token, lit string) {
    28  	for {
    29  		if ch := l.peek(); ch == -1 {
    30  			return l.pos, EOF, ""
    31  		} else if unicode.IsSpace(ch) {
    32  			l.read()
    33  			continue
    34  		} else if isDigit(ch) || ch == '.' {
    35  			return l.lexNumber()
    36  		} else if ch == 'x' || ch == 'X' {
    37  			return l.lexBlob()
    38  		} else if isAlpha(ch) || ch == '_' {
    39  			return l.lexUnquotedIdent(l.pos, "")
    40  		} else if ch == '"' || ch == '`' {
    41  			return l.lexQuotedIdent(ch)
    42  		} else if ch == '\'' {
    43  			return l.lexString()
    44  		} else if ch == '?' || ch == ':' || ch == '@' || ch == '$' {
    45  			return l.lexBind()
    46  		}
    47  
    48  		switch ch, pos := l.read(); ch {
    49  		case ';':
    50  			return pos, SEMI, ";"
    51  		case '(':
    52  			return pos, LP, "("
    53  		case ')':
    54  			return pos, RP, ")"
    55  		case ',':
    56  			return pos, COMMA, ","
    57  		case '!':
    58  			if l.peek() == '=' {
    59  				l.read()
    60  				return pos, NE, "!="
    61  			}
    62  			return pos, BITNOT, "!"
    63  		case '=':
    64  			return pos, EQ, "="
    65  		case '<':
    66  			if l.peek() == '=' {
    67  				l.read()
    68  				return pos, LE, "<="
    69  			} else if l.peek() == '<' {
    70  				l.read()
    71  				return pos, LSHIFT, "<<"
    72  			} else if l.peek() == '>' {
    73  				l.read()
    74  				return pos, LG, "<>"
    75  			}
    76  			return pos, LT, "<"
    77  		case '>':
    78  			if l.peek() == '=' {
    79  				l.read()
    80  				return pos, GE, ">="
    81  			} else if l.peek() == '>' {
    82  				l.read()
    83  				return pos, RSHIFT, ">>"
    84  			}
    85  			return pos, GT, ">"
    86  		case '&':
    87  			return pos, BITAND, "&"
    88  		case '|':
    89  			if l.peek() == '|' {
    90  				l.read()
    91  				return pos, CONCAT, "||"
    92  			}
    93  			return pos, BITOR, "|"
    94  		case '+':
    95  			return pos, PLUS, "+"
    96  		case '-':
    97  			return pos, MINUS, "-"
    98  		case '*':
    99  			return pos, STAR, "*"
   100  		case '/':
   101  			if l.peek() == '*' {
   102  				return l.lexMultilineComment()
   103  			}
   104  			return pos, SLASH, "/"
   105  		case '%':
   106  			return pos, REM, "%"
   107  		default:
   108  			return pos, ILLEGAL, string(ch)
   109  		}
   110  	}
   111  }
   112  
   113  func (l *Lexer) lexUnquotedIdent(pos Pos, prefix string) (Pos, Token, string) {
   114  	assert(isUnquotedIdent(l.peek()))
   115  
   116  	l.buf.Reset()
   117  	l.buf.WriteString(prefix)
   118  	for ch, _ := l.read(); isUnquotedIdent(ch); ch, _ = l.read() {
   119  		l.buf.WriteRune(ch)
   120  	}
   121  	l.unread()
   122  
   123  	lit := l.buf.String()
   124  	tok := Lookup(lit)
   125  	return pos, tok, lit
   126  }
   127  
   128  func (l *Lexer) lexQuotedIdent(char rune) (Pos, Token, string) {
   129  	ch, pos := l.read()
   130  	assert(ch == char)
   131  
   132  	l.buf.Reset()
   133  	l.buf.WriteRune(char)
   134  	for {
   135  		ch, _ := l.read()
   136  		if ch == -1 {
   137  			return pos, ILLEGAL, l.buf.String()
   138  		} else if ch == char {
   139  			if l.peek() == char { // escaped quote
   140  				l.read()
   141  				l.buf.WriteRune(char)
   142  				continue
   143  			}
   144  			l.buf.WriteRune(char)
   145  			return pos, QIDENT, l.buf.String()
   146  		}
   147  		l.buf.WriteRune(ch)
   148  	}
   149  }
   150  
   151  func (l *Lexer) lexString() (Pos, Token, string) {
   152  	ch, pos := l.read()
   153  	assert(ch == '\'')
   154  
   155  	l.buf.Reset()
   156  	for {
   157  		ch, _ := l.read()
   158  		if ch == -1 {
   159  			return pos, ILLEGAL, `'` + l.buf.String()
   160  		} else if ch == '\'' {
   161  			if l.peek() == '\'' { // escaped quote
   162  				l.read()
   163  				l.buf.WriteRune('\'')
   164  				continue
   165  			}
   166  			return pos, STRING, l.buf.String()
   167  		}
   168  		l.buf.WriteRune(ch)
   169  	}
   170  }
   171  
   172  func (l *Lexer) lexMultilineComment() (Pos, Token, string) {
   173  	ch, pos := l.read()
   174  	assert(ch == '*')
   175  
   176  	l.buf.Reset()
   177  	for {
   178  		ch, _ := l.read()
   179  		if ch == -1 {
   180  			return pos, ILLEGAL, `/*` + l.buf.String()
   181  		} else if ch == '*' {
   182  			if l.peek() == '/' {
   183  				l.read()
   184  				l.read()
   185  				return pos, MLCOMMENT, strings.Trim(l.buf.String(), " ")
   186  			}
   187  		}
   188  		l.buf.WriteRune(ch)
   189  	}
   190  }
   191  
   192  func (l *Lexer) lexBind() (Pos, Token, string) {
   193  	start, pos := l.read()
   194  
   195  	l.buf.Reset()
   196  	l.buf.WriteRune(start)
   197  
   198  	// Question mark starts a numeric bind.
   199  	if start == '?' {
   200  		for isDigit(l.peek()) {
   201  			ch, _ := l.read()
   202  			l.buf.WriteRune(ch)
   203  		}
   204  		return pos, BIND, l.buf.String()
   205  	}
   206  
   207  	// All other characters start an alphanumeric bind.
   208  	assert(start == ':' || start == '@' || start == '$')
   209  	for isUnquotedIdent(l.peek()) {
   210  		ch, _ := l.read()
   211  		l.buf.WriteRune(ch)
   212  	}
   213  	return pos, BIND, l.buf.String()
   214  }
   215  
   216  func (l *Lexer) lexBlob() (Pos, Token, string) {
   217  	start, pos := l.read()
   218  	assert(start == 'x' || start == 'X')
   219  
   220  	// If the next character is not a quote, it's an IDENT.
   221  	if isUnquotedIdent(l.peek()) {
   222  		return l.lexUnquotedIdent(pos, string(start))
   223  	} else if l.peek() != '\'' {
   224  		return pos, IDENT, string(start)
   225  	}
   226  	ch, _ := l.read()
   227  	assert(ch == '\'')
   228  
   229  	l.buf.Reset()
   230  	for i := 0; ; i++ {
   231  		ch, _ := l.read()
   232  		if ch == '\'' {
   233  			return pos, BLOB, l.buf.String()
   234  		} else if ch == -1 {
   235  			return pos, ILLEGAL, string(start) + `'` + l.buf.String()
   236  		} else if !isHex(ch) {
   237  			return pos, ILLEGAL, string(start) + `'` + l.buf.String() + string(ch)
   238  		}
   239  		l.buf.WriteRune(ch)
   240  	}
   241  }
   242  
   243  func (l *Lexer) lexNumber() (Pos, Token, string) {
   244  	assert(isDigit(l.peek()) || l.peek() == '.')
   245  	pos := l.pos
   246  	tok := INTEGER
   247  
   248  	l.buf.Reset()
   249  
   250  	// Read whole number if starting with a digit.
   251  	if isDigit(l.peek()) {
   252  		for isDigit(l.peek()) {
   253  			ch, _ := l.read()
   254  			l.buf.WriteRune(ch)
   255  		}
   256  	}
   257  
   258  	// Read decimal and successive digitl.
   259  	if l.peek() == '.' {
   260  		tok = FLOAT
   261  
   262  		ch, _ := l.read()
   263  		l.buf.WriteRune(ch)
   264  
   265  		for isDigit(l.peek()) {
   266  			ch, _ := l.read()
   267  			l.buf.WriteRune(ch)
   268  		}
   269  	}
   270  
   271  	// Read exponent with optional +/- sign.
   272  	if ch := l.peek(); ch == 'e' || ch == 'E' {
   273  		tok = FLOAT
   274  
   275  		ch, _ := l.read()
   276  		l.buf.WriteRune(ch)
   277  
   278  		if l.peek() == '+' || l.peek() == '-' {
   279  			ch, _ := l.read()
   280  			l.buf.WriteRune(ch)
   281  			if !isDigit(l.peek()) {
   282  				return pos, ILLEGAL, l.buf.String()
   283  			}
   284  			for isDigit(l.peek()) {
   285  				ch, _ := l.read()
   286  				l.buf.WriteRune(ch)
   287  			}
   288  		} else if isDigit(l.peek()) {
   289  			for isDigit(l.peek()) {
   290  				ch, _ := l.read()
   291  				l.buf.WriteRune(ch)
   292  			}
   293  		} else {
   294  			return pos, ILLEGAL, l.buf.String()
   295  		}
   296  	}
   297  
   298  	lit := l.buf.String()
   299  	if lit == "." {
   300  		return pos, DOT, lit
   301  	}
   302  	return pos, tok, lit
   303  }
   304  
   305  func (l *Lexer) read() (rune, Pos) {
   306  	if l.full {
   307  		l.full = false
   308  		return l.ch, l.pos
   309  	}
   310  
   311  	var err error
   312  	l.ch, _, err = l.r.ReadRune()
   313  	if err != nil {
   314  		l.ch = -1
   315  		return l.ch, l.pos
   316  	}
   317  
   318  	l.pos.Offset++
   319  	if l.ch == '\n' {
   320  		l.pos.Line++
   321  		l.pos.Column = 0
   322  	} else {
   323  		l.pos.Column++
   324  	}
   325  	return l.ch, l.pos
   326  }
   327  
   328  func (l *Lexer) peek() rune {
   329  	if !l.full {
   330  		l.read()
   331  		l.unread()
   332  	}
   333  	return l.ch
   334  }
   335  
   336  func (l *Lexer) unread() {
   337  	assert(!l.full)
   338  	l.full = true
   339  }
   340  
   341  func isDigit(ch rune) bool {
   342  	return ch >= '0' && ch <= '9'
   343  }
   344  
   345  func isAlpha(ch rune) bool {
   346  	return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
   347  }
   348  
   349  func isHex(ch rune) bool {
   350  	return isDigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F')
   351  }
   352  
   353  func isUnquotedIdent(ch rune) bool {
   354  	return isAlpha(ch) || isDigit(ch) || ch == '_'
   355  }
   356  
   357  // IsInteger returns true if s only contains digits.
   358  func IsInteger(s string) bool {
   359  	for _, ch := range s {
   360  		if !isDigit(ch) {
   361  			return false
   362  		}
   363  	}
   364  	return s != ""
   365  }