github.com/mitranim/sqlb@v0.7.2/sqlb_tokenizer.go (about)

     1  package sqlb
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"strconv"
     7  	"strings"
     8  	"unicode/utf8"
     9  )
    10  
/*
Partial SQL tokenizer used internally by `(*Prep).Parse` to parse queries, in
particular to convert named parameters into other expressions.

Goals:

	* Correctly parse whitespace, comments, quoted strings and identifiers,
	  ordinal parameters, named parameters.

	* Decently fast and allocation-free tokenization.

Non-goals:

	* Full SQL parser.

Notable limitations:

	* No special support for dollar-quoted strings, which are rarely if ever used
	  in dynamically-generated queries.
*/
type Tokenizer struct {
	Source    string            // SQL text to tokenize; never mutated, tokens are slices of it.
	Transform func(Token) Token // Optional filter/rewriter; returning an invalid token drops it.
	cursor    int               // Byte offset of the next unread position in `Source`.
	next      Token             // Pending token buffered by `choose` behind preceding plain text.
}
    37  
    38  /*
    39  Returns the next token if possible. When the tokenizer reaches the end, this
    40  returns an empty `Token{}`. Call `Token.IsInvalid` to detect the end.
    41  */
    42  func (self *Tokenizer) Next() Token {
    43  	for {
    44  		token := self.nextToken()
    45  		if token.IsInvalid() {
    46  			return Token{}
    47  		}
    48  
    49  		if self.Transform != nil {
    50  			token = self.Transform(token)
    51  			if token.IsInvalid() {
    52  				continue
    53  			}
    54  		}
    55  
    56  		return token
    57  	}
    58  }
    59  
    60  func (self *Tokenizer) nextToken() Token {
    61  	next := self.next
    62  	if !next.IsInvalid() {
    63  		self.next = Token{}
    64  		return next
    65  	}
    66  
    67  	start := self.cursor
    68  
    69  	for self.more() {
    70  		mid := self.cursor
    71  		if self.maybeWhitespace(); self.cursor > mid {
    72  			return self.choose(start, mid, TokenTypeWhitespace)
    73  		}
    74  		if self.maybeQuotedSingle(); self.cursor > mid {
    75  			return self.choose(start, mid, TokenTypeQuotedSingle)
    76  		}
    77  		if self.maybeQuotedDouble(); self.cursor > mid {
    78  			return self.choose(start, mid, TokenTypeQuotedDouble)
    79  		}
    80  		if self.maybeQuotedGrave(); self.cursor > mid {
    81  			return self.choose(start, mid, TokenTypeQuotedGrave)
    82  		}
    83  		if self.maybeCommentLine(); self.cursor > mid {
    84  			return self.choose(start, mid, TokenTypeCommentLine)
    85  		}
    86  		if self.maybeCommentBlock(); self.cursor > mid {
    87  			return self.choose(start, mid, TokenTypeCommentBlock)
    88  		}
    89  		if self.maybeDoubleColon(); self.cursor > mid {
    90  			return self.choose(start, mid, TokenTypeDoubleColon)
    91  		}
    92  		if self.maybeOrdinalParam(); self.cursor > mid {
    93  			return self.choose(start, mid, TokenTypeOrdinalParam)
    94  		}
    95  		if self.maybeNamedParam(); self.cursor > mid {
    96  			return self.choose(start, mid, TokenTypeNamedParam)
    97  		}
    98  		self.char()
    99  	}
   100  
   101  	if self.cursor > start {
   102  		return Token{self.from(start), TokenTypeText}
   103  	}
   104  	return Token{}
   105  }
   106  
   107  func (self *Tokenizer) choose(start, mid int, typ TokenType) Token {
   108  	tok := Token{self.from(mid), typ}
   109  	if mid > start {
   110  		self.setNext(tok)
   111  		return Token{self.Source[start:mid], TokenTypeText}
   112  	}
   113  	return tok
   114  }
   115  
   116  func (self *Tokenizer) setNext(val Token) {
   117  	if !self.next.IsInvalid() {
   118  		panic(ErrInternal{Err{
   119  			`parsing SQL`,
   120  			errf(
   121  				`internal error: attempted to overwrite non-empty pending token %#v with %#v`,
   122  				self.next, val,
   123  			),
   124  		}})
   125  	}
   126  	self.next = val
   127  }
   128  
   129  func (self *Tokenizer) maybeWhitespace() {
   130  	for self.more() && charsetWhitespace.has(self.headByte()) {
   131  		self.scan(1)
   132  	}
   133  }
   134  
// Advances past a single-quoted string, if present. Note: doubled-quote
// escapes (`''`) are not specially handled; the scan ends at the first
// closing quote.
func (self *Tokenizer) maybeQuotedSingle() {
	self.maybeStringBetweenBytes(quoteSingle, quoteSingle)
}

// Advances past a double-quoted identifier, if present.
func (self *Tokenizer) maybeQuotedDouble() {
	self.maybeStringBetweenBytes(quoteDouble, quoteDouble)
}

// Advances past a grave-quoted (backtick) identifier, if present.
func (self *Tokenizer) maybeQuotedGrave() {
	self.maybeStringBetweenBytes(quoteGrave, quoteGrave)
}
   146  
   147  func (self *Tokenizer) maybeCommentLine() {
   148  	if !self.scannedString(commentLinePrefix) {
   149  		return
   150  	}
   151  	for self.more() && !self.scannedNewline() && self.scannedChar() {
   152  	}
   153  }
   154  
// Advances past a block comment, if present. Panics on an unterminated
// comment; see `maybeStringBetween`.
// TODO support nested block comments, which are valid in SQL.
func (self *Tokenizer) maybeCommentBlock() {
	self.maybeStringBetween(commentBlockPrefix, commentBlockSuffix)
}

// Advances past a double-colon sequence, if present. `nextToken` attempts
// this before named params, so `::` casts are not misread as named params.
func (self *Tokenizer) maybeDoubleColon() {
	self.maybeString(doubleColonPrefix)
}
   163  
   164  func (self *Tokenizer) maybeOrdinalParam() {
   165  	start := self.cursor
   166  	if !self.scannedByte(ordinalParamPrefix) {
   167  		return
   168  	}
   169  	if !self.scannedDigits() {
   170  		self.cursor = start
   171  	}
   172  }
   173  
   174  func (self *Tokenizer) maybeNamedParam() {
   175  	start := self.cursor
   176  	if !self.scannedByte(namedParamPrefix) {
   177  		return
   178  	}
   179  	if !self.scannedIdent() {
   180  		self.cursor = start
   181  	}
   182  }
   183  
// Advances past the given string, if present, discarding the "did scan"
// result.
func (self *Tokenizer) maybeString(val string) {
	_ = self.scannedString(val)
}
   187  
   188  func (self *Tokenizer) scannedNewline() bool {
   189  	start := self.cursor
   190  	self.maybeNewline()
   191  	return self.cursor > start
   192  }
   193  
// Advances past one leading newline sequence, if any. `leadingNewlineSize` is
// presumably zero when the remaining input does not start with a newline.
func (self *Tokenizer) maybeNewline() {
	self.scan(leadingNewlineSize(self.rest()))
}
   197  
   198  func (self *Tokenizer) scannedChar() bool {
   199  	start := self.cursor
   200  	self.char()
   201  	return self.cursor > start
   202  }
   203  
   204  func (self *Tokenizer) char() {
   205  	_, size := utf8.DecodeRuneInString(self.rest())
   206  	self.scan(size)
   207  }
   208  
   209  func (self *Tokenizer) scannedDigits() bool {
   210  	start := self.cursor
   211  	self.maybeDigits()
   212  	return self.cursor > start
   213  }
   214  
   215  func (self *Tokenizer) maybeDigits() {
   216  	for self.more() && charsetDigitDec.has(self.headByte()) {
   217  		self.scan(1)
   218  	}
   219  }
   220  
   221  func (self *Tokenizer) scannedIdent() bool {
   222  	start := self.cursor
   223  	self.maybeIdent()
   224  	return self.cursor > start
   225  }
   226  
   227  func (self *Tokenizer) maybeIdent() {
   228  	if !self.scannedByteIn(charsetIdentStart) {
   229  		return
   230  	}
   231  	for self.more() && self.scannedByteIn(charsetIdent) {
   232  	}
   233  }
   234  
   235  func (self *Tokenizer) maybeStringBetween(prefix, suffix string) {
   236  	if !self.scannedString(prefix) {
   237  		return
   238  	}
   239  
   240  	for self.more() {
   241  		if self.scannedString(suffix) {
   242  			return
   243  		}
   244  		self.char()
   245  	}
   246  
   247  	panic(ErrUnexpectedEOF{Err{
   248  		`parsing SQL`,
   249  		fmt.Errorf(`expected closing %q, got unexpected %w`, suffix, io.EOF),
   250  	}})
   251  }
   252  
   253  func (self *Tokenizer) maybeStringBetweenBytes(prefix, suffix byte) {
   254  	if !self.scannedByte(prefix) {
   255  		return
   256  	}
   257  
   258  	for self.more() {
   259  		if self.scannedByte(suffix) {
   260  			return
   261  		}
   262  		self.char()
   263  	}
   264  
   265  	panic(ErrUnexpectedEOF{Err{
   266  		`parsing SQL`,
   267  		fmt.Errorf(`expected closing %q, got unexpected %w`, rune(suffix), io.EOF),
   268  	}})
   269  }
   270  
// Advances the cursor by `val` bytes.
func (self *Tokenizer) scan(val int) {
	self.cursor += val
}

// True while unread input remains.
func (self *Tokenizer) more() bool {
	return self.cursor < len(self.Source)
}

// Returns the not-yet-consumed remainder of the source.
func (self *Tokenizer) rest() string {
	return self.Source[self.cursor:]
}

// Returns the source between `start` and the current cursor position.
func (self *Tokenizer) from(start int) string {
	return self.Source[start:self.cursor]
}

// Returns the byte at the cursor. Callers must check `more` first; otherwise
// this panics on out-of-bounds access.
func (self *Tokenizer) headByte() byte {
	return self.Source[self.cursor]
}
   290  
   291  func (self *Tokenizer) scannedByte(val byte) bool {
   292  	if self.headByte() == val {
   293  		self.scan(1)
   294  		return true
   295  	}
   296  	return false
   297  }
   298  
   299  func (self *Tokenizer) scannedByteIn(val *charset) bool {
   300  	if val.has(self.headByte()) {
   301  		self.scan(1)
   302  		return true
   303  	}
   304  	return false
   305  }
   306  
   307  func (self *Tokenizer) scannedString(val string) bool {
   308  	if strings.HasPrefix(self.rest(), val) {
   309  		self.scan(len(val))
   310  		return true
   311  	}
   312  	return false
   313  }
   314  
// Part of `Token`. Identifies the lexical category of a token.
type TokenType byte

const (
	TokenTypeInvalid      TokenType = iota // Zero value; marks end of iteration.
	TokenTypeText                          // Arbitrary text with no special meaning.
	TokenTypeWhitespace                    // Run of whitespace.
	TokenTypeQuotedSingle                  // Single-quoted string, including quotes.
	TokenTypeQuotedDouble                  // Double-quoted identifier, including quotes.
	TokenTypeQuotedGrave                   // Grave-quoted identifier, including quotes.
	TokenTypeCommentLine                   // Line comment, including its prefix.
	TokenTypeCommentBlock                  // Block comment, including delimiters.
	TokenTypeDoubleColon                   // Double-colon sequence.
	TokenTypeOrdinalParam                  // Ordinal parameter such as "$1".
	TokenTypeNamedParam                    // Named parameter such as ":name".
)
   331  
// Represents an arbitrary chunk of SQL text parsed by `Tokenizer`.
type Token struct {
	Text string    // Verbatim slice of the tokenizer's source.
	Type TokenType // Lexical category; see the `TokenType` constants.
}
   337  
/*
True if the token's type is `TokenTypeInvalid`. This is used to detect end of
iteration when calling `(*Tokenizer).Next`.
*/
func (self Token) IsInvalid() bool {
	return self.Type == TokenTypeInvalid
}

// Implement `fmt.Stringer` for debug purposes. Returns the raw token text.
func (self Token) String() string { return self.Text }
   348  
   349  /*
   350  Assumes that the token has `TokenTypeOrdinalParam` and looks like a
   351  Postgres-style ordinal param: "$1", "$2" and so on. Parses and returns the
   352  number. Panics if the text had the wrong structure.
   353  */
   354  func (self Token) ParseOrdinalParam() OrdinalParam {
   355  	rest, err := trimPrefixByte(self.Text, ordinalParamPrefix)
   356  	try(errOrdinal(err))
   357  
   358  	val, err := strconv.Atoi(rest)
   359  	try(errOrdinal(err))
   360  
   361  	return OrdinalParam(val)
   362  }
   363  
   364  /*
   365  Assumes that the token has `TokenTypeNamedParam` and looks like a Postgres-style
   366  named param: ":one", ":two" and so on. Parses and returns the parameter's name
   367  without the leading ":". Panics if the text had the wrong structure.
   368  */
   369  func (self Token) ParseNamedParam() NamedParam {
   370  	rest, err := trimPrefixByte(self.Text, namedParamPrefix)
   371  	try(errNamed(err))
   372  	return NamedParam(rest)
   373  }