github.com/mitranim/sqlb@v0.7.2/sqlb_tokenizer.go (about) 1 package sqlb 2 3 import ( 4 "fmt" 5 "io" 6 "strconv" 7 "strings" 8 "unicode/utf8" 9 ) 10 11 /* 12 Partial SQL tokenizer used internally by `(*Prep).Parse` to parse queries, in 13 particular to convert named parameters into other expressions. 14 15 Goals: 16 17 * Correctly parse whitespace, comments, quoted strings and identifiers, 18 ordinal parameters, named parameters. 19 20 * Decently fast and allocation-free tokenization. 21 22 Non-goals: 23 24 * Full SQL parser. 25 26 Notable limitations: 27 28 * No special support for dollar-quoted strings, which are rarely if ever used 29 in dynamically-generated queries. 30 */ 31 type Tokenizer struct { 32 Source string 33 Transform func(Token) Token 34 cursor int 35 next Token 36 } 37 38 /* 39 Returns the next token if possible. When the tokenizer reaches the end, this 40 returns an empty `Token{}`. Call `Token.IsInvalid` to detect the end. 41 */ 42 func (self *Tokenizer) Next() Token { 43 for { 44 token := self.nextToken() 45 if token.IsInvalid() { 46 return Token{} 47 } 48 49 if self.Transform != nil { 50 token = self.Transform(token) 51 if token.IsInvalid() { 52 continue 53 } 54 } 55 56 return token 57 } 58 } 59 60 func (self *Tokenizer) nextToken() Token { 61 next := self.next 62 if !next.IsInvalid() { 63 self.next = Token{} 64 return next 65 } 66 67 start := self.cursor 68 69 for self.more() { 70 mid := self.cursor 71 if self.maybeWhitespace(); self.cursor > mid { 72 return self.choose(start, mid, TokenTypeWhitespace) 73 } 74 if self.maybeQuotedSingle(); self.cursor > mid { 75 return self.choose(start, mid, TokenTypeQuotedSingle) 76 } 77 if self.maybeQuotedDouble(); self.cursor > mid { 78 return self.choose(start, mid, TokenTypeQuotedDouble) 79 } 80 if self.maybeQuotedGrave(); self.cursor > mid { 81 return self.choose(start, mid, TokenTypeQuotedGrave) 82 } 83 if self.maybeCommentLine(); self.cursor > mid { 84 return self.choose(start, mid, TokenTypeCommentLine) 85 } 86 if self.maybeCommentBlock(); self.cursor > mid { 87 return self.choose(start, mid, TokenTypeCommentBlock) 88 } 89 if self.maybeDoubleColon(); self.cursor > mid { 90 return self.choose(start, mid, TokenTypeDoubleColon) 91 } 92 if self.maybeOrdinalParam(); self.cursor > mid { 93 return self.choose(start, mid, TokenTypeOrdinalParam) 94 } 95 if self.maybeNamedParam(); self.cursor > mid { 96 return self.choose(start, mid, TokenTypeNamedParam) 97 } 98 self.char() 99 } 100 101 if self.cursor > start { 102 return Token{self.from(start), TokenTypeText} 103 } 104 return Token{} 105 } 106 107 func (self *Tokenizer) choose(start, mid int, typ TokenType) Token { 108 tok := Token{self.from(mid), typ} 109 if mid > start { 110 self.setNext(tok) 111 return Token{self.Source[start:mid], TokenTypeText} 112 } 113 return tok 114 } 115 116 func (self *Tokenizer) setNext(val Token) { 117 if !self.next.IsInvalid() { 118 panic(ErrInternal{Err{ 119 `parsing SQL`, 120 errf( 121 `internal error: attempted to overwrite non-empty pending token %#v with %#v`, 122 self.next, val, 123 ), 124 }}) 125 } 126 self.next = val 127 } 128 129 func (self *Tokenizer) maybeWhitespace() { 130 for self.more() && charsetWhitespace.has(self.headByte()) { 131 self.scan(1) 132 } 133 } 134 135 func (self *Tokenizer) maybeQuotedSingle() { 136 self.maybeStringBetweenBytes(quoteSingle, quoteSingle) 137 } 138 139 func (self *Tokenizer) maybeQuotedDouble() { 140 self.maybeStringBetweenBytes(quoteDouble, quoteDouble) 141 } 142 143 func (self *Tokenizer) maybeQuotedGrave() { 144 self.maybeStringBetweenBytes(quoteGrave, quoteGrave) 145 } 146 147 func (self *Tokenizer) maybeCommentLine() { 148 if !self.scannedString(commentLinePrefix) { 149 return 150 } 151 for self.more() && !self.scannedNewline() && self.scannedChar() { 152 } 153 } 154 155 // TODO support nested block comments, which are valid in SQL. 156 func (self *Tokenizer) maybeCommentBlock() { 157 self.maybeStringBetween(commentBlockPrefix, commentBlockSuffix) 158 } 159 160 func (self *Tokenizer) maybeDoubleColon() { 161 self.maybeString(doubleColonPrefix) 162 } 163 164 func (self *Tokenizer) maybeOrdinalParam() { 165 start := self.cursor 166 if !self.scannedByte(ordinalParamPrefix) { 167 return 168 } 169 if !self.scannedDigits() { 170 self.cursor = start 171 } 172 } 173 174 func (self *Tokenizer) maybeNamedParam() { 175 start := self.cursor 176 if !self.scannedByte(namedParamPrefix) { 177 return 178 } 179 if !self.scannedIdent() { 180 self.cursor = start 181 } 182 } 183 184 func (self *Tokenizer) maybeString(val string) { 185 _ = self.scannedString(val) 186 } 187 188 func (self *Tokenizer) scannedNewline() bool { 189 start := self.cursor 190 self.maybeNewline() 191 return self.cursor > start 192 } 193 194 func (self *Tokenizer) maybeNewline() { 195 self.scan(leadingNewlineSize(self.rest())) 196 } 197 198 func (self *Tokenizer) scannedChar() bool { 199 start := self.cursor 200 self.char() 201 return self.cursor > start 202 } 203 204 func (self *Tokenizer) char() { 205 _, size := utf8.DecodeRuneInString(self.rest()) 206 self.scan(size) 207 } 208 209 func (self *Tokenizer) scannedDigits() bool { 210 start := self.cursor 211 self.maybeDigits() 212 return self.cursor > start 213 } 214 215 func (self *Tokenizer) maybeDigits() { 216 for self.more() && charsetDigitDec.has(self.headByte()) { 217 self.scan(1) 218 } 219 } 220 221 func (self *Tokenizer) scannedIdent() bool { 222 start := self.cursor 223 self.maybeIdent() 224 return self.cursor > start 225 } 226 227 func (self *Tokenizer) maybeIdent() { 228 if !self.scannedByteIn(charsetIdentStart) { 229 return 230 } 231 for self.more() && self.scannedByteIn(charsetIdent) { 232 } 233 } 234 235 func (self *Tokenizer) maybeStringBetween(prefix, suffix string) { 236 if !self.scannedString(prefix) { 237 return 238 } 239 240 for self.more() { 241 if self.scannedString(suffix) { 242 return 243 } 244 self.char() 245 } 246 247 panic(ErrUnexpectedEOF{Err{ 248 `parsing SQL`, 249 fmt.Errorf(`expected closing %q, got unexpected %w`, suffix, io.EOF), 250 }}) 251 } 252 253 func (self *Tokenizer) maybeStringBetweenBytes(prefix, suffix byte) { 254 if !self.scannedByte(prefix) { 255 return 256 } 257 258 for self.more() { 259 if self.scannedByte(suffix) { 260 return 261 } 262 self.char() 263 } 264 265 panic(ErrUnexpectedEOF{Err{ 266 `parsing SQL`, 267 fmt.Errorf(`expected closing %q, got unexpected %w`, rune(suffix), io.EOF), 268 }}) 269 } 270 271 func (self *Tokenizer) scan(val int) { 272 self.cursor += val 273 } 274 275 func (self *Tokenizer) more() bool { 276 return self.cursor < len(self.Source) 277 } 278 279 func (self *Tokenizer) rest() string { 280 return self.Source[self.cursor:] 281 } 282 283 func (self *Tokenizer) from(start int) string { 284 return self.Source[start:self.cursor] 285 } 286 287 func (self *Tokenizer) headByte() byte { 288 return self.Source[self.cursor] 289 } 290 291 func (self *Tokenizer) scannedByte(val byte) bool { 292 if self.headByte() == val { 293 self.scan(1) 294 return true 295 } 296 return false 297 } 298 299 func (self *Tokenizer) scannedByteIn(val *charset) bool { 300 if val.has(self.headByte()) { 301 self.scan(1) 302 return true 303 } 304 return false 305 } 306 307 func (self *Tokenizer) scannedString(val string) bool { 308 if strings.HasPrefix(self.rest(), val) { 309 self.scan(len(val)) 310 return true 311 } 312 return false 313 } 314 315 // Part of `Token`. 316 type TokenType byte 317 318 const ( 319 TokenTypeInvalid TokenType = iota 320 TokenTypeText 321 TokenTypeWhitespace 322 TokenTypeQuotedSingle 323 TokenTypeQuotedDouble 324 TokenTypeQuotedGrave 325 TokenTypeCommentLine 326 TokenTypeCommentBlock 327 TokenTypeDoubleColon 328 TokenTypeOrdinalParam 329 TokenTypeNamedParam 330 ) 331 332 // Represents an arbitrary chunk of SQL text parsed by `Tokenizer`. 333 type Token struct { 334 Text string 335 Type TokenType 336 } 337 338 /* 339 True if the token's type is `TokenTypeInvalid`. This is used to detect end of 340 iteration when calling `(*Tokenizer).Next`. 341 */ 342 func (self Token) IsInvalid() bool { 343 return self.Type == TokenTypeInvalid 344 } 345 346 // Implement `fmt.Stringer` for debug purposes. 347 func (self Token) String() string { return self.Text } 348 349 /* 350 Assumes that the token has `TokenTypeOrdinalParam` and looks like a 351 Postgres-style ordinal param: "$1", "$2" and so on. Parses and returns the 352 number. Panics if the text had the wrong structure. 353 */ 354 func (self Token) ParseOrdinalParam() OrdinalParam { 355 rest, err := trimPrefixByte(self.Text, ordinalParamPrefix) 356 try(errOrdinal(err)) 357 358 val, err := strconv.Atoi(rest) 359 try(errOrdinal(err)) 360 361 return OrdinalParam(val) 362 } 363 364 /* 365 Assumes that the token has `TokenTypeNamedParam` and looks like a Postgres-style 366 named param: ":one", ":two" and so on. Parses and returns the parameter's name 367 without the leading ":". Panics if the text had the wrong structure. 368 */ 369 func (self Token) ParseNamedParam() NamedParam { 370 rest, err := trimPrefixByte(self.Text, namedParamPrefix) 371 try(errNamed(err)) 372 return NamedParam(rest) 373 }