github.com/supabase/cli@v1.168.1/internal/utils/parser/token.go

package parser

import (
	"bufio"
	"io"
	"strings"
	"unicode/utf8"

	"github.com/go-errors/errors"
	"github.com/spf13/viper"
)

// Equal to `startBufSize` from `bufio/scan.go`
const startBufSize = 4096

// bufio.Scanner's default capacity of 64 * 1024 is not enough for certain
// lines containing e.g. geographical data. 256K ought to be enough for
// anybody...
var MaxScannerCapacity = 256 * 1024

// State transition table for the tokenizer:
//
// Ready -> Ready (default)
// Ready -> Error (on invalid syntax)
// Ready -> Done (on ;, emit token)
// Ready -> Done (on EOF, emit token)
//
// Ready -> Comment (on --)
// Comment -> Comment (default)
// Comment -> Ready (on \n)
//
// Ready -> Block (on /*)
// Block -> Block (on /*, +-depth)
// Block -> Ready (on */, depth 0)
//
// Ready -> Quote (on ')
// Quote -> Quote (on '', default)
// Quote -> Ready (on ')
//
// Ready -> Dollar (on $tag$)
// Dollar -> Dollar (default)
// Dollar -> Ready (on $tag$)
//
// Ready -> Escape (on \)
// Escape -> Ready (on next)
type tokenizer struct {
	state State
	last  int
}

func (t *tokenizer) ScanToken(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// If we requested more data, resume from the last position.
	for width := 0; t.last < len(data); t.last += width {
		var r rune
		// Assign without := so the loop's width is updated, advancing by the
		// full rune width rather than one byte at a time.
		r, width = utf8.DecodeRune(data[t.last:])
		end := t.last + width
		t.state = t.state.Next(r, data[:end])
		// The Done state is represented as nil: emit the token.
		if t.state == nil {
			t.last = 0
			t.state = &ReadyState{}
			return end, data[:end], nil
		}
	}
	if !atEOF || len(data) == 0 {
		// Request more data or end the stream
		return 0, nil, nil
	}
	// We're at EOF. If we have a final, non-terminated token, return it.
	return len(data), data, nil
}
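// exampleScanToken is an illustrative sketch, not part of the original file:
// it drives ScanToken by hand instead of through bufio.Scanner. The first
// call consumes up to and including the ';' and emits "SELECT 1;"; the
// second call reaches EOF with a non-terminated token and flushes the rest.
func exampleScanToken() (first, rest string) {
	t := tokenizer{state: &ReadyState{}}
	data := []byte("SELECT 1; SELECT 2")
	advance, token, _ := t.ScanToken(data, true)
	first = string(token) // "SELECT 1;"
	_, token, _ = t.ScanToken(data[advance:], true)
	rest = string(token) // " SELECT 2"
	return first, rest
}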
// Use bufio.Scanner to split a PostgreSQL string into multiple statements.
//
// The core problem is to figure out whether the current ; separator is inside
// an escaped string literal. PostgreSQL has multiple ways of opening a string
// literal or comment: $$, ', --, /*, etc. We use an FSM to guarantee these
// states are entered exclusively. When not in one of the above escape states,
// the next ; token can be parsed as a statement separator.
//
// Each statement is split as is, without removing comments or white spaces.
func Split(sql io.Reader, transform ...func(string) string) (stats []string, err error) {
	t := tokenizer{state: &ReadyState{}}
	scanner := bufio.NewScanner(sql)

	// Increase the scanner capacity to support very long lines containing
	// e.g. geodata. Users may raise the limit further via the
	// SUPABASE_SCANNER_BUFFER_SIZE environment variable.
	buf := make([]byte, startBufSize)
	maxbuf := int(viper.GetSizeInBytes("SCANNER_BUFFER_SIZE"))
	if maxbuf == 0 {
		maxbuf = MaxScannerCapacity
	}
	scanner.Buffer(buf, maxbuf)
	scanner.Split(t.ScanToken)

	var token string
	for scanner.Scan() {
		token = scanner.Text()
		trim := token
		for _, apply := range transform {
			trim = apply(trim)
		}
		if len(trim) > 0 {
			stats = append(stats, trim)
		}
	}
	err = scanner.Err()
	if err != nil {
		err = errors.Errorf("%w\nAfter statement %d: %s", err, len(stats), token)
	}
	if errors.Is(err, bufio.ErrTooLong) {
		err = errors.Errorf("%w\nTry setting SUPABASE_SCANNER_BUFFER_SIZE=5MB (current size is %dKB)", err, maxbuf>>10)
	}
	return stats, err
}

// SplitAndTrim splits the statements and strips trailing semicolons and
// surrounding white space from each one.
func SplitAndTrim(sql io.Reader) (stats []string, err error) {
	return Split(sql, func(token string) string {
		return strings.TrimRight(token, ";")
	}, strings.TrimSpace)
}
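// exampleSplitAndTrim is an illustrative sketch, not part of the original
// file: a ';' inside a quoted string literal does not terminate a statement,
// so the input below splits into exactly two trimmed statements.
func exampleSplitAndTrim() ([]string, error) {
	sql := strings.NewReader("SELECT 'a;b'; SELECT 2;")
	// Returns []string{"SELECT 'a;b'", "SELECT 2"}, nil
	return SplitAndTrim(sql)
}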