github.com/Redstoneguy129/cli@v0.0.0-20230211220159-15dca4e91917/internal/utils/parser/token.go

package parser

import (
	"bufio"
	"fmt"
	"io"
	"strings"
	"unicode/utf8"

	"github.com/Redstoneguy129/cli/internal/utils"
	"github.com/spf13/viper"
)

const (
	// The default max capacity of bufio.Scanner is 64 * 1024, which is not
	// enough for certain lines containing e.g. geographical data.
	// 256K ought to be enough for anybody...
	MaxScannerCapacity = 256 * 1024
	// Equal to `startBufSize` from `bufio/scan.go`
	startBufSize = 4096
)

// State transition table for the tokenizer:
//
// Ready -> Ready (default)
// Ready -> Error (on invalid syntax)
// Ready -> Done (on ;, emit token)
// Ready -> Done (on EOF, emit token)
//
// Ready -> Comment (on --)
// Comment -> Comment (default)
// Comment -> Ready (on \n)
//
// Ready -> Block (on /*)
// Block -> Block (on /*, +-depth)
// Block -> Ready (on */, depth 0)
//
// Ready -> Quote (on ')
// Quote -> Quote (on '', default)
// Quote -> Ready (on ')
//
// Ready -> Dollar (on $tag$)
// Dollar -> Dollar (default)
// Dollar -> Ready (on $tag$)
//
// Ready -> Escape (on \)
// Escape -> Ready (on next)
type tokenizer struct {
	state State
	last  int
}

func (t *tokenizer) ScanToken(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// If we requested more data, resume from the last scanned position.
	// Advance rune by rune so multibyte characters are decoded correctly.
	for t.last < len(data) {
		r, width := utf8.DecodeRune(data[t.last:])
		end := t.last + width
		t.state = t.state.Next(r, data[:end])
		// Emit token
		if t.state == nil {
			t.last = 0
			t.state = &ReadyState{}
			return end, data[:end], nil
		}
		t.last += width
	}
	if !atEOF || len(data) == 0 {
		// Request more data or end the stream
		return 0, nil, nil
	}
	// We're at EOF. If we have a final, non-terminated token, return it.
	return len(data), data, nil
}

// Split uses bufio.Scanner to split a PostgreSQL string into multiple statements.
//
// The core problem is to figure out whether the current ; separator is inside
// an escaped region. PostgreSQL has multiple ways of opening string literals
// ($$, ') and comments (--, /*). We use an FSM to guarantee these states are
// entered exclusively. If not in one of those escape states, the next ; token
// can be parsed as a statement separator.
//
// Each statement is split as is, without removing comments or white space.
func Split(sql io.Reader, transform ...func(string) string) (stats []string, err error) {
	t := tokenizer{state: &ReadyState{}}
	scanner := bufio.NewScanner(sql)

	// Increase scanner capacity to support very long lines containing e.g. geodata.
	buf := make([]byte, startBufSize)
	maxbuf := viper.GetSizeInBytes("SCANNER_BUFFER_SIZE")
	if maxbuf == 0 {
		maxbuf = MaxScannerCapacity
	}
	scanner.Buffer(buf, int(maxbuf))
	scanner.Split(t.ScanToken)

	var token string
	for scanner.Scan() {
		token = scanner.Text()
		trim := token
		for _, apply := range transform {
			trim = apply(trim)
		}
		if len(trim) > 0 {
			stats = append(stats, trim)
		}
	}
	err = scanner.Err()
	if err != nil {
		err = fmt.Errorf("%v\nAfter statement %d: %s", err, len(stats), utils.Aqua(token))
	}
	return stats, err
}

// SplitAndTrim splits the statements and strips each of trailing semicolons
// and surrounding white space.
func SplitAndTrim(sql io.Reader) (stats []string, err error) {
	return Split(sql, func(token string) string {
		return strings.TrimRight(token, ";")
	}, strings.TrimSpace)
}
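
// Editor's note (sketch, not part of the original file): State and ReadyState
// are defined elsewhere in this package and are not shown on this page. From
// their usage in ScanToken above, the interface is assumed to look roughly
// like this, with a nil return signalling that data[:end] forms a complete
// statement:
//
//	type State interface {
//		// Next consumes one rune; data holds everything scanned so far
//		// for the current token.
//		Next(r rune, data []byte) State
//	}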
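
// Usage sketch (illustrative, not part of the original file): splitting a SQL
// script with SplitAndTrim. The import assumes a caller inside this module,
// since the package lives under internal/. Note that the ; inside the quoted
// literal is not treated as a separator, thanks to the Quote state.
//
//	package main
//
//	import (
//		"fmt"
//		"strings"
//
//		"github.com/Redstoneguy129/cli/internal/utils/parser"
//	)
//
//	func main() {
//		stats, err := parser.SplitAndTrim(strings.NewReader("SELECT 1;\nSELECT 'a;b';"))
//		if err != nil {
//			panic(err)
//		}
//		fmt.Println(stats) // prints: [SELECT 1 SELECT 'a;b']
//	}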
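
// Buffer sizing sketch (illustrative, not part of the original file): the max
// scanner capacity can be raised through viper before calling Split, e.g. for
// dumps whose single lines exceed 256K. "SCANNER_BUFFER_SIZE" is the key read
// in Split above, and viper.GetSizeInBytes understands size suffixes:
//
//	viper.Set("SCANNER_BUFFER_SIZE", "1mb") // 1 * 1024 * 1024 bytes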