github.com/Redstoneguy129/cli@v0.0.0-20230211220159-15dca4e91917/internal/utils/parser/token.go (about)

     1  package parser
     2  
     3  import (
     4  	"bufio"
     5  	"fmt"
     6  	"io"
     7  	"strings"
     8  	"unicode/utf8"
     9  
    10  	"github.com/Redstoneguy129/cli/internal/utils"
    11  	"github.com/spf13/viper"
    12  )
    13  
const (
	// MaxScannerCapacity is the fallback maximum token size for the scanner,
	// used when SCANNER_BUFFER_SIZE is not configured.
	// The bufio default max capacity is 64 * 1024 which is not enough for
	// certain lines containing e.g. geographical data.
	// 256K ought to be enough for anybody...
	MaxScannerCapacity = 256 * 1024
	// startBufSize is the scanner's initial buffer size.
	// Equal to `startBufSize` from `bufio/scan.go`.
	startBufSize = 4096
)
    22  
    23  // State transition table for tokenizer:
    24  //
    25  //	Ready -> Ready (default)
    26  //	Ready -> Error (on invalid syntax)
    27  //	Ready -> Done (on ;, emit token)
    28  //	Ready -> Done (on EOF, emit token)
    29  //
    30  //	Ready -> Comment (on --)
    31  //	Comment -> Comment (default)
    32  //	Comment -> Ready (on \n)
    33  //
    34  //	Ready -> Block (on /*)
    35  //	Block -> Block (on /*, +-depth)
    36  //	Block -> Ready (on */, depth 0)
    37  //
    38  //	Ready -> Quote (on ')
    39  //	Quote -> Quote (on '', default)
    40  //	Quote -> Ready (on ')
    41  //
    42  //	Ready -> Dollar (on $tag$)
    43  //	Dollar -> Dollar (default)
    44  //	Dollar -> Ready (on $tag$)
    45  //
    46  //	Ready -> Escape (on \)
    47  //	Escape -> Ready (on next)
// tokenizer holds the FSM state needed to implement a bufio.SplitFunc that
// splits SQL text into statements (see the state transition table above).
type tokenizer struct {
	state State // current FSM state; a nil transition signals a completed statement
	last  int   // byte offset already consumed in data; resume point when more data arrives
}
    52  
    53  func (t *tokenizer) ScanToken(data []byte, atEOF bool) (advance int, token []byte, err error) {
    54  	// If we requested more data, resume from last position.
    55  	for width := 1; t.last < len(data); t.last += width {
    56  		r, width := utf8.DecodeRune(data[t.last:])
    57  		end := t.last + width
    58  		t.state = t.state.Next(r, data[:end])
    59  		// Emit token
    60  		if t.state == nil {
    61  			t.last = 0
    62  			t.state = &ReadyState{}
    63  			return end, data[:end], nil
    64  		}
    65  	}
    66  	if !atEOF || len(data) == 0 {
    67  		// Request more data or end the stream
    68  		return 0, nil, nil
    69  	}
    70  	// We're at EOF. If we have a final, non-terminated token, return it.
    71  	return len(data), data, nil
    72  }
    73  
    74  // Use bufio.Scanner to split a PostgreSQL string into multiple statements.
    75  //
    76  // The core problem is to figure out whether the current ; separator is inside
    77  // an escaped string literal. PostgreSQL has multiple ways of opening a string
    78  // literal, $$, ', --, /*, etc. We use a FSM to guarantee these states are
    79  // entered exclusively. If not in one of the above escape states, the next ;
    80  // token can be parsed as statement separator.
    81  //
    82  // Each statement is split as it is, without removing comments or white spaces.
    83  func Split(sql io.Reader, transform ...func(string) string) (stats []string, err error) {
    84  	t := tokenizer{state: &ReadyState{}}
    85  	scanner := bufio.NewScanner(sql)
    86  
    87  	// Increase scanner capacity to support very long lines containing e.g. geodata
    88  	buf := make([]byte, startBufSize)
    89  	maxbuf := viper.GetSizeInBytes("SCANNER_BUFFER_SIZE")
    90  	if maxbuf == 0 {
    91  		maxbuf = MaxScannerCapacity
    92  	}
    93  	scanner.Buffer(buf, int(maxbuf))
    94  	scanner.Split(t.ScanToken)
    95  
    96  	var token string
    97  	for scanner.Scan() {
    98  		token = scanner.Text()
    99  		trim := token
   100  		for _, apply := range transform {
   101  			trim = apply(trim)
   102  		}
   103  		if len(trim) > 0 {
   104  			stats = append(stats, trim)
   105  		}
   106  	}
   107  	err = scanner.Err()
   108  	if err != nil {
   109  		err = fmt.Errorf("%v\nAfter statement %d: %s", err, len(stats), utils.Aqua(token))
   110  	}
   111  	return stats, err
   112  }
   113  
   114  func SplitAndTrim(sql io.Reader) (stats []string, err error) {
   115  	return Split(sql, func(token string) string {
   116  		return strings.TrimRight(token, ";")
   117  	}, strings.TrimSpace)
   118  }