github.com/snowflakedb/gosnowflake@v1.9.0/chunk.go (about)

     1  // Copyright (c) 2018-2022 Snowflake Computing Inc. All rights reserved.
     2  
     3  package gosnowflake
     4  
     5  import (
     6  	"bytes"
     7  	"fmt"
     8  	"io"
     9  
    10  	"unicode"
    11  	"unicode/utf16"
    12  	"unicode/utf8"
    13  )
    14  
    15  const (
    16  	defaultChunkBufferSize  int64 = 8 << 10 // 8k
    17  	defaultStringBufferSize int64 = 512
    18  )
    19  
// largeChunkDecoder is a hand-written decoder for a chunk encoded as a JSON
// array of rows, where every row is an array of cells and every cell is
// either a JSON string or null. Input is pulled from r into rbuf and consumed
// one byte at a time through the ptr/rem cursor.
type largeChunkDecoder struct {
	r io.Reader // source of the encoded chunk

	rows  int // hint for number of rows
	cells int // hint for number of cells/row

	rem int // bytes remaining in rbuf
	ptr int // position in rbuf

	rbuf []byte        // read buffer; valid data is rbuf[ptr : ptr+rem]
	sbuf *bytes.Buffer // buffer for decodeString

	// ioError is the read failure recorded by fillBuffer (io.EOF when the
	// reader is exhausted). Once it is set, nextByte stops refilling rbuf.
	ioError error
}
    34  
    35  func decodeLargeChunk(r io.Reader, rowCount int, cellCount int) ([][]*string, error) {
    36  	logger.Info("custom JSON Decoder")
    37  	lcd := largeChunkDecoder{
    38  		r, rowCount, cellCount,
    39  		0, 0,
    40  		make([]byte, defaultChunkBufferSize),
    41  		bytes.NewBuffer(make([]byte, defaultStringBufferSize)),
    42  		nil,
    43  	}
    44  
    45  	rows, err := lcd.decode()
    46  	if lcd.ioError != nil && lcd.ioError != io.EOF {
    47  		return nil, lcd.ioError
    48  	} else if err != nil {
    49  		return nil, err
    50  	}
    51  
    52  	return rows, nil
    53  }
    54  
    55  func (lcd *largeChunkDecoder) mkError(s string) error {
    56  	return fmt.Errorf("corrupt chunk: %s", s)
    57  }
    58  
    59  func (lcd *largeChunkDecoder) decode() ([][]*string, error) {
    60  	if lcd.nextByteNonWhitespace() != '[' {
    61  		return nil, lcd.mkError("expected chunk to begin with '['")
    62  	}
    63  
    64  	rows := make([][]*string, 0, lcd.rows)
    65  	if lcd.nextByteNonWhitespace() == ']' {
    66  		return rows, nil // special case of an empty chunk
    67  	}
    68  	lcd.rewind(1)
    69  
    70  OuterLoop:
    71  	for {
    72  		row, err := lcd.decodeRow()
    73  		if err != nil {
    74  			return nil, err
    75  		}
    76  		rows = append(rows, row)
    77  
    78  		switch c := lcd.nextByteNonWhitespace(); {
    79  		case c == ',':
    80  			continue // more elements in the array
    81  		case c == ']':
    82  			return rows, nil // we've scanned the whole chunk
    83  		default:
    84  			break OuterLoop
    85  		}
    86  	}
    87  	return nil, lcd.mkError("invalid row boundary")
    88  }
    89  
    90  func (lcd *largeChunkDecoder) decodeRow() ([]*string, error) {
    91  	if lcd.nextByteNonWhitespace() != '[' {
    92  		return nil, lcd.mkError("expected row to begin with '['")
    93  	}
    94  
    95  	row := make([]*string, 0, lcd.cells)
    96  	if lcd.nextByteNonWhitespace() == ']' {
    97  		return row, nil // special case of an empty row
    98  	}
    99  	lcd.rewind(1)
   100  
   101  OuterLoop:
   102  	for {
   103  		cell, err := lcd.decodeCell()
   104  		if err != nil {
   105  			return nil, err
   106  		}
   107  		row = append(row, cell)
   108  
   109  		switch c := lcd.nextByteNonWhitespace(); {
   110  		case c == ',':
   111  			continue // more elements in the array
   112  		case c == ']':
   113  			return row, nil // we've scanned the whole row
   114  		default:
   115  			break OuterLoop
   116  		}
   117  	}
   118  	return nil, lcd.mkError("invalid cell boundary")
   119  }
   120  
   121  func (lcd *largeChunkDecoder) decodeCell() (*string, error) {
   122  	c := lcd.nextByteNonWhitespace()
   123  	if c == '"' {
   124  		s, err := lcd.decodeString()
   125  		return &s, err
   126  	} else if c == 'n' {
   127  		if lcd.nextByte() == 'u' &&
   128  			lcd.nextByte() == 'l' &&
   129  			lcd.nextByte() == 'l' {
   130  			return nil, nil
   131  		}
   132  	}
   133  	return nil, lcd.mkError("cell begins with unexpected byte")
   134  }
   135  
   136  // TODO we can optimize this further by optimistically searching
   137  // the read buffer for the next string. If it's short enough and
   138  // doesn't contain any escaped characters, we can construct the
   139  // return string directly without writing to the sbuf
   140  func (lcd *largeChunkDecoder) decodeString() (string, error) {
   141  	lcd.sbuf.Reset()
   142  	for {
   143  		// NOTE if you make changes here, ensure this
   144  		// variable does not escape to the heap
   145  		c := lcd.nextByte()
   146  		if c == '"' {
   147  			break
   148  		} else if c == '\\' {
   149  			if err := lcd.decodeEscaped(); err != nil {
   150  				return "", err
   151  			}
   152  		} else if c < ' ' {
   153  			return "", lcd.mkError("unexpected control character")
   154  		} else if c < utf8.RuneSelf {
   155  			lcd.sbuf.WriteByte(c)
   156  		} else {
   157  			lcd.rewind(1)
   158  			lcd.sbuf.WriteRune(lcd.readRune())
   159  		}
   160  	}
   161  	return lcd.sbuf.String(), nil
   162  }
   163  
   164  func (lcd *largeChunkDecoder) decodeEscaped() error {
   165  	// NOTE if you make changes here, ensure this
   166  	// variable does not escape to the heap
   167  	c := lcd.nextByte()
   168  
   169  	switch c {
   170  	case '"', '\\', '/', '\'':
   171  		lcd.sbuf.WriteByte(c)
   172  	case 'b':
   173  		lcd.sbuf.WriteByte('\b')
   174  	case 'f':
   175  		lcd.sbuf.WriteByte('\f')
   176  	case 'n':
   177  		lcd.sbuf.WriteByte('\n')
   178  	case 'r':
   179  		lcd.sbuf.WriteByte('\r')
   180  	case 't':
   181  		lcd.sbuf.WriteByte('\t')
   182  	case 'u':
   183  		rr := lcd.getu4()
   184  		if rr < 0 {
   185  			return lcd.mkError("invalid escape sequence")
   186  		}
   187  		if utf16.IsSurrogate(rr) {
   188  			rr1, size := lcd.getu4WithPrefix()
   189  			if dec := utf16.DecodeRune(rr, rr1); dec != unicode.ReplacementChar {
   190  				// A valid pair; consume.
   191  				lcd.sbuf.WriteRune(dec)
   192  				break
   193  			}
   194  			// Invalid surrogate; fall back to replacement rune.
   195  			lcd.rewind(size)
   196  			rr = unicode.ReplacementChar
   197  		}
   198  		lcd.sbuf.WriteRune(rr)
   199  	default:
   200  		return lcd.mkError("invalid escape sequence: " + string(c))
   201  	}
   202  	return nil
   203  }
   204  
   205  func (lcd *largeChunkDecoder) readRune() rune {
   206  	lcd.ensureBytes(4)
   207  	r, size := utf8.DecodeRune(lcd.rbuf[lcd.ptr:])
   208  	lcd.ptr += size
   209  	lcd.rem -= size
   210  	return r
   211  }
   212  
   213  func (lcd *largeChunkDecoder) getu4WithPrefix() (rune, int) {
   214  	lcd.ensureBytes(6)
   215  
   216  	// NOTE take a snapshot of the cursor state. If this
   217  	// is not a valid rune, then we need to roll back to
   218  	// where we were before we began consuming bytes
   219  	ptr := lcd.ptr
   220  
   221  	if lcd.nextByte() != '\\' {
   222  		return -1, lcd.ptr - ptr
   223  	}
   224  	if lcd.nextByte() != 'u' {
   225  		return -1, lcd.ptr - ptr
   226  	}
   227  	r := lcd.getu4()
   228  	return r, lcd.ptr - ptr
   229  }
   230  
   231  func (lcd *largeChunkDecoder) getu4() rune {
   232  	var r rune
   233  	for i := 0; i < 4; i++ {
   234  		c := lcd.nextByte()
   235  		switch {
   236  		case '0' <= c && c <= '9':
   237  			c = c - '0'
   238  		case 'a' <= c && c <= 'f':
   239  			c = c - 'a' + 10
   240  		case 'A' <= c && c <= 'F':
   241  			c = c - 'A' + 10
   242  		default:
   243  			return -1
   244  		}
   245  		r = r*16 + rune(c)
   246  	}
   247  	return r
   248  }
   249  
   250  func (lcd *largeChunkDecoder) nextByteNonWhitespace() byte {
   251  	for {
   252  		c := lcd.nextByte()
   253  		switch c {
   254  		case ' ', '\t', '\n', '\r':
   255  			continue
   256  		default:
   257  			return c
   258  		}
   259  	}
   260  }
   261  
   262  func (lcd *largeChunkDecoder) rewind(n int) {
   263  	lcd.ptr -= n
   264  	lcd.rem += n
   265  }
   266  
   267  func (lcd *largeChunkDecoder) nextByte() byte {
   268  	if lcd.rem == 0 {
   269  		if lcd.ioError != nil {
   270  			return 0
   271  		}
   272  
   273  		lcd.ptr = 0
   274  		lcd.rem = lcd.fillBuffer(lcd.rbuf)
   275  		if lcd.rem == 0 {
   276  			return 0
   277  		}
   278  	}
   279  
   280  	b := lcd.rbuf[lcd.ptr]
   281  	lcd.ptr++
   282  
   283  	lcd.rem--
   284  	return b
   285  }
   286  
   287  func (lcd *largeChunkDecoder) ensureBytes(n int) {
   288  	if lcd.rem <= n {
   289  		rbuf := make([]byte, defaultChunkBufferSize)
   290  		// NOTE when the buffer reads from the stream, there's no
   291  		// guarantee that it will actually be filled. As such we
   292  		// must use (ptr+rem) to compute the end of the slice.
   293  		off := copy(rbuf, lcd.rbuf[lcd.ptr:lcd.ptr+lcd.rem])
   294  		add := lcd.fillBuffer(rbuf[off:])
   295  
   296  		lcd.ptr = 0
   297  		lcd.rem += add
   298  		lcd.rbuf = rbuf
   299  	}
   300  }
   301  
   302  func (lcd *largeChunkDecoder) fillBuffer(b []byte) int {
   303  	n, err := lcd.r.Read(b)
   304  	if err != nil && err != io.EOF {
   305  		lcd.ioError = err
   306  		return 0
   307  	} else if n <= 0 {
   308  		lcd.ioError = io.EOF
   309  		return 0
   310  	}
   311  	return n
   312  }