github.com/snowflakedb/gosnowflake@v1.9.0/chunk.go (about) 1 // Copyright (c) 2018-2022 Snowflake Computing Inc. All rights reserved. 2 3 package gosnowflake 4 5 import ( 6 "bytes" 7 "fmt" 8 "io" 9 10 "unicode" 11 "unicode/utf16" 12 "unicode/utf8" 13 ) 14 15 const ( 16 defaultChunkBufferSize int64 = 8 << 10 // 8k 17 defaultStringBufferSize int64 = 512 18 ) 19 20 type largeChunkDecoder struct { 21 r io.Reader 22 23 rows int // hint for number of rows 24 cells int // hint for number of cells/row 25 26 rem int // bytes remaining in rbuf 27 ptr int // position in rbuf 28 29 rbuf []byte 30 sbuf *bytes.Buffer // buffer for decodeString 31 32 ioError error 33 } 34 35 func decodeLargeChunk(r io.Reader, rowCount int, cellCount int) ([][]*string, error) { 36 logger.Info("custom JSON Decoder") 37 lcd := largeChunkDecoder{ 38 r, rowCount, cellCount, 39 0, 0, 40 make([]byte, defaultChunkBufferSize), 41 bytes.NewBuffer(make([]byte, defaultStringBufferSize)), 42 nil, 43 } 44 45 rows, err := lcd.decode() 46 if lcd.ioError != nil && lcd.ioError != io.EOF { 47 return nil, lcd.ioError 48 } else if err != nil { 49 return nil, err 50 } 51 52 return rows, nil 53 } 54 55 func (lcd *largeChunkDecoder) mkError(s string) error { 56 return fmt.Errorf("corrupt chunk: %s", s) 57 } 58 59 func (lcd *largeChunkDecoder) decode() ([][]*string, error) { 60 if lcd.nextByteNonWhitespace() != '[' { 61 return nil, lcd.mkError("expected chunk to begin with '['") 62 } 63 64 rows := make([][]*string, 0, lcd.rows) 65 if lcd.nextByteNonWhitespace() == ']' { 66 return rows, nil // special case of an empty chunk 67 } 68 lcd.rewind(1) 69 70 OuterLoop: 71 for { 72 row, err := lcd.decodeRow() 73 if err != nil { 74 return nil, err 75 } 76 rows = append(rows, row) 77 78 switch c := lcd.nextByteNonWhitespace(); { 79 case c == ',': 80 continue // more elements in the array 81 case c == ']': 82 return rows, nil // we've scanned the whole chunk 83 default: 84 break OuterLoop 85 } 86 } 87 return nil, lcd.mkError("invalid row boundary") 88 } 89 90 func (lcd *largeChunkDecoder) decodeRow() ([]*string, error) { 91 if lcd.nextByteNonWhitespace() != '[' { 92 return nil, lcd.mkError("expected row to begin with '['") 93 } 94 95 row := make([]*string, 0, lcd.cells) 96 if lcd.nextByteNonWhitespace() == ']' { 97 return row, nil // special case of an empty row 98 } 99 lcd.rewind(1) 100 101 OuterLoop: 102 for { 103 cell, err := lcd.decodeCell() 104 if err != nil { 105 return nil, err 106 } 107 row = append(row, cell) 108 109 switch c := lcd.nextByteNonWhitespace(); { 110 case c == ',': 111 continue // more elements in the array 112 case c == ']': 113 return row, nil // we've scanned the whole row 114 default: 115 break OuterLoop 116 } 117 } 118 return nil, lcd.mkError("invalid cell boundary") 119 } 120 121 func (lcd *largeChunkDecoder) decodeCell() (*string, error) { 122 c := lcd.nextByteNonWhitespace() 123 if c == '"' { 124 s, err := lcd.decodeString() 125 return &s, err 126 } else if c == 'n' { 127 if lcd.nextByte() == 'u' && 128 lcd.nextByte() == 'l' && 129 lcd.nextByte() == 'l' { 130 return nil, nil 131 } 132 } 133 return nil, lcd.mkError("cell begins with unexpected byte") 134 } 135 136 // TODO we can optimize this further by optimistically searching 137 // the read buffer for the next string. If it's short enough and 138 // doesn't contain any escaped characters, we can construct the 139 // return string directly without writing to the sbuf 140 func (lcd *largeChunkDecoder) decodeString() (string, error) { 141 lcd.sbuf.Reset() 142 for { 143 // NOTE if you make changes here, ensure this 144 // variable does not escape to the heap 145 c := lcd.nextByte() 146 if c == '"' { 147 break 148 } else if c == '\\' { 149 if err := lcd.decodeEscaped(); err != nil { 150 return "", err 151 } 152 } else if c < ' ' { 153 return "", lcd.mkError("unexpected control character") 154 } else if c < utf8.RuneSelf { 155 lcd.sbuf.WriteByte(c) 156 } else { 157 lcd.rewind(1) 158 lcd.sbuf.WriteRune(lcd.readRune()) 159 } 160 } 161 return lcd.sbuf.String(), nil 162 } 163 164 func (lcd *largeChunkDecoder) decodeEscaped() error { 165 // NOTE if you make changes here, ensure this 166 // variable does not escape to the heap 167 c := lcd.nextByte() 168 169 switch c { 170 case '"', '\\', '/', '\'': 171 lcd.sbuf.WriteByte(c) 172 case 'b': 173 lcd.sbuf.WriteByte('\b') 174 case 'f': 175 lcd.sbuf.WriteByte('\f') 176 case 'n': 177 lcd.sbuf.WriteByte('\n') 178 case 'r': 179 lcd.sbuf.WriteByte('\r') 180 case 't': 181 lcd.sbuf.WriteByte('\t') 182 case 'u': 183 rr := lcd.getu4() 184 if rr < 0 { 185 return lcd.mkError("invalid escape sequence") 186 } 187 if utf16.IsSurrogate(rr) { 188 rr1, size := lcd.getu4WithPrefix() 189 if dec := utf16.DecodeRune(rr, rr1); dec != unicode.ReplacementChar { 190 // A valid pair; consume. 191 lcd.sbuf.WriteRune(dec) 192 break 193 } 194 // Invalid surrogate; fall back to replacement rune. 195 lcd.rewind(size) 196 rr = unicode.ReplacementChar 197 } 198 lcd.sbuf.WriteRune(rr) 199 default: 200 return lcd.mkError("invalid escape sequence: " + string(c)) 201 } 202 return nil 203 } 204 205 func (lcd *largeChunkDecoder) readRune() rune { 206 lcd.ensureBytes(4) 207 r, size := utf8.DecodeRune(lcd.rbuf[lcd.ptr:]) 208 lcd.ptr += size 209 lcd.rem -= size 210 return r 211 } 212 213 func (lcd *largeChunkDecoder) getu4WithPrefix() (rune, int) { 214 lcd.ensureBytes(6) 215 216 // NOTE take a snapshot of the cursor state. If this 217 // is not a valid rune, then we need to roll back to 218 // where we were before we began consuming bytes 219 ptr := lcd.ptr 220 221 if lcd.nextByte() != '\\' { 222 return -1, lcd.ptr - ptr 223 } 224 if lcd.nextByte() != 'u' { 225 return -1, lcd.ptr - ptr 226 } 227 r := lcd.getu4() 228 return r, lcd.ptr - ptr 229 } 230 231 func (lcd *largeChunkDecoder) getu4() rune { 232 var r rune 233 for i := 0; i < 4; i++ { 234 c := lcd.nextByte() 235 switch { 236 case '0' <= c && c <= '9': 237 c = c - '0' 238 case 'a' <= c && c <= 'f': 239 c = c - 'a' + 10 240 case 'A' <= c && c <= 'F': 241 c = c - 'A' + 10 242 default: 243 return -1 244 } 245 r = r*16 + rune(c) 246 } 247 return r 248 } 249 250 func (lcd *largeChunkDecoder) nextByteNonWhitespace() byte { 251 for { 252 c := lcd.nextByte() 253 switch c { 254 case ' ', '\t', '\n', '\r': 255 continue 256 default: 257 return c 258 } 259 } 260 } 261 262 func (lcd *largeChunkDecoder) rewind(n int) { 263 lcd.ptr -= n 264 lcd.rem += n 265 } 266 267 func (lcd *largeChunkDecoder) nextByte() byte { 268 if lcd.rem == 0 { 269 if lcd.ioError != nil { 270 return 0 271 } 272 273 lcd.ptr = 0 274 lcd.rem = lcd.fillBuffer(lcd.rbuf) 275 if lcd.rem == 0 { 276 return 0 277 } 278 } 279 280 b := lcd.rbuf[lcd.ptr] 281 lcd.ptr++ 282 283 lcd.rem-- 284 return b 285 } 286 287 func (lcd *largeChunkDecoder) ensureBytes(n int) { 288 if lcd.rem <= n { 289 rbuf := make([]byte, defaultChunkBufferSize) 290 // NOTE when the buffer reads from the stream, there's no 291 // guarantee that it will actually be filled. As such we 292 // must use (ptr+rem) to compute the end of the slice. 293 off := copy(rbuf, lcd.rbuf[lcd.ptr:lcd.ptr+lcd.rem]) 294 add := lcd.fillBuffer(rbuf[off:]) 295 296 lcd.ptr = 0 297 lcd.rem += add 298 lcd.rbuf = rbuf 299 } 300 } 301 302 func (lcd *largeChunkDecoder) fillBuffer(b []byte) int { 303 n, err := lcd.r.Read(b) 304 if err != nil && err != io.EOF { 305 lcd.ioError = err 306 return 0 307 } else if n <= 0 { 308 lcd.ioError = io.EOF 309 return 0 310 } 311 return n 312 }