github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/type.go (about) 1 package parquet 2 3 import ( 4 "bytes" 5 "encoding/json" 6 "fmt" 7 "math/bits" 8 "reflect" 9 "time" 10 "unsafe" 11 12 "github.com/parquet-go/parquet-go/deprecated" 13 "github.com/parquet-go/parquet-go/encoding" 14 "github.com/parquet-go/parquet-go/format" 15 "github.com/parquet-go/parquet-go/internal/unsafecast" 16 ) 17 18 // Kind is an enumeration type representing the physical types supported by the 19 // parquet type system. 20 type Kind int8 21 22 const ( 23 Boolean Kind = Kind(format.Boolean) 24 Int32 Kind = Kind(format.Int32) 25 Int64 Kind = Kind(format.Int64) 26 Int96 Kind = Kind(format.Int96) 27 Float Kind = Kind(format.Float) 28 Double Kind = Kind(format.Double) 29 ByteArray Kind = Kind(format.ByteArray) 30 FixedLenByteArray Kind = Kind(format.FixedLenByteArray) 31 ) 32 33 // String returns a human-readable representation of the physical type. 34 func (k Kind) String() string { return format.Type(k).String() } 35 36 // Value constructs a value from k and v. 37 // 38 // The method panics if the data is not a valid representation of the value 39 // kind; for example, if the kind is Int32 but the data is not 4 bytes long. 40 func (k Kind) Value(v []byte) Value { 41 x, err := parseValue(k, v) 42 if err != nil { 43 panic(err) 44 } 45 return x 46 } 47 48 // The Type interface represents logical types of the parquet type system. 49 // 50 // Types are immutable and therefore safe to access from multiple goroutines. 51 type Type interface { 52 // Returns a human-readable representation of the parquet type. 53 String() string 54 55 // Returns the Kind value representing the underlying physical type. 56 // 57 // The method panics if it is called on a group type. 58 Kind() Kind 59 60 // For integer and floating point physical types, the method returns the 61 // size of values in bits. 62 // 63 // For fixed-length byte arrays, the method returns the size of elements 64 // in bytes. 65 // 66 // For other types, the value is zero. 67 Length() int 68 69 // Returns an estimation of the number of bytes required to hold the given 70 // number of values of this type in memory. 71 // 72 // The method returns zero for group types. 73 EstimateSize(numValues int) int 74 75 // Returns an estimation of the number of values of this type that can be 76 // held in the given byte size. 77 // 78 // The method returns zero for group types. 79 EstimateNumValues(size int) int 80 81 // Compares two values and returns a negative integer if a < b, positive if 82 // a > b, or zero if a == b. 83 // 84 // The values' Kind must match the type, otherwise the result is undefined. 85 // 86 // The method panics if it is called on a group type. 87 Compare(a, b Value) int 88 89 // ColumnOrder returns the type's column order. For group types, this method 90 // returns nil. 91 // 92 // The order describes the comparison logic implemented by the Less method. 93 // 94 // As an optimization, the method may return the same pointer across 95 // multiple calls. Applications must treat the returned value as immutable, 96 // mutating the value will result in undefined behavior. 97 ColumnOrder() *format.ColumnOrder 98 99 // Returns the physical type as a *format.Type value. For group types, this 100 // method returns nil. 101 // 102 // As an optimization, the method may return the same pointer across 103 // multiple calls. Applications must treat the returned value as immutable, 104 // mutating the value will result in undefined behavior. 105 PhysicalType() *format.Type 106 107 // Returns the logical type as a *format.LogicalType value. When the logical 108 // type is unknown, the method returns nil. 109 // 110 // As an optimization, the method may return the same pointer across 111 // multiple calls. Applications must treat the returned value as immutable, 112 // mutating the value will result in undefined behavior. 113 LogicalType() *format.LogicalType 114 115 // Returns the logical type's equivalent converted type. When there are 116 // no equivalent converted type, the method returns nil. 117 // 118 // As an optimization, the method may return the same pointer across 119 // multiple calls. Applications must treat the returned value as immutable, 120 // mutating the value will result in undefined behavior. 121 ConvertedType() *deprecated.ConvertedType 122 123 // Creates a column indexer for values of this type. 124 // 125 // The size limit is a hint to the column indexer that it is allowed to 126 // truncate the page boundaries to the given size. Only BYTE_ARRAY and 127 // FIXED_LEN_BYTE_ARRAY types currently take this value into account. 128 // 129 // A value of zero or less means no limits. 130 // 131 // The method panics if it is called on a group type. 132 NewColumnIndexer(sizeLimit int) ColumnIndexer 133 134 // Creates a row group buffer column for values of this type. 135 // 136 // Column buffers are created using the index of the column they are 137 // accumulating values in memory for (relative to the parent schema), 138 // and the size of their memory buffer. 139 // 140 // The application may give an estimate of the number of values it expects 141 // to write to the buffer as second argument. This estimate helps set the 142 // initialize buffer capacity but is not a hard limit, the underlying memory 143 // buffer will grown as needed to allow more values to be written. Programs 144 // may use the Size method of the column buffer (or the parent row group, 145 // when relevant) to determine how many bytes are being used, and perform a 146 // flush of the buffers to a storage layer. 147 // 148 // The method panics if it is called on a group type. 149 NewColumnBuffer(columnIndex, numValues int) ColumnBuffer 150 151 // Creates a dictionary holding values of this type. 152 // 153 // The dictionary retains the data buffer, it does not make a copy of it. 154 // If the application needs to share ownership of the memory buffer, it must 155 // ensure that it will not be modified while the page is in use, or it must 156 // make a copy of it prior to creating the dictionary. 157 // 158 // The method panics if the data type does not correspond to the parquet 159 // type it is called on. 160 NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary 161 162 // Creates a page belonging to a column at the given index, backed by the 163 // data buffer. 164 // 165 // The page retains the data buffer, it does not make a copy of it. If the 166 // application needs to share ownership of the memory buffer, it must ensure 167 // that it will not be modified while the page is in use, or it must make a 168 // copy of it prior to creating the page. 169 // 170 // The method panics if the data type does not correspond to the parquet 171 // type it is called on. 172 NewPage(columnIndex, numValues int, data encoding.Values) Page 173 174 // Creates an encoding.Values instance backed by the given buffers. 175 // 176 // The offsets is only used by BYTE_ARRAY types, where it represents the 177 // positions of each variable length value in the values buffer. 178 // 179 // The following expression creates an empty instance for any type: 180 // 181 // values := typ.NewValues(nil, nil) 182 // 183 // The method panics if it is called on group types. 184 NewValues(values []byte, offsets []uint32) encoding.Values 185 186 // Assuming the src buffer contains PLAIN encoded values of the type it is 187 // called on, applies the given encoding and produces the output to the dst 188 // buffer passed as first argument by dispatching the call to one of the 189 // encoding methods. 190 Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) 191 192 // Assuming the src buffer contains values encoding in the given encoding, 193 // decodes the input and produces the encoded values into the dst output 194 // buffer passed as first argument by dispatching the call to one of the 195 // encoding methods. 196 Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) 197 198 // Returns an estimation of the output size after decoding the values passed 199 // as first argument with the given encoding. 200 // 201 // For most types, this is similar to calling EstimateSize with the known 202 // number of encoded values. For variable size types, using this method may 203 // provide a more precise result since it can inspect the input buffer. 204 EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int 205 206 // Assigns a Parquet value to a Go value. Returns an error if assignment is 207 // not possible. The source Value must be an expected logical type for the 208 // receiver. This can be accomplished using ConvertValue. 209 AssignValue(dst reflect.Value, src Value) error 210 211 // Convert a Parquet Value of the given Type into a Parquet Value that is 212 // compatible with the receiver. The returned Value is suitable to be passed 213 // to AssignValue. 214 ConvertValue(val Value, typ Type) (Value, error) 215 } 216 217 var ( 218 BooleanType Type = booleanType{} 219 Int32Type Type = int32Type{} 220 Int64Type Type = int64Type{} 221 Int96Type Type = int96Type{} 222 FloatType Type = floatType{} 223 DoubleType Type = doubleType{} 224 ByteArrayType Type = byteArrayType{} 225 ) 226 227 // In the current parquet version supported by this library, only type-defined 228 // orders are supported. 229 var typeDefinedColumnOrder = format.ColumnOrder{ 230 TypeOrder: new(format.TypeDefinedOrder), 231 } 232 233 var physicalTypes = [...]format.Type{ 234 0: format.Boolean, 235 1: format.Int32, 236 2: format.Int64, 237 3: format.Int96, 238 4: format.Float, 239 5: format.Double, 240 6: format.ByteArray, 241 7: format.FixedLenByteArray, 242 } 243 244 var convertedTypes = [...]deprecated.ConvertedType{ 245 0: deprecated.UTF8, 246 1: deprecated.Map, 247 2: deprecated.MapKeyValue, 248 3: deprecated.List, 249 4: deprecated.Enum, 250 5: deprecated.Decimal, 251 6: deprecated.Date, 252 7: deprecated.TimeMillis, 253 8: deprecated.TimeMicros, 254 9: deprecated.TimestampMillis, 255 10: deprecated.TimestampMicros, 256 11: deprecated.Uint8, 257 12: deprecated.Uint16, 258 13: deprecated.Uint32, 259 14: deprecated.Uint64, 260 15: deprecated.Int8, 261 16: deprecated.Int16, 262 17: deprecated.Int32, 263 18: deprecated.Int64, 264 19: deprecated.Json, 265 20: deprecated.Bson, 266 21: deprecated.Interval, 267 } 268 269 type booleanType struct{} 270 271 func (t booleanType) String() string { return "BOOLEAN" } 272 func (t booleanType) Kind() Kind { return Boolean } 273 func (t booleanType) Length() int { return 1 } 274 func (t booleanType) EstimateSize(n int) int { return (n + 7) / 8 } 275 func (t booleanType) EstimateNumValues(n int) int { return 8 * n } 276 func (t booleanType) Compare(a, b Value) int { return compareBool(a.boolean(), b.boolean()) } 277 func (t booleanType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 278 func (t booleanType) LogicalType() *format.LogicalType { return nil } 279 func (t booleanType) ConvertedType() *deprecated.ConvertedType { return nil } 280 func (t booleanType) PhysicalType() *format.Type { return &physicalTypes[Boolean] } 281 282 func (t booleanType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 283 return newBooleanColumnIndexer() 284 } 285 286 func (t booleanType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 287 return newBooleanColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 288 } 289 290 func (t booleanType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 291 return newBooleanDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 292 } 293 294 func (t booleanType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 295 return newBooleanPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 296 } 297 298 func (t booleanType) NewValues(values []byte, _ []uint32) encoding.Values { 299 return encoding.BooleanValues(values) 300 } 301 302 func (t booleanType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 303 return encoding.EncodeBoolean(dst, src, enc) 304 } 305 306 func (t booleanType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 307 return encoding.DecodeBoolean(dst, src, enc) 308 } 309 310 func (t booleanType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 311 return t.EstimateSize(numValues) 312 } 313 314 func (t booleanType) AssignValue(dst reflect.Value, src Value) error { 315 v := src.boolean() 316 switch dst.Kind() { 317 case reflect.Bool: 318 dst.SetBool(v) 319 default: 320 dst.Set(reflect.ValueOf(v)) 321 } 322 return nil 323 } 324 325 func (t booleanType) ConvertValue(val Value, typ Type) (Value, error) { 326 switch typ.(type) { 327 case *stringType: 328 return convertStringToBoolean(val) 329 } 330 switch typ.Kind() { 331 case Boolean: 332 return val, nil 333 case Int32: 334 return convertInt32ToBoolean(val) 335 case Int64: 336 return convertInt64ToBoolean(val) 337 case Int96: 338 return convertInt96ToBoolean(val) 339 case Float: 340 return convertFloatToBoolean(val) 341 case Double: 342 return convertDoubleToBoolean(val) 343 case ByteArray, FixedLenByteArray: 344 return convertByteArrayToBoolean(val) 345 default: 346 return makeValueKind(Boolean), nil 347 } 348 } 349 350 type int32Type struct{} 351 352 func (t int32Type) String() string { return "INT32" } 353 func (t int32Type) Kind() Kind { return Int32 } 354 func (t int32Type) Length() int { return 32 } 355 func (t int32Type) EstimateSize(n int) int { return 4 * n } 356 func (t int32Type) EstimateNumValues(n int) int { return n / 4 } 357 func (t int32Type) Compare(a, b Value) int { return compareInt32(a.int32(), b.int32()) } 358 func (t int32Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 359 func (t int32Type) LogicalType() *format.LogicalType { return nil } 360 func (t int32Type) ConvertedType() *deprecated.ConvertedType { return nil } 361 func (t int32Type) PhysicalType() *format.Type { return &physicalTypes[Int32] } 362 363 func (t int32Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { 364 return newInt32ColumnIndexer() 365 } 366 367 func (t int32Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 368 return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 369 } 370 371 func (t int32Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 372 return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 373 } 374 375 func (t int32Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { 376 return newInt32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 377 } 378 379 func (t int32Type) NewValues(values []byte, _ []uint32) encoding.Values { 380 return encoding.Int32ValuesFromBytes(values) 381 } 382 383 func (t int32Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 384 return encoding.EncodeInt32(dst, src, enc) 385 } 386 387 func (t int32Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 388 return encoding.DecodeInt32(dst, src, enc) 389 } 390 391 func (t int32Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 392 return t.EstimateSize(numValues) 393 } 394 395 func (t int32Type) AssignValue(dst reflect.Value, src Value) error { 396 v := src.int32() 397 switch dst.Kind() { 398 case reflect.Int8, reflect.Int16, reflect.Int32: 399 dst.SetInt(int64(v)) 400 case reflect.Uint8, reflect.Uint16, reflect.Uint32: 401 dst.SetUint(uint64(v)) 402 default: 403 dst.Set(reflect.ValueOf(v)) 404 } 405 return nil 406 } 407 408 func (t int32Type) ConvertValue(val Value, typ Type) (Value, error) { 409 switch typ.(type) { 410 case *stringType: 411 return convertStringToInt32(val) 412 } 413 switch typ.Kind() { 414 case Boolean: 415 return convertBooleanToInt32(val) 416 case Int32: 417 return val, nil 418 case Int64: 419 return convertInt64ToInt32(val) 420 case Int96: 421 return convertInt96ToInt32(val) 422 case Float: 423 return convertFloatToInt32(val) 424 case Double: 425 return convertDoubleToInt32(val) 426 case ByteArray, FixedLenByteArray: 427 return convertByteArrayToInt32(val) 428 default: 429 return makeValueKind(Int32), nil 430 } 431 } 432 433 type int64Type struct{} 434 435 func (t int64Type) String() string { return "INT64" } 436 func (t int64Type) Kind() Kind { return Int64 } 437 func (t int64Type) Length() int { return 64 } 438 func (t int64Type) EstimateSize(n int) int { return 8 * n } 439 func (t int64Type) EstimateNumValues(n int) int { return n / 8 } 440 func (t int64Type) Compare(a, b Value) int { return compareInt64(a.int64(), b.int64()) } 441 func (t int64Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 442 func (t int64Type) LogicalType() *format.LogicalType { return nil } 443 func (t int64Type) ConvertedType() *deprecated.ConvertedType { return nil } 444 func (t int64Type) PhysicalType() *format.Type { return &physicalTypes[Int64] } 445 446 func (t int64Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { 447 return newInt64ColumnIndexer() 448 } 449 450 func (t int64Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 451 return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 452 } 453 454 func (t int64Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 455 return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 456 } 457 458 func (t int64Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { 459 return newInt64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 460 } 461 462 func (t int64Type) NewValues(values []byte, _ []uint32) encoding.Values { 463 return encoding.Int64ValuesFromBytes(values) 464 } 465 466 func (t int64Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 467 return encoding.EncodeInt64(dst, src, enc) 468 } 469 470 func (t int64Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 471 return encoding.DecodeInt64(dst, src, enc) 472 } 473 474 func (t int64Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 475 return t.EstimateSize(numValues) 476 } 477 478 func (t int64Type) AssignValue(dst reflect.Value, src Value) error { 479 v := src.int64() 480 switch dst.Kind() { 481 case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int: 482 dst.SetInt(v) 483 case reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint, reflect.Uintptr: 484 dst.SetUint(uint64(v)) 485 default: 486 dst.Set(reflect.ValueOf(v)) 487 } 488 return nil 489 } 490 491 func (t int64Type) ConvertValue(val Value, typ Type) (Value, error) { 492 switch typ.(type) { 493 case *stringType: 494 return convertStringToInt64(val) 495 } 496 switch typ.Kind() { 497 case Boolean: 498 return convertBooleanToInt64(val) 499 case Int32: 500 return convertInt32ToInt64(val) 501 case Int64: 502 return val, nil 503 case Int96: 504 return convertInt96ToInt64(val) 505 case Float: 506 return convertFloatToInt64(val) 507 case Double: 508 return convertDoubleToInt64(val) 509 case ByteArray, FixedLenByteArray: 510 return convertByteArrayToInt64(val) 511 default: 512 return makeValueKind(Int64), nil 513 } 514 } 515 516 type int96Type struct{} 517 518 func (t int96Type) String() string { return "INT96" } 519 520 func (t int96Type) Kind() Kind { return Int96 } 521 func (t int96Type) Length() int { return 96 } 522 func (t int96Type) EstimateSize(n int) int { return 12 * n } 523 func (t int96Type) EstimateNumValues(n int) int { return n / 12 } 524 func (t int96Type) Compare(a, b Value) int { return compareInt96(a.int96(), b.int96()) } 525 func (t int96Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 526 func (t int96Type) LogicalType() *format.LogicalType { return nil } 527 func (t int96Type) ConvertedType() *deprecated.ConvertedType { return nil } 528 func (t int96Type) PhysicalType() *format.Type { return &physicalTypes[Int96] } 529 530 func (t int96Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { 531 return newInt96ColumnIndexer() 532 } 533 534 func (t int96Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 535 return newInt96ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 536 } 537 538 func (t int96Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 539 return newInt96Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 540 } 541 542 func (t int96Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { 543 return newInt96Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 544 } 545 546 func (t int96Type) NewValues(values []byte, _ []uint32) encoding.Values { 547 return encoding.Int96ValuesFromBytes(values) 548 } 549 550 func (t int96Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 551 return encoding.EncodeInt96(dst, src, enc) 552 } 553 554 func (t int96Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 555 return encoding.DecodeInt96(dst, src, enc) 556 } 557 558 func (t int96Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 559 return t.EstimateSize(numValues) 560 } 561 562 func (t int96Type) AssignValue(dst reflect.Value, src Value) error { 563 v := src.Int96() 564 dst.Set(reflect.ValueOf(v)) 565 return nil 566 } 567 568 func (t int96Type) ConvertValue(val Value, typ Type) (Value, error) { 569 switch typ.(type) { 570 case *stringType: 571 return convertStringToInt96(val) 572 } 573 switch typ.Kind() { 574 case Boolean: 575 return convertBooleanToInt96(val) 576 case Int32: 577 return convertInt32ToInt96(val) 578 case Int64: 579 return convertInt64ToInt96(val) 580 case Int96: 581 return val, nil 582 case Float: 583 return convertFloatToInt96(val) 584 case Double: 585 return convertDoubleToInt96(val) 586 case ByteArray, FixedLenByteArray: 587 return convertByteArrayToInt96(val) 588 default: 589 return makeValueKind(Int96), nil 590 } 591 } 592 593 type floatType struct{} 594 595 func (t floatType) String() string { return "FLOAT" } 596 func (t floatType) Kind() Kind { return Float } 597 func (t floatType) Length() int { return 32 } 598 func (t floatType) EstimateSize(n int) int { return 4 * n } 599 func (t floatType) EstimateNumValues(n int) int { return n / 4 } 600 func (t floatType) Compare(a, b Value) int { return compareFloat32(a.float(), b.float()) } 601 func (t floatType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 602 func (t floatType) LogicalType() *format.LogicalType { return nil } 603 func (t floatType) ConvertedType() *deprecated.ConvertedType { return nil } 604 func (t floatType) PhysicalType() *format.Type { return &physicalTypes[Float] } 605 606 func (t floatType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 607 return newFloatColumnIndexer() 608 } 609 610 func (t floatType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 611 return newFloatColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 612 } 613 614 func (t floatType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 615 return newFloatDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 616 } 617 618 func (t floatType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 619 return newFloatPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 620 } 621 622 func (t floatType) NewValues(values []byte, _ []uint32) encoding.Values { 623 return encoding.FloatValuesFromBytes(values) 624 } 625 626 func (t floatType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 627 return encoding.EncodeFloat(dst, src, enc) 628 } 629 630 func (t floatType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 631 return encoding.DecodeFloat(dst, src, enc) 632 } 633 634 func (t floatType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 635 return t.EstimateSize(numValues) 636 } 637 638 func (t floatType) AssignValue(dst reflect.Value, src Value) error { 639 v := src.float() 640 switch dst.Kind() { 641 case reflect.Float32, reflect.Float64: 642 dst.SetFloat(float64(v)) 643 default: 644 dst.Set(reflect.ValueOf(v)) 645 } 646 return nil 647 } 648 649 func (t floatType) ConvertValue(val Value, typ Type) (Value, error) { 650 switch typ.(type) { 651 case *stringType: 652 return convertStringToFloat(val) 653 } 654 switch typ.Kind() { 655 case Boolean: 656 return convertBooleanToFloat(val) 657 case Int32: 658 return convertInt32ToFloat(val) 659 case Int64: 660 return convertInt64ToFloat(val) 661 case Int96: 662 return convertInt96ToFloat(val) 663 case Float: 664 return val, nil 665 case Double: 666 return convertDoubleToFloat(val) 667 case ByteArray, FixedLenByteArray: 668 return convertByteArrayToFloat(val) 669 default: 670 return makeValueKind(Float), nil 671 } 672 } 673 674 type doubleType struct{} 675 676 func (t doubleType) String() string { return "DOUBLE" } 677 func (t doubleType) Kind() Kind { return Double } 678 func (t doubleType) Length() int { return 64 } 679 func (t doubleType) EstimateSize(n int) int { return 8 * n } 680 func (t doubleType) EstimateNumValues(n int) int { return n / 8 } 681 func (t doubleType) Compare(a, b Value) int { return compareFloat64(a.double(), b.double()) } 682 func (t doubleType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 683 func (t doubleType) LogicalType() *format.LogicalType { return nil } 684 func (t doubleType) ConvertedType() *deprecated.ConvertedType { return nil } 685 func (t doubleType) PhysicalType() *format.Type { return &physicalTypes[Double] } 686 687 func (t doubleType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 688 return newDoubleColumnIndexer() 689 } 690 691 func (t doubleType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 692 return newDoubleColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 693 } 694 695 func (t doubleType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 696 return newDoubleDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 697 } 698 699 func (t doubleType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 700 return newDoublePage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 701 } 702 703 func (t doubleType) NewValues(values []byte, _ []uint32) encoding.Values { 704 return encoding.DoubleValuesFromBytes(values) 705 } 706 707 func (t doubleType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 708 return encoding.EncodeDouble(dst, src, enc) 709 } 710 711 func (t doubleType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 712 return encoding.DecodeDouble(dst, src, enc) 713 } 714 715 func (t doubleType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 716 return t.EstimateSize(numValues) 717 } 718 719 func (t doubleType) AssignValue(dst reflect.Value, src Value) error { 720 v := src.double() 721 switch dst.Kind() { 722 case reflect.Float32, reflect.Float64: 723 dst.SetFloat(v) 724 default: 725 dst.Set(reflect.ValueOf(v)) 726 } 727 return nil 728 } 729 730 func (t doubleType) ConvertValue(val Value, typ Type) (Value, error) { 731 switch typ.(type) { 732 case *stringType: 733 return convertStringToDouble(val) 734 } 735 switch typ.Kind() { 736 case Boolean: 737 return convertBooleanToDouble(val) 738 case Int32: 739 return convertInt32ToDouble(val) 740 case Int64: 741 return convertInt64ToDouble(val) 742 case Int96: 743 return convertInt96ToDouble(val) 744 case Float: 745 return convertFloatToDouble(val) 746 case Double: 747 return val, nil 748 case ByteArray, FixedLenByteArray: 749 return convertByteArrayToDouble(val) 750 default: 751 return makeValueKind(Double), nil 752 } 753 } 754 755 type byteArrayType struct{} 756 757 func (t byteArrayType) String() string { return "BYTE_ARRAY" } 758 func (t byteArrayType) Kind() Kind { return ByteArray } 759 func (t byteArrayType) Length() int { return 0 } 760 func (t byteArrayType) EstimateSize(n int) int { return estimatedSizeOfByteArrayValues * n } 761 func (t byteArrayType) EstimateNumValues(n int) int { return n / estimatedSizeOfByteArrayValues } 762 func (t byteArrayType) Compare(a, b Value) int { return bytes.Compare(a.byteArray(), b.byteArray()) } 763 func (t byteArrayType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 764 func (t byteArrayType) LogicalType() *format.LogicalType { return nil } 765 func (t byteArrayType) ConvertedType() *deprecated.ConvertedType { return nil } 766 func (t byteArrayType) PhysicalType() *format.Type { return &physicalTypes[ByteArray] } 767 768 func (t byteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 769 return newByteArrayColumnIndexer(sizeLimit) 770 } 771 772 func (t byteArrayType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 773 return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 774 } 775 776 func (t byteArrayType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 777 return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 778 } 779 780 func (t byteArrayType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 781 return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 782 } 783 784 func (t byteArrayType) NewValues(values []byte, offsets []uint32) encoding.Values { 785 return encoding.ByteArrayValues(values, offsets) 786 } 787 788 func (t byteArrayType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 789 return encoding.EncodeByteArray(dst, src, enc) 790 } 791 792 func (t byteArrayType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 793 return encoding.DecodeByteArray(dst, src, enc) 794 } 795 796 func (t byteArrayType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 797 return enc.EstimateDecodeByteArraySize(src) 798 } 799 800 func (t byteArrayType) AssignValue(dst reflect.Value, src Value) error { 801 v := src.byteArray() 802 switch dst.Kind() { 803 case reflect.String: 804 dst.SetString(string(v)) 805 case reflect.Slice: 806 dst.SetBytes(copyBytes(v)) 807 default: 808 val := reflect.ValueOf(string(v)) 809 dst.Set(val) 810 } 811 return nil 812 } 813 814 func (t byteArrayType) ConvertValue(val Value, typ Type) (Value, error) { 815 switch typ.Kind() { 816 case Boolean: 817 return convertBooleanToByteArray(val) 818 case Int32: 819 return convertInt32ToByteArray(val) 820 case Int64: 821 return convertInt64ToByteArray(val) 822 case Int96: 823 return convertInt96ToByteArray(val) 824 case Float: 825 return convertFloatToByteArray(val) 826 case Double: 827 return convertDoubleToByteArray(val) 828 case ByteArray, FixedLenByteArray: 829 return val, nil 830 default: 831 return makeValueKind(ByteArray), nil 832 } 833 } 834 835 type fixedLenByteArrayType struct{ length int } 836 837 func (t fixedLenByteArrayType) String() string { 838 return fmt.Sprintf("FIXED_LEN_BYTE_ARRAY(%d)", t.length) 839 } 840 841 func (t fixedLenByteArrayType) Kind() Kind { return FixedLenByteArray } 842 843 func (t fixedLenByteArrayType) Length() int { return t.length } 844 845 func (t fixedLenByteArrayType) EstimateSize(n int) int { return t.length * n } 846 847 func (t fixedLenByteArrayType) EstimateNumValues(n int) int { return n / t.length } 848 849 func (t fixedLenByteArrayType) Compare(a, b Value) int { 850 return bytes.Compare(a.byteArray(), b.byteArray()) 851 } 852 853 func (t fixedLenByteArrayType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 854 855 func (t fixedLenByteArrayType) LogicalType() *format.LogicalType { return nil } 856 857 func (t fixedLenByteArrayType) ConvertedType() *deprecated.ConvertedType { return nil } 858 859 func (t fixedLenByteArrayType) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] } 860 861 func (t fixedLenByteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 862 return newFixedLenByteArrayColumnIndexer(t.length, sizeLimit) 863 } 864 865 func (t fixedLenByteArrayType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 866 return newFixedLenByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 867 } 868 869 func (t fixedLenByteArrayType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 870 return newFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 871 } 872 873 func (t fixedLenByteArrayType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 874 return newFixedLenByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 875 } 876 877 func (t fixedLenByteArrayType) NewValues(values []byte, _ []uint32) encoding.Values { 878 return encoding.FixedLenByteArrayValues(values, t.length) 879 } 880 881 func (t fixedLenByteArrayType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 882 return encoding.EncodeFixedLenByteArray(dst, src, enc) 883 } 884 885 func (t fixedLenByteArrayType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 886 return encoding.DecodeFixedLenByteArray(dst, src, enc) 887 } 888 889 func (t fixedLenByteArrayType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 890 return t.EstimateSize(numValues) 891 } 892 893 func (t fixedLenByteArrayType) AssignValue(dst reflect.Value, src Value) error { 894 v := src.byteArray() 895 switch dst.Kind() { 896 case reflect.Array: 897 if dst.Type().Elem().Kind() == reflect.Uint8 && dst.Len() == len(v) { 898 // This code could be implemented as a call to reflect.Copy but 899 // it would require creating a reflect.Value from v which causes 900 // the heap allocation to pack the []byte value. To avoid this 901 // overhead we instead convert the reflect.Value holding the 902 // destination array into a byte slice which allows us to use 903 // a more efficient call to copy. 904 d := unsafe.Slice((*byte)(unsafecast.PointerOfValue(dst)), len(v)) 905 copy(d, v) 906 return nil 907 } 908 case reflect.Slice: 909 dst.SetBytes(copyBytes(v)) 910 return nil 911 } 912 913 val := reflect.ValueOf(copyBytes(v)) 914 dst.Set(val) 915 return nil 916 } 917 918 func (t fixedLenByteArrayType) ConvertValue(val Value, typ Type) (Value, error) { 919 switch typ.(type) { 920 case *stringType: 921 return convertStringToFixedLenByteArray(val, t.length) 922 } 923 switch typ.Kind() { 924 case Boolean: 925 return convertBooleanToFixedLenByteArray(val, t.length) 926 case Int32: 927 return convertInt32ToFixedLenByteArray(val, t.length) 928 case Int64: 929 return convertInt64ToFixedLenByteArray(val, t.length) 930 case Int96: 931 return convertInt96ToFixedLenByteArray(val, t.length) 932 case Float: 933 return convertFloatToFixedLenByteArray(val, t.length) 934 case Double: 935 return convertDoubleToFixedLenByteArray(val, t.length) 936 case ByteArray, FixedLenByteArray: 937 return convertByteArrayToFixedLenByteArray(val, t.length) 938 default: 939 return makeValueBytes(FixedLenByteArray, make([]byte, t.length)), nil 940 } 941 } 942 943 type uint32Type struct{ int32Type } 944 945 func (t uint32Type) Compare(a, b Value) int { 946 return compareUint32(a.uint32(), b.uint32()) 947 } 948 949 func (t uint32Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { 950 return newUint32ColumnIndexer() 951 } 952 953 func (t uint32Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 954 return newUint32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 955 } 956 957 func (t uint32Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 958 return newUint32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 959 } 960 961 func (t uint32Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { 962 return newUint32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 963 } 964 965 type uint64Type struct{ int64Type } 966 967 func (t uint64Type) Compare(a, b Value) int { 968 return compareUint64(a.uint64(), b.uint64()) 969 } 970 971 func (t uint64Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { 972 return newUint64ColumnIndexer() 973 } 974 975 func (t uint64Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 976 return newUint64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 977 } 978 979 func (t uint64Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 980 return newUint64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 981 } 982 983 func (t uint64Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { 984 return newUint64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 985 } 986 987 // BE128 stands for "big-endian 128 bits". This type is used as a special case 988 // for fixed-length byte arrays of 16 bytes, which are commonly used to 989 // represent columns of random unique identifiers such as UUIDs. 990 // 991 // Comparisons of BE128 values use the natural byte order, the zeroth byte is 992 // the most significant byte. 993 // 994 // The special case is intended to provide optimizations based on the knowledge 995 // that the values are 16 bytes long. Stronger type checking can also be applied 996 // by the compiler when using [16]byte values rather than []byte, reducing the 997 // risk of errors on these common code paths. 998 type be128Type struct{} 999 1000 func (t be128Type) String() string { return "FIXED_LEN_BYTE_ARRAY(16)" } 1001 1002 func (t be128Type) Kind() Kind { return FixedLenByteArray } 1003 1004 func (t be128Type) Length() int { return 16 } 1005 1006 func (t be128Type) EstimateSize(n int) int { return 16 * n } 1007 1008 func (t be128Type) EstimateNumValues(n int) int { return n / 16 } 1009 1010 func (t be128Type) Compare(a, b Value) int { return compareBE128(a.be128(), b.be128()) } 1011 1012 func (t be128Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 1013 1014 func (t be128Type) LogicalType() *format.LogicalType { return nil } 1015 1016 func (t be128Type) ConvertedType() *deprecated.ConvertedType { return nil } 1017 1018 func (t be128Type) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] } 1019 1020 func (t be128Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1021 return newBE128ColumnIndexer() 1022 } 1023 1024 func (t be128Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1025 return newBE128ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 1026 } 1027 1028 func (t be128Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 1029 return newBE128Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1030 } 1031 1032 func (t be128Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { 1033 return newBE128Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1034 } 1035 1036 func (t be128Type) NewValues(values []byte, _ []uint32) encoding.Values { 1037 return encoding.FixedLenByteArrayValues(values, 16) 1038 } 1039 1040 func (t be128Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 1041 return encoding.EncodeFixedLenByteArray(dst, src, enc) 1042 } 1043 1044 func (t be128Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 1045 return encoding.DecodeFixedLenByteArray(dst, src, enc) 1046 } 1047 1048 func (t be128Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 1049 return t.EstimateSize(numValues) 1050 } 1051 1052 func (t be128Type) AssignValue(dst reflect.Value, src Value) error { 1053 return fixedLenByteArrayType{length: 16}.AssignValue(dst, src) 1054 } 1055 1056 func (t be128Type) ConvertValue(val Value, typ Type) (Value, error) { 1057 return fixedLenByteArrayType{length: 16}.ConvertValue(val, typ) 1058 } 1059 1060 // FixedLenByteArrayType constructs a type for fixed-length values of the given 1061 // size (in bytes). 1062 func FixedLenByteArrayType(length int) Type { 1063 switch length { 1064 case 16: 1065 return be128Type{} 1066 default: 1067 return fixedLenByteArrayType{length: length} 1068 } 1069 } 1070 1071 // Int constructs a leaf node of signed integer logical type of the given bit 1072 // width. 1073 // 1074 // The bit width must be one of 8, 16, 32, 64, or the function will panic. 1075 func Int(bitWidth int) Node { 1076 return Leaf(integerType(bitWidth, &signedIntTypes)) 1077 } 1078 1079 // Uint constructs a leaf node of unsigned integer logical type of the given 1080 // bit width. 1081 // 1082 // The bit width must be one of 8, 16, 32, 64, or the function will panic. 1083 func Uint(bitWidth int) Node { 1084 return Leaf(integerType(bitWidth, &unsignedIntTypes)) 1085 } 1086 1087 func integerType(bitWidth int, types *[4]intType) *intType { 1088 switch bitWidth { 1089 case 8: 1090 return &types[0] 1091 case 16: 1092 return &types[1] 1093 case 32: 1094 return &types[2] 1095 case 64: 1096 return &types[3] 1097 default: 1098 panic(fmt.Sprintf("cannot create a %d bits parquet integer node", bitWidth)) 1099 } 1100 } 1101 1102 var signedIntTypes = [...]intType{ 1103 {BitWidth: 8, IsSigned: true}, 1104 {BitWidth: 16, IsSigned: true}, 1105 {BitWidth: 32, IsSigned: true}, 1106 {BitWidth: 64, IsSigned: true}, 1107 } 1108 1109 var unsignedIntTypes = [...]intType{ 1110 {BitWidth: 8, IsSigned: false}, 1111 {BitWidth: 16, IsSigned: false}, 1112 {BitWidth: 32, IsSigned: false}, 1113 {BitWidth: 64, IsSigned: false}, 1114 } 1115 1116 type intType format.IntType 1117 1118 func (t *intType) baseType() Type { 1119 if t.IsSigned { 1120 if t.BitWidth == 64 { 1121 return int64Type{} 1122 } else { 1123 return int32Type{} 1124 } 1125 } else { 1126 if t.BitWidth == 64 { 1127 return uint64Type{} 1128 } else { 1129 return uint32Type{} 1130 } 1131 } 1132 } 1133 1134 func (t *intType) String() string { return (*format.IntType)(t).String() } 1135 1136 func (t *intType) Kind() Kind { return t.baseType().Kind() } 1137 1138 func (t *intType) Length() int { return int(t.BitWidth) } 1139 1140 func (t *intType) EstimateSize(n int) int { return (int(t.BitWidth) / 8) * n } 1141 1142 func (t *intType) EstimateNumValues(n int) int { return n / (int(t.BitWidth) / 8) } 1143 1144 func (t *intType) Compare(a, b Value) int { 1145 // This code is similar to t.baseType().Compare(a,b) but comparison methods 1146 // tend to be invoked a lot (e.g. when sorting) so avoiding the interface 1147 // indirection in this case yields much better throughput in some cases. 1148 if t.BitWidth == 64 { 1149 i1 := a.int64() 1150 i2 := b.int64() 1151 if t.IsSigned { 1152 return compareInt64(i1, i2) 1153 } else { 1154 return compareUint64(uint64(i1), uint64(i2)) 1155 } 1156 } else { 1157 i1 := a.int32() 1158 i2 := b.int32() 1159 if t.IsSigned { 1160 return compareInt32(i1, i2) 1161 } else { 1162 return compareUint32(uint32(i1), uint32(i2)) 1163 } 1164 } 1165 } 1166 1167 func (t *intType) ColumnOrder() *format.ColumnOrder { return t.baseType().ColumnOrder() } 1168 1169 func (t *intType) PhysicalType() *format.Type { return t.baseType().PhysicalType() } 1170 1171 func (t *intType) LogicalType() *format.LogicalType { 1172 return &format.LogicalType{Integer: (*format.IntType)(t)} 1173 } 1174 1175 func (t *intType) ConvertedType() *deprecated.ConvertedType { 1176 convertedType := bits.Len8(uint8(t.BitWidth)/8) - 1 // 8=>0, 16=>1, 32=>2, 64=>4 1177 if t.IsSigned { 1178 convertedType += int(deprecated.Int8) 1179 } else { 1180 convertedType += int(deprecated.Uint8) 1181 } 1182 return &convertedTypes[convertedType] 1183 } 1184 1185 func (t *intType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1186 return t.baseType().NewColumnIndexer(sizeLimit) 1187 } 1188 1189 func (t *intType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1190 return t.baseType().NewColumnBuffer(columnIndex, numValues) 1191 } 1192 1193 func (t *intType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 1194 return t.baseType().NewDictionary(columnIndex, numValues, data) 1195 } 1196 1197 func (t *intType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 1198 return t.baseType().NewPage(columnIndex, numValues, data) 1199 } 1200 1201 func (t *intType) NewValues(values []byte, offsets []uint32) encoding.Values { 1202 return t.baseType().NewValues(values, offsets) 1203 } 1204 1205 func (t *intType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 1206 return t.baseType().Encode(dst, src, enc) 1207 } 1208 1209 func (t *intType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 1210 return t.baseType().Decode(dst, src, enc) 1211 } 1212 1213 func (t *intType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 1214 return t.baseType().EstimateDecodeSize(numValues, src, enc) 1215 } 1216 1217 func (t *intType) AssignValue(dst reflect.Value, src Value) error { 1218 if t.BitWidth == 64 { 1219 return int64Type{}.AssignValue(dst, src) 1220 } else { 1221 return int32Type{}.AssignValue(dst, src) 1222 } 1223 } 1224 1225 func (t *intType) ConvertValue(val Value, typ Type) (Value, error) { 1226 if t.BitWidth == 64 { 1227 return int64Type{}.ConvertValue(val, typ) 1228 } else { 1229 return int32Type{}.ConvertValue(val, typ) 1230 } 1231 } 1232 1233 // Decimal constructs a leaf node of decimal logical type with the given 1234 // scale, precision, and underlying type. 1235 // 1236 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal 1237 func Decimal(scale, precision int, typ Type) Node { 1238 switch typ.Kind() { 1239 case Int32, Int64, FixedLenByteArray: 1240 default: 1241 panic("DECIMAL node must annotate Int32, Int64 or FixedLenByteArray but got " + typ.String()) 1242 } 1243 return Leaf(&decimalType{ 1244 decimal: format.DecimalType{ 1245 Scale: int32(scale), 1246 Precision: int32(precision), 1247 }, 1248 Type: typ, 1249 }) 1250 } 1251 1252 type decimalType struct { 1253 decimal format.DecimalType 1254 Type 1255 } 1256 1257 func (t *decimalType) String() string { return t.decimal.String() } 1258 1259 func (t *decimalType) LogicalType() *format.LogicalType { 1260 return &format.LogicalType{Decimal: &t.decimal} 1261 } 1262 1263 func (t *decimalType) ConvertedType() *deprecated.ConvertedType { 1264 return &convertedTypes[deprecated.Decimal] 1265 } 1266 1267 // String constructs a leaf node of UTF8 logical type. 1268 // 1269 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#string 1270 func String() Node { return Leaf(&stringType{}) } 1271 1272 type stringType format.StringType 1273 1274 func (t *stringType) String() string { return (*format.StringType)(t).String() } 1275 1276 func (t *stringType) Kind() Kind { return ByteArray } 1277 1278 func (t *stringType) Length() int { return 0 } 1279 1280 func (t *stringType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) } 1281 1282 func (t *stringType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) } 1283 1284 func (t *stringType) Compare(a, b Value) int { 1285 return bytes.Compare(a.byteArray(), b.byteArray()) 1286 } 1287 1288 func (t *stringType) ColumnOrder() *format.ColumnOrder { 1289 return &typeDefinedColumnOrder 1290 } 1291 1292 func (t *stringType) PhysicalType() *format.Type { 1293 return &physicalTypes[ByteArray] 1294 } 1295 1296 func (t *stringType) LogicalType() *format.LogicalType { 1297 return &format.LogicalType{UTF8: (*format.StringType)(t)} 1298 } 1299 1300 func (t *stringType) ConvertedType() *deprecated.ConvertedType { 1301 return &convertedTypes[deprecated.UTF8] 1302 } 1303 1304 func (t *stringType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1305 return newByteArrayColumnIndexer(sizeLimit) 1306 } 1307 1308 func (t *stringType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 1309 return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1310 } 1311 1312 func (t *stringType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1313 return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 1314 } 1315 1316 func (t *stringType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 1317 return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1318 } 1319 1320 func (t *stringType) NewValues(values []byte, offsets []uint32) encoding.Values { 1321 return encoding.ByteArrayValues(values, offsets) 1322 } 1323 1324 func (t *stringType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 1325 return encoding.EncodeByteArray(dst, src, enc) 1326 } 1327 1328 func (t *stringType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 1329 return encoding.DecodeByteArray(dst, src, enc) 1330 } 1331 1332 func (t *stringType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 1333 return byteArrayType{}.EstimateDecodeSize(numValues, src, enc) 1334 } 1335 1336 func (t *stringType) AssignValue(dst reflect.Value, src Value) error { 1337 return byteArrayType{}.AssignValue(dst, src) 1338 } 1339 1340 func (t *stringType) ConvertValue(val Value, typ Type) (Value, error) { 1341 switch t2 := typ.(type) { 1342 case *dateType: 1343 return convertDateToString(val) 1344 case *timeType: 1345 tz := t2.tz() 1346 if t2.Unit.Micros != nil { 1347 return convertTimeMicrosToString(val, tz) 1348 } else { 1349 return convertTimeMillisToString(val, tz) 1350 } 1351 } 1352 switch typ.Kind() { 1353 case Boolean: 1354 return convertBooleanToString(val) 1355 case Int32: 1356 return convertInt32ToString(val) 1357 case Int64: 1358 return convertInt64ToString(val) 1359 case Int96: 1360 return convertInt96ToString(val) 1361 case Float: 1362 return convertFloatToString(val) 1363 case Double: 1364 return convertDoubleToString(val) 1365 case ByteArray: 1366 return val, nil 1367 case FixedLenByteArray: 1368 return convertFixedLenByteArrayToString(val) 1369 default: 1370 return makeValueKind(ByteArray), nil 1371 } 1372 } 1373 1374 // UUID constructs a leaf node of UUID logical type. 1375 // 1376 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#uuid 1377 func UUID() Node { return Leaf(&uuidType{}) } 1378 1379 type uuidType format.UUIDType 1380 1381 func (t *uuidType) String() string { return (*format.UUIDType)(t).String() } 1382 1383 func (t *uuidType) Kind() Kind { return be128Type{}.Kind() } 1384 1385 func (t *uuidType) Length() int { return be128Type{}.Length() } 1386 1387 func (t *uuidType) EstimateSize(n int) int { return be128Type{}.EstimateSize(n) } 1388 1389 func (t *uuidType) EstimateNumValues(n int) int { return be128Type{}.EstimateNumValues(n) } 1390 1391 func (t *uuidType) Compare(a, b Value) int { return be128Type{}.Compare(a, b) } 1392 1393 func (t *uuidType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 1394 1395 func (t *uuidType) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] } 1396 1397 func (t *uuidType) LogicalType() *format.LogicalType { 1398 return &format.LogicalType{UUID: (*format.UUIDType)(t)} 1399 } 1400 1401 func (t *uuidType) ConvertedType() *deprecated.ConvertedType { return nil } 1402 1403 func (t *uuidType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1404 return be128Type{}.NewColumnIndexer(sizeLimit) 1405 } 1406 1407 func (t *uuidType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 1408 return be128Type{}.NewDictionary(columnIndex, numValues, data) 1409 } 1410 1411 func (t *uuidType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1412 return be128Type{}.NewColumnBuffer(columnIndex, numValues) 1413 } 1414 1415 func (t *uuidType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 1416 return be128Type{}.NewPage(columnIndex, numValues, data) 1417 } 1418 1419 func (t *uuidType) NewValues(values []byte, offsets []uint32) encoding.Values { 1420 return be128Type{}.NewValues(values, offsets) 1421 } 1422 1423 func (t *uuidType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 1424 return be128Type{}.Encode(dst, src, enc) 1425 } 1426 1427 func (t *uuidType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 1428 return be128Type{}.Decode(dst, src, enc) 1429 } 1430 1431 func (t *uuidType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 1432 return be128Type{}.EstimateDecodeSize(numValues, src, enc) 1433 } 1434 1435 func (t *uuidType) AssignValue(dst reflect.Value, src Value) error { 1436 return be128Type{}.AssignValue(dst, src) 1437 } 1438 1439 func (t *uuidType) ConvertValue(val Value, typ Type) (Value, error) { 1440 return be128Type{}.ConvertValue(val, typ) 1441 } 1442 1443 // Enum constructs a leaf node with a logical type representing enumerations. 1444 // 1445 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#enum 1446 func Enum() Node { return Leaf(&enumType{}) } 1447 1448 type enumType format.EnumType 1449 1450 func (t *enumType) String() string { return (*format.EnumType)(t).String() } 1451 1452 func (t *enumType) Kind() Kind { return new(stringType).Kind() } 1453 1454 func (t *enumType) Length() int { return new(stringType).Length() } 1455 1456 func (t *enumType) EstimateSize(n int) int { return new(stringType).EstimateSize(n) } 1457 1458 func (t *enumType) EstimateNumValues(n int) int { return new(stringType).EstimateNumValues(n) } 1459 1460 func (t *enumType) Compare(a, b Value) int { return new(stringType).Compare(a, b) } 1461 1462 func (t *enumType) ColumnOrder() *format.ColumnOrder { return new(stringType).ColumnOrder() } 1463 1464 func (t *enumType) PhysicalType() *format.Type { return new(stringType).PhysicalType() } 1465 1466 func (t *enumType) LogicalType() *format.LogicalType { 1467 return &format.LogicalType{Enum: (*format.EnumType)(t)} 1468 } 1469 1470 func (t *enumType) ConvertedType() *deprecated.ConvertedType { 1471 return &convertedTypes[deprecated.Enum] 1472 } 1473 1474 func (t *enumType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1475 return new(stringType).NewColumnIndexer(sizeLimit) 1476 } 1477 1478 func (t *enumType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 1479 return new(stringType).NewDictionary(columnIndex, numValues, data) 1480 } 1481 1482 func (t *enumType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1483 return new(stringType).NewColumnBuffer(columnIndex, numValues) 1484 } 1485 1486 func (t *enumType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 1487 return new(stringType).NewPage(columnIndex, numValues, data) 1488 } 1489 1490 func (t *enumType) NewValues(values []byte, offsets []uint32) encoding.Values { 1491 return new(stringType).NewValues(values, offsets) 1492 } 1493 1494 func (t *enumType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 1495 return new(stringType).Encode(dst, src, enc) 1496 } 1497 1498 func (t *enumType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 1499 return new(stringType).Decode(dst, src, enc) 1500 } 1501 1502 func (t *enumType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 1503 return new(stringType).EstimateDecodeSize(numValues, src, enc) 1504 } 1505 1506 func (t *enumType) AssignValue(dst reflect.Value, src Value) error { 1507 return new(stringType).AssignValue(dst, src) 1508 } 1509 1510 func (t *enumType) ConvertValue(val Value, typ Type) (Value, error) { 1511 switch typ.(type) { 1512 case *byteArrayType, *stringType, *enumType: 1513 return val, nil 1514 default: 1515 return val, invalidConversion(val, "ENUM", typ.String()) 1516 } 1517 } 1518 1519 // JSON constructs a leaf node of JSON logical type. 1520 // 1521 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#json 1522 func JSON() Node { return Leaf(&jsonType{}) } 1523 1524 type jsonType format.JsonType 1525 1526 func (t *jsonType) String() string { return (*format.JsonType)(t).String() } 1527 1528 func (t *jsonType) Kind() Kind { return byteArrayType{}.Kind() } 1529 1530 func (t *jsonType) Length() int { return byteArrayType{}.Length() } 1531 1532 func (t *jsonType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) } 1533 1534 func (t *jsonType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) } 1535 1536 func (t *jsonType) Compare(a, b Value) int { return byteArrayType{}.Compare(a, b) } 1537 1538 func (t *jsonType) ColumnOrder() *format.ColumnOrder { return byteArrayType{}.ColumnOrder() } 1539 1540 func (t *jsonType) PhysicalType() *format.Type { return byteArrayType{}.PhysicalType() } 1541 1542 func (t *jsonType) LogicalType() *format.LogicalType { 1543 return &format.LogicalType{Json: (*format.JsonType)(t)} 1544 } 1545 1546 func (t *jsonType) ConvertedType() *deprecated.ConvertedType { 1547 return &convertedTypes[deprecated.Json] 1548 } 1549 1550 func (t *jsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1551 return byteArrayType{}.NewColumnIndexer(sizeLimit) 1552 } 1553 1554 func (t *jsonType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 1555 return byteArrayType{}.NewDictionary(columnIndex, numValues, data) 1556 } 1557 1558 func (t *jsonType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1559 return byteArrayType{}.NewColumnBuffer(columnIndex, numValues) 1560 } 1561 1562 func (t *jsonType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 1563 return byteArrayType{}.NewPage(columnIndex, numValues, data) 1564 } 1565 1566 func (t *jsonType) NewValues(values []byte, offsets []uint32) encoding.Values { 1567 return byteArrayType{}.NewValues(values, offsets) 1568 } 1569 1570 func (t *jsonType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 1571 return byteArrayType{}.Encode(dst, src, enc) 1572 } 1573 1574 func (t *jsonType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 1575 return byteArrayType{}.Decode(dst, src, enc) 1576 } 1577 1578 func (t *jsonType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 1579 return byteArrayType{}.EstimateDecodeSize(numValues, src, enc) 1580 } 1581 1582 func (t *jsonType) AssignValue(dst reflect.Value, src Value) error { 1583 // Assign value using ByteArrayType for BC... 1584 switch dst.Kind() { 1585 case reflect.String: 1586 return byteArrayType{}.AssignValue(dst, src) 1587 case reflect.Slice: 1588 if dst.Type().Elem().Kind() == reflect.Uint8 { 1589 return byteArrayType{}.AssignValue(dst, src) 1590 } 1591 } 1592 1593 // Otherwise handle with json.Unmarshal 1594 b := src.byteArray() 1595 val := reflect.New(dst.Type()).Elem() 1596 err := json.Unmarshal(b, val.Addr().Interface()) 1597 if err != nil { 1598 return err 1599 } 1600 dst.Set(val) 1601 return nil 1602 } 1603 1604 func (t *jsonType) ConvertValue(val Value, typ Type) (Value, error) { 1605 switch typ.(type) { 1606 case *byteArrayType, *stringType, *jsonType: 1607 return val, nil 1608 default: 1609 return val, invalidConversion(val, "JSON", typ.String()) 1610 } 1611 } 1612 1613 // BSON constructs a leaf node of BSON logical type. 1614 // 1615 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#bson 1616 func BSON() Node { return Leaf(&bsonType{}) } 1617 1618 type bsonType format.BsonType 1619 1620 func (t *bsonType) String() string { return (*format.BsonType)(t).String() } 1621 1622 func (t *bsonType) Kind() Kind { return byteArrayType{}.Kind() } 1623 1624 func (t *bsonType) Length() int { return byteArrayType{}.Length() } 1625 1626 func (t *bsonType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) } 1627 1628 func (t *bsonType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) } 1629 1630 func (t *bsonType) Compare(a, b Value) int { return byteArrayType{}.Compare(a, b) } 1631 1632 func (t *bsonType) ColumnOrder() *format.ColumnOrder { return byteArrayType{}.ColumnOrder() } 1633 1634 func (t *bsonType) PhysicalType() *format.Type { return byteArrayType{}.PhysicalType() } 1635 1636 func (t *bsonType) LogicalType() *format.LogicalType { 1637 return &format.LogicalType{Bson: (*format.BsonType)(t)} 1638 } 1639 1640 func (t *bsonType) ConvertedType() *deprecated.ConvertedType { 1641 return &convertedTypes[deprecated.Bson] 1642 } 1643 1644 func (t *bsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1645 return byteArrayType{}.NewColumnIndexer(sizeLimit) 1646 } 1647 1648 func (t *bsonType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 1649 return byteArrayType{}.NewDictionary(columnIndex, numValues, data) 1650 } 1651 1652 func (t *bsonType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1653 return byteArrayType{}.NewColumnBuffer(columnIndex, numValues) 1654 } 1655 1656 func (t *bsonType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 1657 return byteArrayType{}.NewPage(columnIndex, numValues, data) 1658 } 1659 1660 func (t *bsonType) NewValues(values []byte, offsets []uint32) encoding.Values { 1661 return byteArrayType{}.NewValues(values, offsets) 1662 } 1663 1664 func (t *bsonType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 1665 return byteArrayType{}.Encode(dst, src, enc) 1666 } 1667 1668 func (t *bsonType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 1669 return byteArrayType{}.Decode(dst, src, enc) 1670 } 1671 1672 func (t *bsonType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 1673 return byteArrayType{}.EstimateDecodeSize(numValues, src, enc) 1674 } 1675 1676 func (t *bsonType) AssignValue(dst reflect.Value, src Value) error { 1677 return byteArrayType{}.AssignValue(dst, src) 1678 } 1679 1680 func (t *bsonType) ConvertValue(val Value, typ Type) (Value, error) { 1681 switch typ.(type) { 1682 case *byteArrayType, *bsonType: 1683 return val, nil 1684 default: 1685 return val, invalidConversion(val, "BSON", typ.String()) 1686 } 1687 } 1688 1689 // Date constructs a leaf node of DATE logical type. 1690 // 1691 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date 1692 func Date() Node { return Leaf(&dateType{}) } 1693 1694 type dateType format.DateType 1695 1696 func (t *dateType) String() string { return (*format.DateType)(t).String() } 1697 1698 func (t *dateType) Kind() Kind { return int32Type{}.Kind() } 1699 1700 func (t *dateType) Length() int { return int32Type{}.Length() } 1701 1702 func (t *dateType) EstimateSize(n int) int { return int32Type{}.EstimateSize(n) } 1703 1704 func (t *dateType) EstimateNumValues(n int) int { return int32Type{}.EstimateNumValues(n) } 1705 1706 func (t *dateType) Compare(a, b Value) int { return int32Type{}.Compare(a, b) } 1707 1708 func (t *dateType) ColumnOrder() *format.ColumnOrder { return int32Type{}.ColumnOrder() } 1709 1710 func (t *dateType) PhysicalType() *format.Type { return int32Type{}.PhysicalType() } 1711 1712 func (t *dateType) LogicalType() *format.LogicalType { 1713 return &format.LogicalType{Date: (*format.DateType)(t)} 1714 } 1715 1716 func (t *dateType) ConvertedType() *deprecated.ConvertedType { 1717 return &convertedTypes[deprecated.Date] 1718 } 1719 1720 func (t *dateType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1721 return int32Type{}.NewColumnIndexer(sizeLimit) 1722 } 1723 1724 func (t *dateType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 1725 return int32Type{}.NewDictionary(columnIndex, numValues, data) 1726 } 1727 1728 func (t *dateType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1729 return int32Type{}.NewColumnBuffer(columnIndex, numValues) 1730 } 1731 1732 func (t *dateType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 1733 return int32Type{}.NewPage(columnIndex, numValues, data) 1734 } 1735 1736 func (t *dateType) NewValues(values []byte, offsets []uint32) encoding.Values { 1737 return int32Type{}.NewValues(values, offsets) 1738 } 1739 1740 func (t *dateType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 1741 return int32Type{}.Encode(dst, src, enc) 1742 } 1743 1744 func (t *dateType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 1745 return int32Type{}.Decode(dst, src, enc) 1746 } 1747 1748 func (t *dateType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 1749 return int32Type{}.EstimateDecodeSize(numValues, src, enc) 1750 } 1751 1752 func (t *dateType) AssignValue(dst reflect.Value, src Value) error { 1753 return int32Type{}.AssignValue(dst, src) 1754 } 1755 1756 func (t *dateType) ConvertValue(val Value, typ Type) (Value, error) { 1757 switch src := typ.(type) { 1758 case *stringType: 1759 return convertStringToDate(val, time.UTC) 1760 case *timestampType: 1761 return convertTimestampToDate(val, src.Unit, src.tz()) 1762 } 1763 return int32Type{}.ConvertValue(val, typ) 1764 } 1765 1766 // TimeUnit represents units of time in the parquet type system. 1767 type TimeUnit interface { 1768 // Returns the precision of the time unit as a time.Duration value. 1769 Duration() time.Duration 1770 // Converts the TimeUnit value to its representation in the parquet thrift 1771 // format. 1772 TimeUnit() format.TimeUnit 1773 } 1774 1775 var ( 1776 Millisecond TimeUnit = &millisecond{} 1777 Microsecond TimeUnit = µsecond{} 1778 Nanosecond TimeUnit = &nanosecond{} 1779 ) 1780 1781 type millisecond format.MilliSeconds 1782 1783 func (u *millisecond) Duration() time.Duration { return time.Millisecond } 1784 func (u *millisecond) TimeUnit() format.TimeUnit { 1785 return format.TimeUnit{Millis: (*format.MilliSeconds)(u)} 1786 } 1787 1788 type microsecond format.MicroSeconds 1789 1790 func (u *microsecond) Duration() time.Duration { return time.Microsecond } 1791 func (u *microsecond) TimeUnit() format.TimeUnit { 1792 return format.TimeUnit{Micros: (*format.MicroSeconds)(u)} 1793 } 1794 1795 type nanosecond format.NanoSeconds 1796 1797 func (u *nanosecond) Duration() time.Duration { return time.Nanosecond } 1798 func (u *nanosecond) TimeUnit() format.TimeUnit { 1799 return format.TimeUnit{Nanos: (*format.NanoSeconds)(u)} 1800 } 1801 1802 // Time constructs a leaf node of TIME logical type. 1803 // 1804 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time 1805 func Time(unit TimeUnit) Node { 1806 return Leaf(&timeType{IsAdjustedToUTC: true, Unit: unit.TimeUnit()}) 1807 } 1808 1809 type timeType format.TimeType 1810 1811 func (t *timeType) tz() *time.Location { 1812 if t.IsAdjustedToUTC { 1813 return time.UTC 1814 } else { 1815 return time.Local 1816 } 1817 } 1818 1819 func (t *timeType) baseType() Type { 1820 if t.useInt32() { 1821 return int32Type{} 1822 } else { 1823 return int64Type{} 1824 } 1825 } 1826 1827 func (t *timeType) useInt32() bool { return t.Unit.Millis != nil } 1828 1829 func (t *timeType) useInt64() bool { return t.Unit.Micros != nil } 1830 1831 func (t *timeType) String() string { return (*format.TimeType)(t).String() } 1832 1833 func (t *timeType) Kind() Kind { return t.baseType().Kind() } 1834 1835 func (t *timeType) Length() int { return t.baseType().Length() } 1836 1837 func (t *timeType) EstimateSize(n int) int { return t.baseType().EstimateSize(n) } 1838 1839 func (t *timeType) EstimateNumValues(n int) int { return t.baseType().EstimateNumValues(n) } 1840 1841 func (t *timeType) Compare(a, b Value) int { return t.baseType().Compare(a, b) } 1842 1843 func (t *timeType) ColumnOrder() *format.ColumnOrder { return t.baseType().ColumnOrder() } 1844 1845 func (t *timeType) PhysicalType() *format.Type { return t.baseType().PhysicalType() } 1846 1847 func (t *timeType) LogicalType() *format.LogicalType { 1848 return &format.LogicalType{Time: (*format.TimeType)(t)} 1849 } 1850 1851 func (t *timeType) ConvertedType() *deprecated.ConvertedType { 1852 switch { 1853 case t.useInt32(): 1854 return &convertedTypes[deprecated.TimeMillis] 1855 case t.useInt64(): 1856 return &convertedTypes[deprecated.TimeMicros] 1857 default: 1858 return nil 1859 } 1860 } 1861 1862 func (t *timeType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1863 return t.baseType().NewColumnIndexer(sizeLimit) 1864 } 1865 1866 func (t *timeType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1867 return t.baseType().NewColumnBuffer(columnIndex, numValues) 1868 } 1869 1870 func (t *timeType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 1871 return t.baseType().NewDictionary(columnIndex, numValues, data) 1872 } 1873 1874 func (t *timeType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 1875 return t.baseType().NewPage(columnIndex, numValues, data) 1876 } 1877 1878 func (t *timeType) NewValues(values []byte, offset []uint32) encoding.Values { 1879 return t.baseType().NewValues(values, offset) 1880 } 1881 1882 func (t *timeType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 1883 return t.baseType().Encode(dst, src, enc) 1884 } 1885 1886 func (t *timeType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 1887 return t.baseType().Decode(dst, src, enc) 1888 } 1889 1890 func (t *timeType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 1891 return t.baseType().EstimateDecodeSize(numValues, src, enc) 1892 } 1893 1894 func (t *timeType) AssignValue(dst reflect.Value, src Value) error { 1895 return t.baseType().AssignValue(dst, src) 1896 } 1897 1898 func (t *timeType) ConvertValue(val Value, typ Type) (Value, error) { 1899 switch src := typ.(type) { 1900 case *stringType: 1901 tz := t.tz() 1902 if t.Unit.Micros != nil { 1903 return convertStringToTimeMicros(val, tz) 1904 } else { 1905 return convertStringToTimeMillis(val, tz) 1906 } 1907 case *timestampType: 1908 tz := t.tz() 1909 if t.Unit.Micros != nil { 1910 return convertTimestampToTimeMicros(val, src.Unit, src.tz(), tz) 1911 } else { 1912 return convertTimestampToTimeMillis(val, src.Unit, src.tz(), tz) 1913 } 1914 } 1915 return t.baseType().ConvertValue(val, typ) 1916 } 1917 1918 // Timestamp constructs of leaf node of TIMESTAMP logical type. 1919 // 1920 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp 1921 func Timestamp(unit TimeUnit) Node { 1922 return Leaf(×tampType{IsAdjustedToUTC: true, Unit: unit.TimeUnit()}) 1923 } 1924 1925 type timestampType format.TimestampType 1926 1927 func (t *timestampType) tz() *time.Location { 1928 if t.IsAdjustedToUTC { 1929 return time.UTC 1930 } else { 1931 return time.Local 1932 } 1933 } 1934 1935 func (t *timestampType) String() string { return (*format.TimestampType)(t).String() } 1936 1937 func (t *timestampType) Kind() Kind { return int64Type{}.Kind() } 1938 1939 func (t *timestampType) Length() int { return int64Type{}.Length() } 1940 1941 func (t *timestampType) EstimateSize(n int) int { return int64Type{}.EstimateSize(n) } 1942 1943 func (t *timestampType) EstimateNumValues(n int) int { return int64Type{}.EstimateNumValues(n) } 1944 1945 func (t *timestampType) Compare(a, b Value) int { return int64Type{}.Compare(a, b) } 1946 1947 func (t *timestampType) ColumnOrder() *format.ColumnOrder { return int64Type{}.ColumnOrder() } 1948 1949 func (t *timestampType) PhysicalType() *format.Type { return int64Type{}.PhysicalType() } 1950 1951 func (t *timestampType) LogicalType() *format.LogicalType { 1952 return &format.LogicalType{Timestamp: (*format.TimestampType)(t)} 1953 } 1954 1955 func (t *timestampType) ConvertedType() *deprecated.ConvertedType { 1956 switch { 1957 case t.Unit.Millis != nil: 1958 return &convertedTypes[deprecated.TimestampMillis] 1959 case t.Unit.Micros != nil: 1960 return &convertedTypes[deprecated.TimestampMicros] 1961 default: 1962 return nil 1963 } 1964 } 1965 1966 func (t *timestampType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1967 return int64Type{}.NewColumnIndexer(sizeLimit) 1968 } 1969 1970 func (t *timestampType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { 1971 return int64Type{}.NewDictionary(columnIndex, numValues, data) 1972 } 1973 1974 func (t *timestampType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1975 return int64Type{}.NewColumnBuffer(columnIndex, numValues) 1976 } 1977 1978 func (t *timestampType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 1979 return int64Type{}.NewPage(columnIndex, numValues, data) 1980 } 1981 1982 func (t *timestampType) NewValues(values []byte, offsets []uint32) encoding.Values { 1983 return int64Type{}.NewValues(values, offsets) 1984 } 1985 1986 func (t *timestampType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { 1987 return int64Type{}.Encode(dst, src, enc) 1988 } 1989 1990 func (t *timestampType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { 1991 return int64Type{}.Decode(dst, src, enc) 1992 } 1993 1994 func (t *timestampType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { 1995 return int64Type{}.EstimateDecodeSize(numValues, src, enc) 1996 } 1997 1998 func (t *timestampType) AssignValue(dst reflect.Value, src Value) error { 1999 switch dst.Type() { 2000 case reflect.TypeOf(time.Time{}): 2001 unit := Nanosecond.TimeUnit() 2002 lt := t.LogicalType() 2003 if lt != nil && lt.Timestamp != nil { 2004 unit = lt.Timestamp.Unit 2005 } 2006 2007 nanos := src.int64() 2008 switch { 2009 case unit.Millis != nil: 2010 nanos = nanos * 1e6 2011 case unit.Micros != nil: 2012 nanos = nanos * 1e3 2013 } 2014 2015 val := time.Unix(0, nanos).UTC() 2016 dst.Set(reflect.ValueOf(val)) 2017 return nil 2018 default: 2019 return int64Type{}.AssignValue(dst, src) 2020 } 2021 } 2022 2023 func (t *timestampType) ConvertValue(val Value, typ Type) (Value, error) { 2024 switch src := typ.(type) { 2025 case *timestampType: 2026 return convertTimestampToTimestamp(val, src.Unit, t.Unit) 2027 case *dateType: 2028 return convertDateToTimestamp(val, t.Unit, t.tz()) 2029 } 2030 return int64Type{}.ConvertValue(val, typ) 2031 } 2032 2033 // List constructs a node of LIST logical type. 2034 // 2035 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists 2036 func List(of Node) Node { 2037 return listNode{Group{"list": Repeated(Group{"element": of})}} 2038 } 2039 2040 type listNode struct{ Group } 2041 2042 func (listNode) Type() Type { return &listType{} } 2043 2044 type listType format.ListType 2045 2046 func (t *listType) String() string { return (*format.ListType)(t).String() } 2047 2048 func (t *listType) Kind() Kind { panic("cannot call Kind on parquet LIST type") } 2049 2050 func (t *listType) Length() int { return 0 } 2051 2052 func (t *listType) EstimateSize(int) int { return 0 } 2053 2054 func (t *listType) EstimateNumValues(int) int { return 0 } 2055 2056 func (t *listType) Compare(Value, Value) int { panic("cannot compare values on parquet LIST type") } 2057 2058 func (t *listType) ColumnOrder() *format.ColumnOrder { return nil } 2059 2060 func (t *listType) PhysicalType() *format.Type { return nil } 2061 2062 func (t *listType) LogicalType() *format.LogicalType { 2063 return &format.LogicalType{List: (*format.ListType)(t)} 2064 } 2065 2066 func (t *listType) ConvertedType() *deprecated.ConvertedType { 2067 return &convertedTypes[deprecated.List] 2068 } 2069 2070 func (t *listType) NewColumnIndexer(int) ColumnIndexer { 2071 panic("create create column indexer from parquet LIST type") 2072 } 2073 2074 func (t *listType) NewDictionary(int, int, encoding.Values) Dictionary { 2075 panic("cannot create dictionary from parquet LIST type") 2076 } 2077 2078 func (t *listType) NewColumnBuffer(int, int) ColumnBuffer { 2079 panic("cannot create column buffer from parquet LIST type") 2080 } 2081 2082 func (t *listType) NewPage(int, int, encoding.Values) Page { 2083 panic("cannot create page from parquet LIST type") 2084 } 2085 2086 func (t *listType) NewValues(values []byte, _ []uint32) encoding.Values { 2087 panic("cannot create values from parquet LIST type") 2088 } 2089 2090 func (t *listType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { 2091 panic("cannot encode parquet LIST type") 2092 } 2093 2094 func (t *listType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { 2095 panic("cannot decode parquet LIST type") 2096 } 2097 2098 func (t *listType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { 2099 panic("cannot estimate decode size of parquet LIST type") 2100 } 2101 2102 func (t *listType) AssignValue(reflect.Value, Value) error { 2103 panic("cannot assign value to a parquet LIST type") 2104 } 2105 2106 func (t *listType) ConvertValue(Value, Type) (Value, error) { 2107 panic("cannot convert value to a parquet LIST type") 2108 } 2109 2110 // Map constructs a node of MAP logical type. 2111 // 2112 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps 2113 func Map(key, value Node) Node { 2114 return mapNode{Group{ 2115 "key_value": Repeated(Group{ 2116 "key": Required(key), 2117 "value": value, 2118 }), 2119 }} 2120 } 2121 2122 type mapNode struct{ Group } 2123 2124 func (mapNode) Type() Type { return &mapType{} } 2125 2126 type mapType format.MapType 2127 2128 func (t *mapType) String() string { return (*format.MapType)(t).String() } 2129 2130 func (t *mapType) Kind() Kind { panic("cannot call Kind on parquet MAP type") } 2131 2132 func (t *mapType) Length() int { return 0 } 2133 2134 func (t *mapType) EstimateSize(int) int { return 0 } 2135 2136 func (t *mapType) EstimateNumValues(int) int { return 0 } 2137 2138 func (t *mapType) Compare(Value, Value) int { panic("cannot compare values on parquet MAP type") } 2139 2140 func (t *mapType) ColumnOrder() *format.ColumnOrder { return nil } 2141 2142 func (t *mapType) PhysicalType() *format.Type { return nil } 2143 2144 func (t *mapType) LogicalType() *format.LogicalType { 2145 return &format.LogicalType{Map: (*format.MapType)(t)} 2146 } 2147 2148 func (t *mapType) ConvertedType() *deprecated.ConvertedType { 2149 return &convertedTypes[deprecated.Map] 2150 } 2151 2152 func (t *mapType) NewColumnIndexer(int) ColumnIndexer { 2153 panic("create create column indexer from parquet MAP type") 2154 } 2155 2156 func (t *mapType) NewDictionary(int, int, encoding.Values) Dictionary { 2157 panic("cannot create dictionary from parquet MAP type") 2158 } 2159 2160 func (t *mapType) NewColumnBuffer(int, int) ColumnBuffer { 2161 panic("cannot create column buffer from parquet MAP type") 2162 } 2163 2164 func (t *mapType) NewPage(int, int, encoding.Values) Page { 2165 panic("cannot create page from parquet MAP type") 2166 } 2167 2168 func (t *mapType) NewValues(values []byte, _ []uint32) encoding.Values { 2169 panic("cannot create values from parquet MAP type") 2170 } 2171 2172 func (t *mapType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { 2173 panic("cannot encode parquet MAP type") 2174 } 2175 2176 func (t *mapType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { 2177 panic("cannot decode parquet MAP type") 2178 } 2179 2180 func (t *mapType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { 2181 panic("cannot estimate decode size of parquet MAP type") 2182 } 2183 2184 func (t *mapType) AssignValue(reflect.Value, Value) error { 2185 panic("cannot assign value to a parquet MAP type") 2186 } 2187 2188 func (t *mapType) ConvertValue(Value, Type) (Value, error) { 2189 panic("cannot convert value to a parquet MAP type") 2190 } 2191 2192 type nullType format.NullType 2193 2194 func (t *nullType) String() string { return (*format.NullType)(t).String() } 2195 2196 func (t *nullType) Kind() Kind { return -1 } 2197 2198 func (t *nullType) Length() int { return 0 } 2199 2200 func (t *nullType) EstimateSize(int) int { return 0 } 2201 2202 func (t *nullType) EstimateNumValues(int) int { return 0 } 2203 2204 func (t *nullType) Compare(Value, Value) int { panic("cannot compare values on parquet NULL type") } 2205 2206 func (t *nullType) ColumnOrder() *format.ColumnOrder { return nil } 2207 2208 func (t *nullType) PhysicalType() *format.Type { return nil } 2209 2210 func (t *nullType) LogicalType() *format.LogicalType { 2211 return &format.LogicalType{Unknown: (*format.NullType)(t)} 2212 } 2213 2214 func (t *nullType) ConvertedType() *deprecated.ConvertedType { return nil } 2215 2216 func (t *nullType) NewColumnIndexer(int) ColumnIndexer { 2217 panic("create create column indexer from parquet NULL type") 2218 } 2219 2220 func (t *nullType) NewDictionary(int, int, encoding.Values) Dictionary { 2221 panic("cannot create dictionary from parquet NULL type") 2222 } 2223 2224 func (t *nullType) NewColumnBuffer(int, int) ColumnBuffer { 2225 panic("cannot create column buffer from parquet NULL type") 2226 } 2227 2228 func (t *nullType) NewPage(columnIndex, numValues int, _ encoding.Values) Page { 2229 return newNullPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 2230 } 2231 2232 func (t *nullType) NewValues(_ []byte, _ []uint32) encoding.Values { 2233 return encoding.Values{} 2234 } 2235 2236 func (t *nullType) Encode(dst []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { 2237 return dst[:0], nil 2238 } 2239 2240 func (t *nullType) Decode(dst encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { 2241 return dst, nil 2242 } 2243 2244 func (t *nullType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { 2245 return 0 2246 } 2247 2248 func (t *nullType) AssignValue(reflect.Value, Value) error { 2249 return nil 2250 } 2251 2252 func (t *nullType) ConvertValue(val Value, _ Type) (Value, error) { 2253 return val, nil 2254 } 2255 2256 type groupType struct{} 2257 2258 func (groupType) String() string { return "group" } 2259 2260 func (groupType) Kind() Kind { 2261 panic("cannot call Kind on parquet group") 2262 } 2263 2264 func (groupType) Compare(Value, Value) int { 2265 panic("cannot compare values on parquet group") 2266 } 2267 2268 func (groupType) NewColumnIndexer(int) ColumnIndexer { 2269 panic("cannot create column indexer from parquet group") 2270 } 2271 2272 func (groupType) NewDictionary(int, int, encoding.Values) Dictionary { 2273 panic("cannot create dictionary from parquet group") 2274 } 2275 2276 func (t groupType) NewColumnBuffer(int, int) ColumnBuffer { 2277 panic("cannot create column buffer from parquet group") 2278 } 2279 2280 func (t groupType) NewPage(int, int, encoding.Values) Page { 2281 panic("cannot create page from parquet group") 2282 } 2283 2284 func (t groupType) NewValues(_ []byte, _ []uint32) encoding.Values { 2285 panic("cannot create values from parquet group") 2286 } 2287 2288 func (groupType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { 2289 panic("cannot encode parquet group") 2290 } 2291 2292 func (groupType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { 2293 panic("cannot decode parquet group") 2294 } 2295 2296 func (groupType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { 2297 panic("cannot estimate decode size of parquet group") 2298 } 2299 2300 func (groupType) AssignValue(reflect.Value, Value) error { 2301 panic("cannot assign value to a parquet group") 2302 } 2303 2304 func (t groupType) ConvertValue(Value, Type) (Value, error) { 2305 panic("cannot convert value to a parquet group") 2306 } 2307 2308 func (groupType) Length() int { return 0 } 2309 2310 func (groupType) EstimateSize(int) int { return 0 } 2311 2312 func (groupType) EstimateNumValues(int) int { return 0 } 2313 2314 func (groupType) ColumnOrder() *format.ColumnOrder { return nil } 2315 2316 func (groupType) PhysicalType() *format.Type { return nil } 2317 2318 func (groupType) LogicalType() *format.LogicalType { return nil } 2319 2320 func (groupType) ConvertedType() *deprecated.ConvertedType { return nil } 2321 2322 func checkTypeKindEqual(to, from Type) error { 2323 if to.Kind() != from.Kind() { 2324 return fmt.Errorf("cannot convert from parquet value of type %s to %s", from, to) 2325 } 2326 return nil 2327 }