// Origin: github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/type.go

package parquet

import (
	"bytes"
	"fmt"
	"math/bits"
	"time"

	"github.com/vc42/parquet-go/deprecated"
	"github.com/vc42/parquet-go/encoding"
	"github.com/vc42/parquet-go/format"
)

// Kind is an enumeration type representing the physical types supported by the
// parquet type system.
type Kind int8

// Each Kind constant is the corresponding format.Type constant converted to
// Kind.
const (
	Boolean           Kind = Kind(format.Boolean)
	Int32             Kind = Kind(format.Int32)
	Int64             Kind = Kind(format.Int64)
	Int96             Kind = Kind(format.Int96)
	Float             Kind = Kind(format.Float)
	Double            Kind = Kind(format.Double)
	ByteArray         Kind = Kind(format.ByteArray)
	FixedLenByteArray Kind = Kind(format.FixedLenByteArray)
)

// String returns a human-readable representation of the physical type.
func (k Kind) String() string { return format.Type(k).String() }

// Value constructs a value from k and v.
//
// The method panics if the data is not a valid representation of the value
// kind; for example, if the kind is Int32 but the data is not 4 bytes long.
func (k Kind) Value(v []byte) Value {
	x, err := parseValue(k, v)
	if err != nil {
		// Parsing errors are surfaced as panics, as documented above.
		panic(err)
	}
	return x
}

// The Type interface represents logical types of the parquet type system.
//
// Types are immutable and therefore safe to access from multiple goroutines.
type Type interface {
	// Returns a human-readable representation of the parquet type.
	String() string

	// Returns the Kind value representing the underlying physical type.
	//
	// The method panics if it is called on a group type.
	Kind() Kind

	// For integer and floating point physical types, the method returns the
	// size of values in bits.
	//
	// For fixed-length byte arrays, the method returns the size of elements
	// in bytes.
	//
	// For other types, the value is zero.
	Length() int

	// Returns an estimation of the number of bytes required to hold the given
	// number of values of this type in memory.
	//
	// The method returns zero for group types.
	EstimateSize(numValues int) int64

	// Compares two values and returns a negative integer if a < b, positive if
	// a > b, or zero if a == b.
	//
	// The values' Kind must match the type, otherwise the result is undefined.
	//
	// The method panics if it is called on a group type.
	Compare(a, b Value) int

	// ColumnOrder returns the type's column order. For group types, this method
	// returns nil.
	//
	// The order describes the comparison logic implemented by the Compare
	// method.
	//
	// As an optimization, the method may return the same pointer across
	// multiple calls. Applications must treat the returned value as immutable,
	// mutating the value will result in undefined behavior.
	ColumnOrder() *format.ColumnOrder

	// Returns the physical type as a *format.Type value. For group types, this
	// method returns nil.
	//
	// As an optimization, the method may return the same pointer across
	// multiple calls. Applications must treat the returned value as immutable,
	// mutating the value will result in undefined behavior.
	PhysicalType() *format.Type

	// Returns the logical type as a *format.LogicalType value. When the logical
	// type is unknown, the method returns nil.
	//
	// As an optimization, the method may return the same pointer across
	// multiple calls. Applications must treat the returned value as immutable,
	// mutating the value will result in undefined behavior.
	LogicalType() *format.LogicalType

	// Returns the logical type's equivalent converted type. When there is
	// no equivalent converted type, the method returns nil.
	//
	// As an optimization, the method may return the same pointer across
	// multiple calls. Applications must treat the returned value as immutable,
	// mutating the value will result in undefined behavior.
	ConvertedType() *deprecated.ConvertedType

	// Creates a column indexer for values of this type.
	//
	// The size limit is a hint to the column indexer that it is allowed to
	// truncate the page boundaries to the given size. Only BYTE_ARRAY and
	// FIXED_LEN_BYTE_ARRAY types currently take this value into account.
	//
	// A value of zero or less means no limits.
	//
	// The method panics if it is called on a group type.
	NewColumnIndexer(sizeLimit int) ColumnIndexer

	// Creates a row group buffer column for values of this type.
	//
	// Column buffers are created using the index of the column they are
	// accumulating values in memory for (relative to the parent schema),
	// and the size of their memory buffer.
	//
	// The application may give an estimate of the number of values it expects
	// to write to the buffer as second argument. This estimate helps set the
	// initial buffer capacity but is not a hard limit, the underlying memory
	// buffer will grow as needed to allow more values to be written. Programs
	// may use the Size method of the column buffer (or the parent row group,
	// when relevant) to determine how many bytes are being used, and perform a
	// flush of the buffers to a storage layer.
	//
	// The method panics if it is called on a group type.
	NewColumnBuffer(columnIndex, numValues int) ColumnBuffer

	// Creates a dictionary holding values of this type.
	//
	// If the length of data is not zero, it must contain PLAIN encoded values
	// of the dictionary.
	//
	// The dictionary retains the data buffer, it does not make a copy of it.
	// If the application needs to share ownership of the memory buffer, it must
	// ensure that it will not be modified while the dictionary is in use, or it
	// must make a copy of it prior to creating the dictionary.
	//
	// The method panics if it is called on a group type.
	NewDictionary(columnIndex, numValues int, data []byte) Dictionary

	// Creates a page belonging to a column at the given index, backed by the
	// data buffer.
	//
	// If the length of data is not zero, it must contain PLAIN encoded values
	// of the page.
	//
	// The page retains the data buffer, it does not make a copy of it. If the
	// application needs to share ownership of the memory buffer, it must ensure
	// that it will not be modified while the page is in use, or it must make a
	// copy of it prior to creating the page.
	//
	// The method panics if the data is not a valid PLAIN encoded representation
	// of the page values.
	NewPage(columnIndex, numValues int, data []byte) Page

	// Assuming the src buffer contains PLAIN encoded values of the type it is
	// called on, applies the given encoding and produces the output to the dst
	// buffer passed as first argument by dispatching the call to one of the
	// encoding methods.
	Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error)

	// Assuming the src buffer contains values encoded in the given encoding,
	// decodes the input and produces the PLAIN encoded values into the dst
	// output buffer passed as first argument by dispatching the call to one
	// of the encoding methods.
	Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error)
}

// Exported Type values for the physical types that need no parameter.
// Fixed-length byte array types carry a length and are built with
// FixedLenByteArrayType instead.
var (
	BooleanType   Type = booleanType{}
	Int32Type     Type = int32Type{}
	Int64Type     Type = int64Type{}
	Int96Type     Type = int96Type{}
	FloatType     Type = floatType{}
	DoubleType    Type = doubleType{}
	ByteArrayType Type = byteArrayType{}
)

// In the current parquet version supported by this library, only type-defined
// orders are supported.
var typeDefinedColumnOrder = format.ColumnOrder{
	TypeOrder: new(format.TypeDefinedOrder),
}

// physicalTypes backs the pointers returned by the PhysicalType methods, which
// index it with Kind constants (e.g. &physicalTypes[Boolean]).
//
// NOTE(review): this presumably relies on the numeric values of the format.Type
// constants matching the indices below; confirm against the format package.
var physicalTypes = [...]format.Type{
	0: format.Boolean,
	1: format.Int32,
	2: format.Int64,
	3: format.Int96,
	4: format.Float,
	5: format.Double,
	6: format.ByteArray,
	7: format.FixedLenByteArray,
}

// convertedTypes backs the pointers returned by the ConvertedType methods,
// which index it with deprecated.ConvertedType values
// (e.g. &convertedTypes[deprecated.Decimal]).
//
// NOTE(review): this presumably relies on the numeric values of the
// deprecated.ConvertedType constants matching the indices below; confirm
// against the deprecated package.
var convertedTypes = [...]deprecated.ConvertedType{
	0:  deprecated.UTF8,
	1:  deprecated.Map,
	2:  deprecated.MapKeyValue,
	3:  deprecated.List,
	4:  deprecated.Enum,
	5:  deprecated.Decimal,
	6:  deprecated.Date,
	7:  deprecated.TimeMillis,
	8:  deprecated.TimeMicros,
	9:  deprecated.TimestampMillis,
	10: deprecated.TimestampMicros,
	11: deprecated.Uint8,
	12: deprecated.Uint16,
	13: deprecated.Uint32,
	14: deprecated.Uint64,
	15: deprecated.Int8,
	16: deprecated.Int16,
	17: deprecated.Int32,
	18: deprecated.Int64,
	19: deprecated.Json,
	20: deprecated.Bson,
	21: deprecated.Interval,
}

// booleanType implements Type for the BOOLEAN physical type.
//
// Length reports 1 (bit); EstimateSize counts one bit per value, rounded up to
// whole bytes. The type has no logical or converted type annotation.
type booleanType struct{}

func (t booleanType) String() string { return "BOOLEAN" }
func (t booleanType) Kind() Kind { return Boolean }
func (t booleanType) Length() int { return 1 }
func (t booleanType) EstimateSize(n int) int64 { return (int64(n) + 7) / 8 }
func (t booleanType) Compare(a, b Value) int { return compareBool(a.Boolean(), b.Boolean()) }
func (t booleanType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder }
func (t booleanType) LogicalType() *format.LogicalType { return nil }
func (t booleanType) ConvertedType() *deprecated.ConvertedType { return nil }
func (t booleanType) PhysicalType() *format.Type { return &physicalTypes[Boolean] }

// NewColumnIndexer ignores sizeLimit.
func (t booleanType) NewColumnIndexer(sizeLimit int) ColumnIndexer {
	return newBooleanColumnIndexer()
}

func (t booleanType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {
	return newBooleanColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))
}

func (t booleanType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary {
	return newBooleanDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t booleanType) NewPage(columnIndex, numValues int, data []byte) Page {
	return newBooleanPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t booleanType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.EncodeBoolean(dst, src)
}

func (t booleanType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.DecodeBoolean(dst, src)
}

// int32Type implements Type for the INT32 physical type: 32 bits per value,
// estimated at 4 bytes each, compared with compareInt32, and without logical or
// converted type annotation.
type int32Type struct{}

func (t int32Type) String() string { return "INT32" }
func (t int32Type) Kind() Kind { return Int32 }
func (t int32Type) Length() int { return 32 }
func (t int32Type) EstimateSize(n int) int64 { return 4 * int64(n) }
func (t int32Type) Compare(a, b Value) int { return compareInt32(a.Int32(), b.Int32()) }
func (t int32Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder }
func (t int32Type) LogicalType() *format.LogicalType { return nil }
func (t int32Type) ConvertedType() *deprecated.ConvertedType { return nil }
func (t int32Type) PhysicalType() *format.Type { return &physicalTypes[Int32] }

// NewColumnIndexer ignores sizeLimit.
func (t int32Type) NewColumnIndexer(sizeLimit int) ColumnIndexer {
	return newInt32ColumnIndexer()
}

func (t int32Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {
	return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))
}

func (t int32Type) NewDictionary(columnIndex, numValues int, data []byte) Dictionary {
	return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t int32Type) NewPage(columnIndex, numValues int, data []byte) Page {
	return newInt32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t int32Type) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.EncodeInt32(dst, src)
}

func (t int32Type) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.DecodeInt32(dst, src)
}

// int64Type implements Type for the INT64 physical type: 64 bits per value,
// estimated at 8 bytes each, compared with compareInt64, and without logical or
// converted type annotation.
type int64Type struct{}

func (t int64Type) String() string { return "INT64" }
func (t int64Type) Kind() Kind { return Int64 }
func (t int64Type) Length() int { return 64 }
func (t int64Type) EstimateSize(n int) int64 { return 8 * int64(n) }
func (t int64Type) Compare(a, b Value) int { return compareInt64(a.Int64(), b.Int64()) }
func (t int64Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder }
func (t int64Type) LogicalType() *format.LogicalType { return nil }
func (t int64Type) ConvertedType() *deprecated.ConvertedType { return nil }
func (t int64Type) PhysicalType() *format.Type { return &physicalTypes[Int64] }

// NewColumnIndexer ignores sizeLimit.
func (t int64Type) NewColumnIndexer(sizeLimit int) ColumnIndexer {
	return newInt64ColumnIndexer()
}

func (t int64Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {
	return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))
}

func (t int64Type) NewDictionary(columnIndex, numValues int, data []byte) Dictionary {
	return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t int64Type) NewPage(columnIndex, numValues int, data []byte) Page {
	return newInt64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t int64Type) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.EncodeInt64(dst, src)
}

func (t int64Type) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.DecodeInt64(dst, src)
}

// int96Type implements Type for the INT96 physical type: 96 bits per value,
// estimated at 12 bytes each, compared with compareInt96, and without logical
// or converted type annotation.
type int96Type struct{}

func (t int96Type) String() string { return "INT96" }

func (t int96Type) Kind() Kind { return Int96 }
func (t int96Type) Length() int { return 96 }
func (t int96Type) EstimateSize(n int) int64 { return 12 * int64(n) }
func (t int96Type) Compare(a, b Value) int { return compareInt96(a.Int96(), b.Int96()) }
func (t int96Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder }
func (t int96Type) LogicalType() *format.LogicalType { return nil }
func (t int96Type) ConvertedType() *deprecated.ConvertedType { return nil }
func (t int96Type) PhysicalType() *format.Type { return &physicalTypes[Int96] }

// NewColumnIndexer ignores sizeLimit.
func (t int96Type) NewColumnIndexer(sizeLimit int) ColumnIndexer {
	return newInt96ColumnIndexer()
}

func (t int96Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {
	return newInt96ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))
}

func (t int96Type) NewDictionary(columnIndex, numValues int, data []byte) Dictionary {
	return newInt96Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t int96Type) NewPage(columnIndex, numValues int, data []byte) Page {
	return newInt96Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t int96Type) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.EncodeInt96(dst, src)
}

func (t int96Type) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.DecodeInt96(dst, src)
}

// floatType implements Type for the FLOAT physical type: 32 bits per value,
// estimated at 4 bytes each, compared with compareFloat32, and without logical
// or converted type annotation.
type floatType struct{}

func (t floatType) String() string { return "FLOAT" }
func (t floatType) Kind() Kind { return Float }
func (t floatType) Length() int { return 32 }
func (t floatType) EstimateSize(n int) int64 { return 4 * int64(n) }
func (t floatType) Compare(a, b Value) int { return compareFloat32(a.Float(), b.Float()) }
func (t floatType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder }
func (t floatType) LogicalType() *format.LogicalType { return nil }
func (t floatType) ConvertedType() *deprecated.ConvertedType { return nil }
func (t floatType) PhysicalType() *format.Type { return &physicalTypes[Float] }

// NewColumnIndexer ignores sizeLimit.
func (t floatType) NewColumnIndexer(sizeLimit int) ColumnIndexer {
	return newFloatColumnIndexer()
}

func (t floatType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {
	return newFloatColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))
}

func (t floatType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary {
	return newFloatDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t floatType) NewPage(columnIndex, numValues int, data []byte) Page {
	return newFloatPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t floatType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.EncodeFloat(dst, src)
}

func (t floatType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.DecodeFloat(dst, src)
}

// doubleType implements Type for the DOUBLE physical type: 64 bits per value,
// estimated at 8 bytes each, compared with compareFloat64, and without logical
// or converted type annotation.
type doubleType struct{}

func (t doubleType) String() string { return "DOUBLE" }
func (t doubleType) Kind() Kind { return Double }
func (t doubleType) Length() int { return 64 }
func (t doubleType) EstimateSize(n int) int64 { return 8 * int64(n) }
func (t doubleType) Compare(a, b Value) int { return compareFloat64(a.Double(), b.Double()) }
func (t doubleType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder }
func (t doubleType) LogicalType() *format.LogicalType { return nil }
func (t doubleType) ConvertedType() *deprecated.ConvertedType { return nil }
func (t doubleType) PhysicalType() *format.Type { return &physicalTypes[Double] }

// NewColumnIndexer ignores sizeLimit.
func (t doubleType) NewColumnIndexer(sizeLimit int) ColumnIndexer {
	return newDoubleColumnIndexer()
}

func (t doubleType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {
	return newDoubleColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))
}

func (t doubleType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary {
	return newDoubleDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t doubleType) NewPage(columnIndex, numValues int, data []byte) Page {
	return newDoublePage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t doubleType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.EncodeDouble(dst, src)
}

func (t doubleType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.DecodeDouble(dst, src)
}

// byteArrayType implements Type for the BYTE_ARRAY physical type.
//
// Length reports 0 because values have no fixed size; EstimateSize uses a flat
// 10 bytes per value. Values are compared with bytes.Compare. The type has no
// logical or converted type annotation.
type byteArrayType struct{}

func (t byteArrayType) String() string { return "BYTE_ARRAY" }
func (t byteArrayType) Kind() Kind { return ByteArray }
func (t byteArrayType) Length() int { return 0 }
func (t byteArrayType) EstimateSize(n int) int64 { return 10 * int64(n) }
func (t byteArrayType) Compare(a, b Value) int { return bytes.Compare(a.ByteArray(), b.ByteArray()) }
func (t byteArrayType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder }
func (t byteArrayType) LogicalType() *format.LogicalType { return nil }
func (t byteArrayType) ConvertedType() *deprecated.ConvertedType { return nil }
func (t byteArrayType) PhysicalType() *format.Type { return &physicalTypes[ByteArray] }

// NewColumnIndexer forwards sizeLimit to the byte array column indexer.
func (t byteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer {
	return newByteArrayColumnIndexer(sizeLimit)
}

func (t byteArrayType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {
	return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))
}

func (t byteArrayType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary {
	return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t byteArrayType) NewPage(columnIndex, numValues int, data []byte) Page {
	return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t byteArrayType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.EncodeByteArray(dst, src)
}

func (t byteArrayType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.DecodeByteArray(dst, src)
}

// fixedLenByteArrayType implements Type for the FIXED_LEN_BYTE_ARRAY physical
// type; length is the size of each value in bytes.
//
// Values are compared with bytes.Compare. The type has no logical or converted
// type annotation.
type fixedLenByteArrayType struct{ length int }

func (t fixedLenByteArrayType) String() string {
	return fmt.Sprintf("FIXED_LEN_BYTE_ARRAY(%d)", t.length)
}

func (t fixedLenByteArrayType) Kind() Kind { return FixedLenByteArray }

func (t fixedLenByteArrayType) Length() int { return t.length }

func (t fixedLenByteArrayType) EstimateSize(n int) int64 { return int64(t.length) * int64(n) }

func (t fixedLenByteArrayType) Compare(a, b Value) int {
	return bytes.Compare(a.ByteArray(), b.ByteArray())
}

func (t fixedLenByteArrayType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder }

func (t fixedLenByteArrayType) LogicalType() *format.LogicalType { return nil }

func (t fixedLenByteArrayType) ConvertedType() *deprecated.ConvertedType { return nil }

func (t fixedLenByteArrayType) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] }

// NewColumnIndexer forwards both the value length and sizeLimit to the
// fixed-length byte array column indexer.
func (t fixedLenByteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer {
	return newFixedLenByteArrayColumnIndexer(t.length, sizeLimit)
}

func (t fixedLenByteArrayType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {
	return newFixedLenByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))
}

func (t fixedLenByteArrayType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary {
	return newFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t fixedLenByteArrayType) NewPage(columnIndex, numValues int, data []byte) Page {
	return newFixedLenByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}

func (t fixedLenByteArrayType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.EncodeFixedLenByteArray(dst, src, t.length)
}

func (t fixedLenByteArrayType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
	return enc.DecodeFixedLenByteArray(dst, src, t.length)
}

// BE128 stands for "big-endian 128 bits". This type is used as a special case
// for fixed-length byte arrays of 16 bytes, which are commonly used to
// represent columns of random unique identifiers such as UUIDs.
//
// Comparisons of BE128 values use the natural byte order, the zeroth byte is
// the most significant byte.
//
// The special case is intended to provide optimizations based on the knowledge
// that the values are 16 bytes long. Stronger type checking can also be applied
// by the compiler when using [16]byte values rather than []byte, reducing the
// risk of errors on these common code paths.
546 type be128Type struct{} 547 548 func (t be128Type) String() string { return "FIXED_LEN_BYTE_ARRAY(16)" } 549 550 func (t be128Type) Kind() Kind { return FixedLenByteArray } 551 552 func (t be128Type) Length() int { return 16 } 553 554 func (t be128Type) EstimateSize(n int) int64 { return 16 * int64(n) } 555 556 func (t be128Type) Compare(a, b Value) int { 557 return compareBE128((*[16]byte)(a.ByteArray()), (*[16]byte)(b.ByteArray())) 558 } 559 560 func (t be128Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 561 562 func (t be128Type) LogicalType() *format.LogicalType { return nil } 563 564 func (t be128Type) ConvertedType() *deprecated.ConvertedType { return nil } 565 566 func (t be128Type) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] } 567 568 func (t be128Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { 569 return newBE128ColumnIndexer() 570 } 571 572 func (t be128Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 573 return newBE128ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 574 } 575 576 func (t be128Type) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { 577 return newBE128Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 578 } 579 580 func (t be128Type) NewPage(columnIndex, numValues int, data []byte) Page { 581 return newBE128Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 582 } 583 584 func (t be128Type) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 585 return enc.EncodeFixedLenByteArray(dst, src, 16) 586 } 587 588 func (t be128Type) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 589 return enc.DecodeFixedLenByteArray(dst, src, 16) 590 } 591 592 // FixedLenByteArrayType constructs a type for fixed-length values of the given 593 // size (in bytes). 
594 func FixedLenByteArrayType(length int) Type { 595 switch length { 596 case 16: 597 return be128Type{} 598 default: 599 return fixedLenByteArrayType{length: length} 600 } 601 } 602 603 // Int constructs a leaf node of signed integer logical type of the given bit 604 // width. 605 // 606 // The bit width must be one of 8, 16, 32, 64, or the function will panic. 607 func Int(bitWidth int) Node { 608 return Leaf(integerType(bitWidth, &signedIntTypes)) 609 } 610 611 // Uint constructs a leaf node of unsigned integer logical type of the given 612 // bit width. 613 // 614 // The bit width must be one of 8, 16, 32, 64, or the function will panic. 615 func Uint(bitWidth int) Node { 616 return Leaf(integerType(bitWidth, &unsignedIntTypes)) 617 } 618 619 func integerType(bitWidth int, types *[4]intType) *intType { 620 switch bitWidth { 621 case 8: 622 return &types[0] 623 case 16: 624 return &types[1] 625 case 32: 626 return &types[2] 627 case 64: 628 return &types[3] 629 default: 630 panic(fmt.Sprintf("cannot create a %d bits parquet integer node", bitWidth)) 631 } 632 } 633 634 var signedIntTypes = [...]intType{ 635 {BitWidth: 8, IsSigned: true}, 636 {BitWidth: 16, IsSigned: true}, 637 {BitWidth: 32, IsSigned: true}, 638 {BitWidth: 64, IsSigned: true}, 639 } 640 641 var unsignedIntTypes = [...]intType{ 642 {BitWidth: 8, IsSigned: false}, 643 {BitWidth: 16, IsSigned: false}, 644 {BitWidth: 32, IsSigned: false}, 645 {BitWidth: 64, IsSigned: false}, 646 } 647 648 type intType format.IntType 649 650 func (t *intType) String() string { return (*format.IntType)(t).String() } 651 652 func (t *intType) Kind() Kind { 653 if t.BitWidth == 64 { 654 return Int64 655 } else { 656 return Int32 657 } 658 } 659 660 func (t *intType) Length() int { return int(t.BitWidth) } 661 662 func (t *intType) EstimateSize(n int) int64 { return int64(t.BitWidth/8) * int64(n) } 663 664 func (t *intType) Compare(a, b Value) int { 665 if t.BitWidth == 64 { 666 i1 := a.Int64() 667 i2 := b.Int64() 668 
if t.IsSigned { 669 return compareInt64(i1, i2) 670 } else { 671 return compareUint64(uint64(i1), uint64(i2)) 672 } 673 } else { 674 i1 := a.Int32() 675 i2 := b.Int32() 676 if t.IsSigned { 677 return compareInt32(i1, i2) 678 } else { 679 return compareUint32(uint32(i1), uint32(i2)) 680 } 681 } 682 } 683 684 func (t *intType) ColumnOrder() *format.ColumnOrder { 685 return &typeDefinedColumnOrder 686 } 687 688 func (t *intType) PhysicalType() *format.Type { 689 if t.BitWidth == 64 { 690 return &physicalTypes[Int64] 691 } else { 692 return &physicalTypes[Int32] 693 } 694 } 695 696 func (t *intType) LogicalType() *format.LogicalType { 697 return &format.LogicalType{Integer: (*format.IntType)(t)} 698 } 699 700 func (t *intType) ConvertedType() *deprecated.ConvertedType { 701 convertedType := bits.Len8(uint8(t.BitWidth)/8) - 1 // 8=>0, 16=>1, 32=>2, 64=>4 702 if t.IsSigned { 703 convertedType += int(deprecated.Int8) 704 } else { 705 convertedType += int(deprecated.Uint8) 706 } 707 return &convertedTypes[convertedType] 708 } 709 710 func (t *intType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 711 if t.IsSigned { 712 if t.BitWidth == 64 { 713 return newInt64ColumnIndexer() 714 } else { 715 return newInt32ColumnIndexer() 716 } 717 } else { 718 if t.BitWidth == 64 { 719 return newUint64ColumnIndexer() 720 } else { 721 return newUint32ColumnIndexer() 722 } 723 } 724 } 725 726 func (t *intType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 727 if t.IsSigned { 728 if t.BitWidth == 64 { 729 return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 730 } else { 731 return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 732 } 733 } else { 734 if t.BitWidth == 64 { 735 return newUint64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 736 } else { 737 return newUint32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 738 } 739 } 740 } 741 742 func (t *intType) 
NewDictionary(columnIndex, numValues int, data []byte) Dictionary { 743 if t.IsSigned { 744 if t.BitWidth == 64 { 745 return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 746 } else { 747 return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 748 } 749 } else { 750 if t.BitWidth == 64 { 751 return newUint64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 752 } else { 753 return newUint32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 754 } 755 } 756 } 757 758 func (t *intType) NewPage(columnIndex, numValues int, data []byte) Page { 759 if t.IsSigned { 760 if t.BitWidth == 64 { 761 return newInt64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 762 } else { 763 return newInt32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 764 } 765 } else { 766 if t.BitWidth == 64 { 767 return newUint64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 768 } else { 769 return newUint32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 770 } 771 } 772 } 773 774 func (t *intType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 775 if t.BitWidth == 64 { 776 return enc.EncodeInt64(dst, src) 777 } else { 778 return enc.EncodeInt32(dst, src) 779 } 780 } 781 782 func (t *intType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 783 if t.BitWidth == 64 { 784 return enc.DecodeInt64(dst, src) 785 } else { 786 return enc.DecodeInt32(dst, src) 787 } 788 } 789 790 // Decimal constructs a leaf node of decimal logical type with the given 791 // scale, precision, and underlying type. 
792 // 793 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal 794 func Decimal(scale, precision int, typ Type) Node { 795 switch typ.Kind() { 796 case Int32, Int64, FixedLenByteArray: 797 default: 798 panic("DECIMAL node must annotate Int32, Int64 or FixedLenByteArray but got " + typ.String()) 799 } 800 return Leaf(&decimalType{ 801 decimal: format.DecimalType{ 802 Scale: int32(scale), 803 Precision: int32(precision), 804 }, 805 Type: typ, 806 }) 807 } 808 809 type decimalType struct { 810 decimal format.DecimalType 811 Type 812 } 813 814 func (t *decimalType) String() string { return t.decimal.String() } 815 816 func (t *decimalType) LogicalType() *format.LogicalType { 817 return &format.LogicalType{Decimal: &t.decimal} 818 } 819 820 func (t *decimalType) ConvertedType() *deprecated.ConvertedType { 821 return &convertedTypes[deprecated.Decimal] 822 } 823 824 // String constructs a leaf node of UTF8 logical type. 825 // 826 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#string 827 func String() Node { return Leaf(&stringType{}) } 828 829 type stringType format.StringType 830 831 func (t *stringType) String() string { return (*format.StringType)(t).String() } 832 833 func (t *stringType) Kind() Kind { return ByteArray } 834 835 func (t *stringType) Length() int { return 0 } 836 837 func (t *stringType) EstimateSize(n int) int64 { return 10 * int64(n) } 838 839 func (t *stringType) Compare(a, b Value) int { 840 return bytes.Compare(a.ByteArray(), b.ByteArray()) 841 } 842 843 func (t *stringType) ColumnOrder() *format.ColumnOrder { 844 return &typeDefinedColumnOrder 845 } 846 847 func (t *stringType) PhysicalType() *format.Type { 848 return &physicalTypes[ByteArray] 849 } 850 851 func (t *stringType) LogicalType() *format.LogicalType { 852 return &format.LogicalType{UTF8: (*format.StringType)(t)} 853 } 854 855 func (t *stringType) ConvertedType() *deprecated.ConvertedType { 856 return 
&convertedTypes[deprecated.UTF8] 857 } 858 859 func (t *stringType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 860 return newByteArrayColumnIndexer(sizeLimit) 861 } 862 863 func (t *stringType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { 864 return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 865 } 866 867 func (t *stringType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 868 return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 869 } 870 871 func (t *stringType) NewPage(columnIndex, numValues int, data []byte) Page { 872 return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 873 } 874 875 func (t *stringType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 876 return enc.EncodeByteArray(dst, src) 877 } 878 879 func (t *stringType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 880 return enc.DecodeByteArray(dst, src) 881 } 882 883 // UUID constructs a leaf node of UUID logical type. 
884 // 885 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#uuid 886 func UUID() Node { return Leaf(&uuidType{}) } 887 888 type uuidType format.UUIDType 889 890 func (t *uuidType) String() string { return (*format.UUIDType)(t).String() } 891 892 func (t *uuidType) Kind() Kind { return FixedLenByteArray } 893 894 func (t *uuidType) Length() int { return 16 } 895 896 func (t *uuidType) EstimateSize(n int) int64 { return 16 * int64(n) } 897 898 func (t *uuidType) Compare(a, b Value) int { 899 return compareBE128((*[16]byte)(a.ByteArray()), (*[16]byte)(b.ByteArray())) 900 } 901 902 func (t *uuidType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 903 904 func (t *uuidType) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] } 905 906 func (t *uuidType) LogicalType() *format.LogicalType { 907 return &format.LogicalType{UUID: (*format.UUIDType)(t)} 908 } 909 910 func (t *uuidType) ConvertedType() *deprecated.ConvertedType { return nil } 911 912 func (t *uuidType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 913 return newBE128ColumnIndexer() 914 } 915 916 func (t *uuidType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { 917 return newBE128Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 918 } 919 920 func (t *uuidType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 921 return newBE128ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 922 } 923 924 func (t *uuidType) NewPage(columnIndex, numValues int, data []byte) Page { 925 return newBE128Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 926 } 927 928 func (t *uuidType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 929 return enc.EncodeFixedLenByteArray(dst, src, 16) 930 } 931 932 func (t *uuidType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 933 return enc.DecodeFixedLenByteArray(dst, src, 16) 934 } 935 936 // Enum 
constructs a leaf node with a logical type representing enumerations. 937 // 938 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#enum 939 func Enum() Node { return Leaf(&enumType{}) } 940 941 type enumType format.EnumType 942 943 func (t *enumType) String() string { return (*format.EnumType)(t).String() } 944 945 func (t *enumType) Kind() Kind { return ByteArray } 946 947 func (t *enumType) Length() int { return 0 } 948 949 func (t *enumType) EstimateSize(n int) int64 { return 10 * int64(n) } 950 951 func (t *enumType) Compare(a, b Value) int { 952 return bytes.Compare(a.ByteArray(), b.ByteArray()) 953 } 954 955 func (t *enumType) ColumnOrder() *format.ColumnOrder { 956 return &typeDefinedColumnOrder 957 } 958 959 func (t *enumType) PhysicalType() *format.Type { 960 return &physicalTypes[ByteArray] 961 } 962 963 func (t *enumType) LogicalType() *format.LogicalType { 964 return &format.LogicalType{Enum: (*format.EnumType)(t)} 965 } 966 967 func (t *enumType) ConvertedType() *deprecated.ConvertedType { 968 return &convertedTypes[deprecated.Enum] 969 } 970 971 func (t *enumType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 972 return newByteArrayColumnIndexer(sizeLimit) 973 } 974 975 func (t *enumType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { 976 return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 977 } 978 979 func (t *enumType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 980 return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 981 } 982 983 func (t *enumType) NewPage(columnIndex, numValues int, data []byte) Page { 984 return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 985 } 986 987 func (t *enumType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 988 return enc.EncodeByteArray(dst, src) 989 } 990 991 func (t *enumType) Decode(dst, src []byte, enc 
encoding.Encoding) ([]byte, error) { 992 return enc.DecodeByteArray(dst, src) 993 } 994 995 // JSON constructs a leaf node of JSON logical type. 996 // 997 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#json 998 func JSON() Node { return Leaf(&jsonType{}) } 999 1000 type jsonType format.JsonType 1001 1002 func (t *jsonType) String() string { return (*format.JsonType)(t).String() } 1003 1004 func (t *jsonType) Kind() Kind { return ByteArray } 1005 1006 func (t *jsonType) Length() int { return 0 } 1007 1008 func (t *jsonType) EstimateSize(n int) int64 { return 10 * int64(n) } 1009 1010 func (t *jsonType) Compare(a, b Value) int { 1011 return bytes.Compare(a.ByteArray(), b.ByteArray()) 1012 } 1013 1014 func (t *jsonType) ColumnOrder() *format.ColumnOrder { 1015 return &typeDefinedColumnOrder 1016 } 1017 1018 func (t *jsonType) PhysicalType() *format.Type { 1019 return &physicalTypes[ByteArray] 1020 } 1021 1022 func (t *jsonType) LogicalType() *format.LogicalType { 1023 return &format.LogicalType{Json: (*format.JsonType)(t)} 1024 } 1025 1026 func (t *jsonType) ConvertedType() *deprecated.ConvertedType { 1027 return &convertedTypes[deprecated.Json] 1028 } 1029 1030 func (t *jsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1031 return newByteArrayColumnIndexer(sizeLimit) 1032 } 1033 1034 func (t *jsonType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { 1035 return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1036 } 1037 1038 func (t *jsonType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1039 return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 1040 } 1041 1042 func (t *jsonType) NewPage(columnIndex, numValues int, data []byte) Page { 1043 return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1044 } 1045 1046 func (t *jsonType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) 
{ 1047 return enc.EncodeByteArray(dst, src) 1048 } 1049 1050 func (t *jsonType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 1051 return enc.DecodeByteArray(dst, src) 1052 } 1053 1054 // BSON constructs a leaf node of BSON logical type. 1055 // 1056 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#bson 1057 func BSON() Node { return Leaf(&bsonType{}) } 1058 1059 type bsonType format.BsonType 1060 1061 func (t *bsonType) String() string { return (*format.BsonType)(t).String() } 1062 1063 func (t *bsonType) Kind() Kind { return ByteArray } 1064 1065 func (t *bsonType) Length() int { return 0 } 1066 1067 func (t *bsonType) EstimateSize(n int) int64 { return 10 * int64(n) } 1068 1069 func (t *bsonType) Compare(a, b Value) int { 1070 return bytes.Compare(a.ByteArray(), b.ByteArray()) 1071 } 1072 1073 func (t *bsonType) ColumnOrder() *format.ColumnOrder { 1074 return &typeDefinedColumnOrder 1075 } 1076 1077 func (t *bsonType) PhysicalType() *format.Type { 1078 return &physicalTypes[ByteArray] 1079 } 1080 1081 func (t *bsonType) LogicalType() *format.LogicalType { 1082 return &format.LogicalType{Bson: (*format.BsonType)(t)} 1083 } 1084 1085 func (t *bsonType) ConvertedType() *deprecated.ConvertedType { 1086 return &convertedTypes[deprecated.Bson] 1087 } 1088 1089 func (t *bsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1090 return newByteArrayColumnIndexer(sizeLimit) 1091 } 1092 1093 func (t *bsonType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { 1094 return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1095 } 1096 1097 func (t *bsonType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1098 return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 1099 } 1100 1101 func (t *bsonType) NewPage(columnIndex, numValues int, data []byte) Page { 1102 return newByteArrayPage(t, makeColumnIndex(columnIndex), 
makeNumValues(numValues), data) 1103 } 1104 1105 func (t *bsonType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 1106 return enc.EncodeByteArray(dst, src) 1107 } 1108 1109 func (t *bsonType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 1110 return enc.DecodeByteArray(dst, src) 1111 } 1112 1113 // Date constructs a leaf node of DATE logical type. 1114 // 1115 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date 1116 func Date() Node { return Leaf(&dateType{}) } 1117 1118 type dateType format.DateType 1119 1120 func (t *dateType) String() string { return (*format.DateType)(t).String() } 1121 1122 func (t *dateType) Kind() Kind { return Int32 } 1123 1124 func (t *dateType) Length() int { return 32 } 1125 1126 func (t *dateType) EstimateSize(n int) int64 { return 4 * int64(n) } 1127 1128 func (t *dateType) Compare(a, b Value) int { return compareInt32(a.Int32(), b.Int32()) } 1129 1130 func (t *dateType) ColumnOrder() *format.ColumnOrder { 1131 return &typeDefinedColumnOrder 1132 } 1133 1134 func (t *dateType) PhysicalType() *format.Type { return &physicalTypes[Int32] } 1135 1136 func (t *dateType) LogicalType() *format.LogicalType { 1137 return &format.LogicalType{Date: (*format.DateType)(t)} 1138 } 1139 1140 func (t *dateType) ConvertedType() *deprecated.ConvertedType { 1141 return &convertedTypes[deprecated.Date] 1142 } 1143 1144 func (t *dateType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1145 return newInt32ColumnIndexer() 1146 } 1147 1148 func (t *dateType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1149 return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 1150 } 1151 1152 func (t *dateType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { 1153 return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1154 } 1155 1156 func (t *dateType) NewPage(columnIndex, numValues int, data []byte) Page { 
1157 return newInt32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1158 } 1159 1160 func (t *dateType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 1161 return enc.EncodeInt32(dst, src) 1162 } 1163 1164 func (t *dateType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 1165 return enc.DecodeInt32(dst, src) 1166 } 1167 1168 // TimeUnit represents units of time in the parquet type system. 1169 type TimeUnit interface { 1170 // Returns the precision of the time unit as a time.Duration value. 1171 Duration() time.Duration 1172 // Converts the TimeUnit value to its representation in the parquet thrift 1173 // format. 1174 TimeUnit() format.TimeUnit 1175 } 1176 1177 var ( 1178 Millisecond TimeUnit = &millisecond{} 1179 Microsecond TimeUnit = µsecond{} 1180 Nanosecond TimeUnit = &nanosecond{} 1181 ) 1182 1183 type millisecond format.MilliSeconds 1184 1185 func (u *millisecond) Duration() time.Duration { return time.Millisecond } 1186 func (u *millisecond) TimeUnit() format.TimeUnit { 1187 return format.TimeUnit{Millis: (*format.MilliSeconds)(u)} 1188 } 1189 1190 type microsecond format.MicroSeconds 1191 1192 func (u *microsecond) Duration() time.Duration { return time.Microsecond } 1193 func (u *microsecond) TimeUnit() format.TimeUnit { 1194 return format.TimeUnit{Micros: (*format.MicroSeconds)(u)} 1195 } 1196 1197 type nanosecond format.NanoSeconds 1198 1199 func (u *nanosecond) Duration() time.Duration { return time.Nanosecond } 1200 func (u *nanosecond) TimeUnit() format.TimeUnit { 1201 return format.TimeUnit{Nanos: (*format.NanoSeconds)(u)} 1202 } 1203 1204 // Time constructs a leaf node of TIME logical type. 
1205 // 1206 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time 1207 func Time(unit TimeUnit) Node { 1208 return Leaf(&timeType{IsAdjustedToUTC: true, Unit: unit.TimeUnit()}) 1209 } 1210 1211 type timeType format.TimeType 1212 1213 func (t *timeType) useInt32() bool { 1214 return t.Unit.Millis != nil 1215 } 1216 1217 func (t *timeType) useInt64() bool { 1218 return t.Unit.Micros != nil 1219 } 1220 1221 func (t *timeType) String() string { 1222 return (*format.TimeType)(t).String() 1223 } 1224 1225 func (t *timeType) Kind() Kind { 1226 if t.useInt32() { 1227 return Int32 1228 } else { 1229 return Int64 1230 } 1231 } 1232 1233 func (t *timeType) Length() int { 1234 if t.useInt32() { 1235 return 32 1236 } else { 1237 return 64 1238 } 1239 } 1240 1241 func (t *timeType) EstimateSize(n int) int64 { 1242 if t.useInt32() { 1243 return 4 * int64(n) 1244 } else { 1245 return 8 * int64(n) 1246 } 1247 } 1248 1249 func (t *timeType) Compare(a, b Value) int { 1250 if t.useInt32() { 1251 return compareInt32(a.Int32(), b.Int32()) 1252 } else { 1253 return compareInt64(a.Int64(), b.Int64()) 1254 } 1255 } 1256 1257 func (t *timeType) ColumnOrder() *format.ColumnOrder { 1258 return &typeDefinedColumnOrder 1259 } 1260 1261 func (t *timeType) PhysicalType() *format.Type { 1262 if t.useInt32() { 1263 return &physicalTypes[Int32] 1264 } else { 1265 return &physicalTypes[Int64] 1266 } 1267 } 1268 1269 func (t *timeType) LogicalType() *format.LogicalType { 1270 return &format.LogicalType{Time: (*format.TimeType)(t)} 1271 } 1272 1273 func (t *timeType) ConvertedType() *deprecated.ConvertedType { 1274 switch { 1275 case t.useInt32(): 1276 return &convertedTypes[deprecated.TimeMillis] 1277 case t.useInt64(): 1278 return &convertedTypes[deprecated.TimeMicros] 1279 default: 1280 return nil 1281 } 1282 } 1283 1284 func (t *timeType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1285 if t.useInt32() { 1286 return newInt32ColumnIndexer() 1287 } else { 1288 return 
newInt64ColumnIndexer() 1289 } 1290 } 1291 1292 func (t *timeType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1293 if t.useInt32() { 1294 return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 1295 } else { 1296 return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 1297 } 1298 } 1299 1300 func (t *timeType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { 1301 if t.useInt32() { 1302 return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1303 } else { 1304 return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1305 } 1306 } 1307 1308 func (t *timeType) NewPage(columnIndex, numValues int, data []byte) Page { 1309 if t.useInt32() { 1310 return newInt32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1311 } else { 1312 return newInt64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1313 } 1314 } 1315 1316 func (t *timeType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 1317 if t.useInt32() { 1318 return enc.EncodeInt32(dst, src) 1319 } else { 1320 return enc.EncodeInt64(dst, src) 1321 } 1322 } 1323 1324 func (t *timeType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 1325 if t.useInt32() { 1326 return enc.DecodeInt32(dst, src) 1327 } else { 1328 return enc.DecodeInt64(dst, src) 1329 } 1330 } 1331 1332 // Timestamp constructs of leaf node of TIMESTAMP logical type. 
1333 // 1334 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp 1335 func Timestamp(unit TimeUnit) Node { 1336 return Leaf(×tampType{IsAdjustedToUTC: true, Unit: unit.TimeUnit()}) 1337 } 1338 1339 type timestampType format.TimestampType 1340 1341 func (t *timestampType) String() string { return (*format.TimestampType)(t).String() } 1342 1343 func (t *timestampType) Kind() Kind { return Int64 } 1344 1345 func (t *timestampType) Length() int { return 64 } 1346 1347 func (t *timestampType) EstimateSize(n int) int64 { return 8 * int64(n) } 1348 1349 func (t *timestampType) Compare(a, b Value) int { return compareInt64(a.Int64(), b.Int64()) } 1350 1351 func (t *timestampType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } 1352 1353 func (t *timestampType) PhysicalType() *format.Type { return &physicalTypes[Int64] } 1354 1355 func (t *timestampType) LogicalType() *format.LogicalType { 1356 return &format.LogicalType{Timestamp: (*format.TimestampType)(t)} 1357 } 1358 1359 func (t *timestampType) ConvertedType() *deprecated.ConvertedType { 1360 switch { 1361 case t.Unit.Millis != nil: 1362 return &convertedTypes[deprecated.TimestampMillis] 1363 case t.Unit.Micros != nil: 1364 return &convertedTypes[deprecated.TimestampMicros] 1365 default: 1366 return nil 1367 } 1368 } 1369 1370 func (t *timestampType) NewColumnIndexer(sizeLimit int) ColumnIndexer { 1371 return newInt64ColumnIndexer() 1372 } 1373 1374 func (t *timestampType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 1375 return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 1376 } 1377 1378 func (t *timestampType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { 1379 return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1380 } 1381 1382 func (t *timestampType) NewPage(columnIndex, numValues int, data []byte) Page { 1383 return newInt64Page(t, 
makeColumnIndex(columnIndex), makeNumValues(numValues), data) 1384 } 1385 1386 func (t *timestampType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 1387 return enc.EncodeInt64(dst, src) 1388 } 1389 1390 func (t *timestampType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) { 1391 return enc.DecodeInt64(dst, src) 1392 } 1393 1394 // List constructs a node of LIST logical type. 1395 // 1396 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists 1397 func List(of Node) Node { 1398 return listNode{Group{"list": Repeated(Group{"element": of})}} 1399 } 1400 1401 type listNode struct{ Group } 1402 1403 func (listNode) Type() Type { return &listType{} } 1404 1405 type listType format.ListType 1406 1407 func (t *listType) String() string { return (*format.ListType)(t).String() } 1408 1409 func (t *listType) Kind() Kind { panic("cannot call Kind on parquet LIST type") } 1410 1411 func (t *listType) Length() int { return 0 } 1412 1413 func (t *listType) EstimateSize(int) int64 { return 0 } 1414 1415 func (t *listType) Compare(Value, Value) int { panic("cannot compare values on parquet LIST type") } 1416 1417 func (t *listType) ColumnOrder() *format.ColumnOrder { return nil } 1418 1419 func (t *listType) PhysicalType() *format.Type { return nil } 1420 1421 func (t *listType) LogicalType() *format.LogicalType { 1422 return &format.LogicalType{List: (*format.ListType)(t)} 1423 } 1424 1425 func (t *listType) ConvertedType() *deprecated.ConvertedType { 1426 return &convertedTypes[deprecated.List] 1427 } 1428 1429 func (t *listType) NewColumnIndexer(int) ColumnIndexer { 1430 panic("create create column indexer from parquet LIST type") 1431 } 1432 1433 func (t *listType) NewDictionary(int, int, []byte) Dictionary { 1434 panic("cannot create dictionary from parquet LIST type") 1435 } 1436 1437 func (t *listType) NewColumnBuffer(int, int) ColumnBuffer { 1438 panic("cannot create column buffer from parquet LIST type") 1439 } 
1440 1441 func (t *listType) NewPage(int, int, []byte) Page { 1442 panic("cannot create page from parquet LIST type") 1443 } 1444 1445 func (t *listType) Encode(dst, _ []byte, _ encoding.Encoding) ([]byte, error) { 1446 panic("cannot encode parquet LIST type") 1447 } 1448 1449 func (t *listType) Decode(dst, _ []byte, _ encoding.Encoding) ([]byte, error) { 1450 panic("cannot decode parquet LIST type") 1451 } 1452 1453 // Map constructs a node of MAP logical type. 1454 // 1455 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps 1456 func Map(key, value Node) Node { 1457 return mapNode{Group{ 1458 "key_value": Repeated(Group{ 1459 "key": Required(key), 1460 "value": value, 1461 }), 1462 }} 1463 } 1464 1465 type mapNode struct{ Group } 1466 1467 func (mapNode) Type() Type { return &mapType{} } 1468 1469 type mapType format.MapType 1470 1471 func (t *mapType) String() string { return (*format.MapType)(t).String() } 1472 1473 func (t *mapType) Kind() Kind { panic("cannot call Kind on parquet MAP type") } 1474 1475 func (t *mapType) Length() int { return 0 } 1476 1477 func (t *mapType) EstimateSize(int) int64 { return 0 } 1478 1479 func (t *mapType) Compare(Value, Value) int { panic("cannot compare values on parquet MAP type") } 1480 1481 func (t *mapType) ColumnOrder() *format.ColumnOrder { return nil } 1482 1483 func (t *mapType) PhysicalType() *format.Type { return nil } 1484 1485 func (t *mapType) LogicalType() *format.LogicalType { 1486 return &format.LogicalType{Map: (*format.MapType)(t)} 1487 } 1488 1489 func (t *mapType) ConvertedType() *deprecated.ConvertedType { 1490 return &convertedTypes[deprecated.Map] 1491 } 1492 1493 func (t *mapType) NewColumnIndexer(int) ColumnIndexer { 1494 panic("create create column indexer from parquet MAP type") 1495 } 1496 1497 func (t *mapType) NewDictionary(int, int, []byte) Dictionary { 1498 panic("cannot create dictionary from parquet MAP type") 1499 } 1500 1501 func (t *mapType) NewColumnBuffer(int, 
int) ColumnBuffer { 1502 panic("cannot create column buffer from parquet MAP type") 1503 } 1504 1505 func (t *mapType) NewPage(int, int, []byte) Page { 1506 panic("cannot create page from parquet MAP type") 1507 } 1508 1509 func (t *mapType) Encode(dst, _ []byte, _ encoding.Encoding) ([]byte, error) { 1510 panic("cannot encode parquet MAP type") 1511 } 1512 1513 func (t *mapType) Decode(dst, _ []byte, _ encoding.Encoding) ([]byte, error) { 1514 panic("cannot decode parquet MAP type") 1515 } 1516 1517 type nullType format.NullType 1518 1519 func (t *nullType) String() string { return (*format.NullType)(t).String() } 1520 1521 func (t *nullType) Kind() Kind { return -1 } 1522 1523 func (t *nullType) Length() int { return 0 } 1524 1525 func (t *nullType) EstimateSize(int) int64 { return 0 } 1526 1527 func (t *nullType) Compare(Value, Value) int { panic("cannot compare values on parquet NULL type") } 1528 1529 func (t *nullType) ColumnOrder() *format.ColumnOrder { return nil } 1530 1531 func (t *nullType) PhysicalType() *format.Type { return nil } 1532 1533 func (t *nullType) LogicalType() *format.LogicalType { 1534 return &format.LogicalType{Unknown: (*format.NullType)(t)} 1535 } 1536 1537 func (t *nullType) ConvertedType() *deprecated.ConvertedType { return nil } 1538 1539 func (t *nullType) NewColumnIndexer(int) ColumnIndexer { 1540 panic("create create column indexer from parquet NULL type") 1541 } 1542 1543 func (t *nullType) NewDictionary(int, int, []byte) Dictionary { 1544 panic("cannot create dictionary from parquet NULL type") 1545 } 1546 1547 func (t *nullType) NewColumnBuffer(int, int) ColumnBuffer { 1548 panic("cannot create column buffer from parquet NULL type") 1549 } 1550 1551 func (t *nullType) NewPage(columnIndex, numValues int, _ []byte) Page { 1552 return newNullPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) 1553 } 1554 1555 func (t *nullType) Encode(dst, _ []byte, _ encoding.Encoding) ([]byte, error) { 1556 return dst[:0], nil 1557 
} 1558 1559 func (t *nullType) Decode(dst, _ []byte, _ encoding.Encoding) ([]byte, error) { 1560 return dst[:0], nil 1561 } 1562 1563 type groupType struct{} 1564 1565 func (groupType) String() string { return "group" } 1566 1567 func (groupType) Kind() Kind { 1568 panic("cannot call Kind on parquet group") 1569 } 1570 1571 func (groupType) Compare(Value, Value) int { 1572 panic("cannot compare values on parquet group") 1573 } 1574 1575 func (groupType) NewColumnIndexer(int) ColumnIndexer { 1576 panic("cannot create column indexer from parquet group") 1577 } 1578 1579 func (groupType) NewDictionary(int, int, []byte) Dictionary { 1580 panic("cannot create dictionary from parquet group") 1581 } 1582 1583 func (t groupType) NewColumnBuffer(int, int) ColumnBuffer { 1584 panic("cannot create column buffer from parquet group") 1585 } 1586 1587 func (t groupType) NewPage(int, int, []byte) Page { 1588 panic("cannot create page from parquet group") 1589 } 1590 1591 func (groupType) Encode(_, _ []byte, _ encoding.Encoding) ([]byte, error) { 1592 panic("cannot encode parquet group") 1593 } 1594 1595 func (groupType) Decode(_, _ []byte, _ encoding.Encoding) ([]byte, error) { 1596 panic("cannot decode parquet group") 1597 } 1598 1599 func (groupType) Length() int { return 0 } 1600 1601 func (groupType) EstimateSize(int) int64 { return 0 } 1602 1603 func (groupType) ColumnOrder() *format.ColumnOrder { return nil } 1604 1605 func (groupType) PhysicalType() *format.Type { return nil } 1606 1607 func (groupType) LogicalType() *format.LogicalType { return nil } 1608 1609 func (groupType) ConvertedType() *deprecated.ConvertedType { return nil }