github.com/apache/arrow/go/v14@v14.0.2/parquet/types.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package parquet 18 19 import ( 20 "encoding/binary" 21 "io" 22 "reflect" 23 "strings" 24 "time" 25 "unsafe" 26 27 "github.com/apache/arrow/go/v14/arrow" 28 format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet" 29 ) 30 31 const ( 32 julianUnixEpoch int64 = 2440588 33 nanosPerDay int64 = 3600 * 24 * 1000 * 1000 * 1000 34 // Int96SizeBytes is the number of bytes that make up an Int96 35 Int96SizeBytes int = 12 36 ) 37 38 var ( 39 // Int96Traits provides information about the Int96 type 40 Int96Traits int96Traits 41 // ByteArrayTraits provides information about the ByteArray type, which is just an []byte 42 ByteArrayTraits byteArrayTraits 43 // FixedLenByteArrayTraits provides information about the FixedLenByteArray type which is just an []byte 44 FixedLenByteArrayTraits fixedLenByteArrayTraits 45 // ByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(ByteArray{}).Size() 46 ByteArraySizeBytes int = int(reflect.TypeOf(ByteArray{}).Size()) 47 // FixedLenByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(FixedLenByteArray{}).Size() 48 FixedLenByteArraySizeBytes int = int(reflect.TypeOf(FixedLenByteArray{}).Size()) 49 ) 50 51 // ReaderAtSeeker is a combination of the ReaderAt and ReadSeeker interfaces 52 // from the io package defining the only functionality that is required 53 // in order for a parquet file to be read by the file functions. We just need 54 // to be able to call ReadAt, Read, and Seek 55 type ReaderAtSeeker interface { 56 io.ReaderAt 57 io.Seeker 58 } 59 60 // NewInt96 creates a new Int96 from the given 3 uint32 values. 61 func NewInt96(v [3]uint32) (out Int96) { 62 binary.LittleEndian.PutUint32(out[0:], v[0]) 63 binary.LittleEndian.PutUint32(out[4:], v[1]) 64 binary.LittleEndian.PutUint32(out[8:], v[2]) 65 return 66 } 67 68 // Int96 is a 12 byte integer value utilized for representing timestamps as a 64 bit integer and a 32 bit 69 // integer. 70 type Int96 [12]byte 71 72 // SetNanoSeconds sets the Nanosecond field of the Int96 timestamp to the provided value 73 func (i96 *Int96) SetNanoSeconds(nanos int64) { 74 binary.LittleEndian.PutUint64(i96[:8], uint64(nanos)) 75 } 76 77 // String provides the string representation as a timestamp via converting to a time.Time 78 // and then calling String 79 func (i96 Int96) String() string { 80 return i96.ToTime().String() 81 } 82 83 // ToTime returns a go time.Time object that represents the same time instant as the given Int96 value 84 func (i96 Int96) ToTime() time.Time { 85 nanos := binary.LittleEndian.Uint64(i96[:8]) 86 jdays := binary.LittleEndian.Uint32(i96[8:]) 87 88 nanos = (uint64(jdays)-uint64(julianUnixEpoch))*uint64(nanosPerDay) + nanos 89 t := time.Unix(0, int64(nanos)) 90 return t.UTC() 91 } 92 93 type int96Traits struct{} 94 95 func (int96Traits) BytesRequired(n int) int { return Int96SizeBytes * n } 96 97 func (int96Traits) CastFromBytes(b []byte) []Int96 { 98 h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) 99 100 var res []Int96 101 s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) 102 s.Data = h.Data 103 s.Len = h.Len / Int96SizeBytes 104 s.Cap = h.Cap / Int96SizeBytes 105 106 return res 107 } 108 109 func (int96Traits) CastToBytes(b []Int96) []byte { 110 h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) 111 112 var res []byte 113 s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) 114 s.Data = h.Data 115 s.Len = h.Len * Int96SizeBytes 116 s.Cap = h.Cap * Int96SizeBytes 117 118 return res 119 } 120 121 // ByteArray is a type to be utilized for representing the Parquet ByteArray physical type, represented as a byte slice 122 type ByteArray []byte 123 124 // Len returns the current length of the ByteArray, equivalent to len(bytearray) 125 func (b ByteArray) Len() int { 126 return len(b) 127 } 128 129 // String returns a string representation of the ByteArray 130 func (b ByteArray) String() string { 131 return *(*string)(unsafe.Pointer(&b)) 132 } 133 134 func (b ByteArray) Bytes() []byte { 135 return b 136 } 137 138 type byteArrayTraits struct{} 139 140 func (byteArrayTraits) BytesRequired(n int) int { 141 return ByteArraySizeBytes * n 142 } 143 144 func (byteArrayTraits) CastFromBytes(b []byte) []ByteArray { 145 h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) 146 147 var res []ByteArray 148 s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) 149 s.Data = h.Data 150 s.Len = h.Len / ByteArraySizeBytes 151 s.Cap = h.Cap / ByteArraySizeBytes 152 153 return res 154 } 155 156 // FixedLenByteArray is a go type to represent a FixedLengthByteArray as a byte slice 157 type FixedLenByteArray []byte 158 159 // Len returns the current length of this FixedLengthByteArray, equivalent to len(fixedlenbytearray) 160 func (b FixedLenByteArray) Len() int { 161 return len(b) 162 } 163 164 // String returns a string representation of the FixedLenByteArray 165 func (b FixedLenByteArray) String() string { 166 return *(*string)(unsafe.Pointer(&b)) 167 } 168 169 func (b FixedLenByteArray) Bytes() []byte { 170 return b 171 } 172 173 type fixedLenByteArrayTraits struct{} 174 175 func (fixedLenByteArrayTraits) BytesRequired(n int) int { 176 return FixedLenByteArraySizeBytes * n 177 } 178 179 func (fixedLenByteArrayTraits) CastFromBytes(b []byte) []FixedLenByteArray { 180 h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) 181 182 var res []FixedLenByteArray 183 s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) 184 s.Data = h.Data 185 s.Len = h.Len / FixedLenByteArraySizeBytes 186 s.Cap = h.Cap / FixedLenByteArraySizeBytes 187 188 return res 189 } 190 191 // Creating our own enums allows avoiding the transitive dependency on the 192 // compiled thrift definitions in the public API, allowing us to not export 193 // the entire Thrift definitions, while making everything a simple cast between. 194 // 195 // It also let's us add special values like NONE to distinguish between values 196 // that are set or not set 197 type ( 198 // Type is the physical type as in parquet.thrift 199 Type format.Type 200 // Cipher is the parquet Cipher Algorithms 201 Cipher int 202 // ColumnOrder is the Column Order from the parquet.thrift 203 ColumnOrder *format.ColumnOrder 204 // Version is the parquet version type 205 Version int8 206 // DataPageVersion is the version of the Parquet Data Pages 207 DataPageVersion int8 208 // Encoding is the parquet Encoding type 209 Encoding format.Encoding 210 // Repetition is the underlying parquet field repetition type as in parquet.thrift 211 Repetition format.FieldRepetitionType 212 // ColumnPath is the path from the root of the schema to a given column 213 ColumnPath []string 214 ) 215 216 func (c ColumnPath) String() string { 217 if c == nil { 218 return "" 219 } 220 return strings.Join(c, ".") 221 } 222 223 // Extend creates a new ColumnPath from an existing one, with the new ColumnPath having s appended to the end. 224 func (c ColumnPath) Extend(s string) ColumnPath { 225 p := make([]string, len(c), len(c)+1) 226 copy(p, c) 227 return append(p, s) 228 } 229 230 // ColumnPathFromString constructs a ColumnPath from a dot separated string 231 func ColumnPathFromString(s string) ColumnPath { 232 return strings.Split(s, ".") 233 } 234 235 // constants for choosing the Aes Algorithm to use for encryption/decryption 236 const ( 237 AesGcm Cipher = iota 238 AesCtr 239 ) 240 241 // Constants for the parquet Version which governs which data types are allowed 242 // and how they are represented. For example, uint32 data will be written differently 243 // depending on this value (as INT64 for V1_0, as UINT32 for other versions). 244 // 245 // However, some features - such as compression algorithms, encryption, 246 // or the improved v2 data page format must be enabled separately in writer 247 // properties. 248 const ( 249 // Enable only pre-2.2 parquet format features when writing. 250 // 251 // This is useful for maximum compatibility with legacy readers. 252 // Note that logical types may still be emitted, as long as they have 253 // a corresponding converted type. 254 V1_0 Version = iota // v1.0 255 // Enable parquet format 2.4 and earlier features when writing. 256 // 257 // This enables uint32 as well as logical types which don't have a 258 // corresponding converted type. 259 // 260 // Note: Parquet format 2.4.0 was released in October 2017 261 V2_4 // v2.4 262 // Enable Parquet format 2.6 and earlier features when writing. 263 // 264 // This enables the nanos time unit in addition to the V2_4 features. 265 // 266 // Note: Parquet format 2.6.0 was released in September 2018 267 V2_6 // v2.6 268 // Enable the latest parquet format 2.x features. 269 // 270 // This is equal to the greatest 2.x version supported by this library. 271 V2_LATEST = V2_6 272 ) 273 274 // constants for the parquet DataPage Version to use 275 const ( 276 DataPageV1 DataPageVersion = iota 277 DataPageV2 278 ) 279 280 func (e Encoding) String() string { 281 return format.Encoding(e).String() 282 } 283 284 var ( 285 // Types contains constants for the Physical Types that are used in the Parquet Spec 286 // 287 // They can be specified when needed as such: `parquet.Types.Int32` etc. The values 288 // all correspond to the values in parquet.thrift 289 Types = struct { 290 Boolean Type 291 Int32 Type 292 Int64 Type 293 Int96 Type 294 Float Type 295 Double Type 296 ByteArray Type 297 FixedLenByteArray Type 298 // this only exists as a convienence so we can denote it when necessary 299 // nearly all functions that take a parquet.Type will error/panic if given 300 // Undefined 301 Undefined Type 302 }{ 303 Boolean: Type(format.Type_BOOLEAN), 304 Int32: Type(format.Type_INT32), 305 Int64: Type(format.Type_INT64), 306 Int96: Type(format.Type_INT96), 307 Float: Type(format.Type_FLOAT), 308 Double: Type(format.Type_DOUBLE), 309 ByteArray: Type(format.Type_BYTE_ARRAY), 310 FixedLenByteArray: Type(format.Type_FIXED_LEN_BYTE_ARRAY), 311 Undefined: Type(format.Type_FIXED_LEN_BYTE_ARRAY + 1), 312 } 313 314 // Encodings contains constants for the encoding types of the column data 315 // 316 // The values used all correspond to the values in parquet.thrift for the 317 // corresponding encoding type. 318 Encodings = struct { 319 Plain Encoding 320 PlainDict Encoding 321 RLE Encoding 322 RLEDict Encoding 323 BitPacked Encoding // deprecated, not implemented 324 DeltaByteArray Encoding 325 DeltaBinaryPacked Encoding 326 DeltaLengthByteArray Encoding 327 }{ 328 Plain: Encoding(format.Encoding_PLAIN), 329 PlainDict: Encoding(format.Encoding_PLAIN_DICTIONARY), 330 RLE: Encoding(format.Encoding_RLE), 331 RLEDict: Encoding(format.Encoding_RLE_DICTIONARY), 332 BitPacked: Encoding(format.Encoding_BIT_PACKED), 333 DeltaByteArray: Encoding(format.Encoding_DELTA_BYTE_ARRAY), 334 DeltaBinaryPacked: Encoding(format.Encoding_DELTA_BINARY_PACKED), 335 DeltaLengthByteArray: Encoding(format.Encoding_DELTA_LENGTH_BYTE_ARRAY), 336 } 337 338 // ColumnOrders contains constants for the Column Ordering fields 339 ColumnOrders = struct { 340 Undefined ColumnOrder 341 TypeDefinedOrder ColumnOrder 342 }{ 343 Undefined: format.NewColumnOrder(), 344 TypeDefinedOrder: &format.ColumnOrder{TYPE_ORDER: format.NewTypeDefinedOrder()}, 345 } 346 347 // DefaultColumnOrder is to use TypeDefinedOrder 348 DefaultColumnOrder = ColumnOrders.TypeDefinedOrder 349 350 // Repetitions contains the constants for Field Repetition Types 351 Repetitions = struct { 352 Required Repetition 353 Optional Repetition 354 Repeated Repetition 355 Undefined Repetition // convenience value 356 }{ 357 Required: Repetition(format.FieldRepetitionType_REQUIRED), 358 Optional: Repetition(format.FieldRepetitionType_OPTIONAL), 359 Repeated: Repetition(format.FieldRepetitionType_REPEATED), 360 Undefined: Repetition(format.FieldRepetitionType_REPEATED + 1), 361 } 362 ) 363 364 func (t Type) String() string { 365 switch t { 366 case Types.Undefined: 367 return "UNDEFINED" 368 default: 369 return format.Type(t).String() 370 } 371 } 372 373 func (r Repetition) String() string { 374 return strings.ToLower(format.FieldRepetitionType(r).String()) 375 } 376 377 // ByteSize returns the number of bytes required to store a single value of 378 // the given parquet.Type in memory. 379 func (t Type) ByteSize() int { 380 switch t { 381 case Types.Boolean: 382 return 1 383 case Types.Int32: 384 return arrow.Int32SizeBytes 385 case Types.Int64: 386 return arrow.Int64SizeBytes 387 case Types.Int96: 388 return Int96SizeBytes 389 case Types.Float: 390 return arrow.Float32SizeBytes 391 case Types.Double: 392 return arrow.Float64SizeBytes 393 case Types.ByteArray: 394 return ByteArraySizeBytes 395 case Types.FixedLenByteArray: 396 return FixedLenByteArraySizeBytes 397 } 398 panic("no bytesize info for type") 399 }